In [2]:
%pylab inline
import pandas as pd
import numpy as np
import sklearn
from matplotlib import pyplot as plt
import nltk
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

Populating the interactive namespace from numpy and matplotlib


In [3]:
dataset = pd.read_csv("Emails.csv")
texts = list(dataset["RawText"])

In [4]:
#tools
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [5]:
#cleaning of texts
_digits = re.compile('\d')
def contains_digits(d):
    return bool(_digits.search(d))

for i, text in enumerate(texts):
    text = re.sub(r'[^\w]', ' ', text)
    words = text.split(" ")
    prepared_words = [word.lower() for word in words if len(word) > 1 and not contains_digits(word)]
    texts[i] = prepared_words
    
connected_texts = [" ".join(text) for text in texts]
ps_texts = [" ".join([ps.stem(word) for word in text]) for text in texts]
lem_texts = [" ".join([lemmatizer.lemmatize(word) for word in text]) for text in texts]

Я очищаю от всех не буквенных символов (знаки препинания, цифры и тд), а также отбрасываю слова короче 2 букв. Также рядом создаю тексты с лемматизированным и стемизированным содержимым

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

#count most frequent bigrams
counter = CountVectorizer(strip_accents='unicode', ngram_range=(2,2), stop_words='english')
A = counter.fit_transform(connected_texts)

A = np.sum(A, axis=0) #sum frequencies in all docs

terms = {} #index to word
for z in counter.vocabulary_:
    terms[counter.vocabulary_[z]] = z
    
for k in xrange(20): #top-20 bigrams
    j = np.unravel_index(A.argmax(), A.shape)[1]
    print terms[j], A[0, j]
    A[0, j] = 0

department state 27076
case doc 26520
state case 26518
unclassified department 26509
doc date 26505
state gov 11055
date release 7479
original message 7233
gov sent 6501
date unclassified 5421
clintonemail com 4887
mills cheryl 4415
abedin huma 4083
subject fw 3494
sullivan jacob 3234
abedinh state 2783
com sent 2537
millscd state 2375
huma abedinh 2316
pm subject 2291


In [7]:
#max pmi collocations
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_documents(texts)
res = finder.score_ngrams(bigram_measures.pmi)
for bigram in res[:20]:
    print " ".join(bigram[0]), bigram[1]

_ali taajw 21.3898943976
_apecrets prg 21.3898943976
_lewda lysched 21.3898943976
_mrsiata nizenmenvarmars 21.3898943976
_reunite arriericll 21.3898943976
_rmuacartatmott rmaamaxatiamasm 21.3898943976
_s_ peciaiassistants 21.3898943976
_secreta ry_of_state_robs_sterling_ba 21.3898943976
a_ maveff 21.3898943976
a_pia ivene 21.3898943976
aaad htmly 21.3898943976
aat hacatiasierchassme 21.3898943976
abba eban 21.3898943976
abdelrahim oshi 21.3898943976
abiola farida 21.3898943976
abis syh 21.3898943976
abot theiweb 21.3898943976
aca demic 21.3898943976
achtnn nepdc 21.3898943976
aciout foreimi 21.3898943976


In [8]:
#extract features for clusterizzation
from sklearn.feature_extraction.text import TfidfVectorizer

counter = TfidfVectorizer(strip_accents='unicode', stop_words='english', max_df=0.9, min_df=0.005)
A = counter.fit_transform(connected_texts).toarray()
B = counter.fit_transform(ps_texts).toarray()
C = counter.fit_transform(lem_texts).toarray()

counter = TfidfVectorizer(strip_accents='unicode', ngram_range=(2,2), stop_words='english', max_df=0.9, min_df=0.005)
D = counter.fit_transform(connected_texts).toarray()
E = counter.fit_transform(ps_texts).toarray()
F = counter.fit_transform(lem_texts).toarray()

features = np.hstack((A, B, C, D, E, F))
print features.shape

(7945, 13416)


In [9]:
#reduce dimensions
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
A = lsa.fit_transform(features)
print features.shape, "->", A.shape

(7945, 13416) -> (7945, 100)


In [15]:
#cluster, num of clusters is 30 (пока что)
n = 30
km = KMeans(n_clusters=n, init='k-means++', max_iter=500, n_init=100)
result = km.fit_predict(A)

In [16]:
#visualize results, print words with max sum of tf-idf in each cluster

counter = CountVectorizer(strip_accents='unicode', ngram_range=(2,2), stop_words='english')

#most n*5 frequent bigrams in all texts
freq_words = set()
res = counter.fit_transform(connected_texts).toarray()
res = np.sum(res, axis=0)
terms = {} #index to word
for z in counter.vocabulary_:
    terms[counter.vocabulary_[z]] = z
for k in xrange(n*5):
    j = np.unravel_index(res.argmax(), res.shape)[0]
    freq_words.add(terms[j])
    res[j] = 0

counter = TfidfVectorizer(strip_accents='unicode', ngram_range=(2,2), stop_words='english')
#most sum of tf-idf bigrams
res = counter.fit_transform(connected_texts).toarray()
res = np.sum(res, axis=0)
terms = {} #index to word
for z in counter.vocabulary_:
    terms[counter.vocabulary_[z]] = z
for k in xrange(n*5):
    j = np.unravel_index(res.argmax(), res.shape)[0]
    freq_words.add(terms[j])
    res[j] = 0
print freq_words

#clusterization visualize
for i in xrange(n):
    print i, ":"
    cluster_texts = []
    for j, text in enumerate(connected_texts):
        if result[j] == i:
            cluster_texts.append(text)
    res = counter.fit_transform(cluster_texts).toarray()
    res = np.sum(res, axis = 0)
    terms = {}
    for z in counter.vocabulary_:
        terms[counter.vocabulary_[z]] = z
    #10 top sum tf-idf words
    k = 0
    while k < 10:
        j = np.unravel_index(res.argmax(), res.shape)[0]
        if not (terms[j] in freq_words):
            print terms[j]
            k += 1
        res[j] = 0
    print

set([u'sent sat', u'release mills', u'millscd state', u'department state', u'cc ses', u'abedin sent', u'june pm', u'abedin huma', u'tuesday september', u'release clintonemail', u'foia waiver', u'state department', u'subject agreement', u'ses o_shift', u'lona valmorou', u'mchale judith', u'state goy', u'state gov', u'jacob sullivanjj', u'huma abedin', u'sent tuesday', u'reines philippe', u'august pm', u'sent subject', u'pm abedinh', u'prime minister', u'jul subject', u'gov cc', u'unclassified department', u'pm subject', u'lauren jilotylc', u'pm secretary', u'message clintonemail', u'release sent', u'jacob sent', u'benghazi comm', u'state case', u'september pm', u'message mailto', u'sent thursday', u'aug subject', u'verma richard', u'agreement sensitive', u'cheryl sent', u'information redactions', u'select benghazi', u'huma sent', u'sent fri', u'subject speech', u'subject holbrooke', u'gov subject', u'gov huma', u'muscatine lissa', u'pm depart', u'northern ireland', u'valmoro lona', u'do

Кластеры уже похожи на интерпретируемые.

Пусть ассесоры просматривают письма из кластера и отвечают на вопрос: "письмам из кластера можно назначить общую тему?"
0 - нельзя
0.5 - нет сильной уверенности, что предложенная ассесорам тема их объеденяет
1 - ассесор уверен в теме, которую назначил кластеру

Результаты ассесора: 0.82