In [16]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import nltk
import pycountry
from sklearn.feature_extraction.text import CountVectorizer
import random
import requests
import json

%matplotlib inline
%load_ext autoreload
%autoreload 2

# The emails DataFrame has many columns, so raise pandas' display limit
# to keep them all visible when a frame is rendered.
pd.options.display.max_columns = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 3. Topic Modeling



## Load emails


In [2]:
# Read the emails CSV (path is relative to the notebook's working directory)
emails = pd.read_csv('./hillary-clinton-emails/Emails.csv')
# Preview the first two rows to check the available columns
emails.head(2)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,MetadataDocumentClass,ExtractedSubject,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,HRC_Email_296,FW: Wow,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,HRC_Email_296,,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...


## Extract texts and process them

In [123]:
import warnings
warnings.filterwarnings('ignore')

# concat all the emails into one string
from cleaner import *
%autoreload 2

# initiate Cleaner class
cleaner = Cleaner()
clean_text_series = cleaner.cleaning_pipeline_series(emails.ExtractedBodyText.dropna())
clean_text_series.head(5)

1    [thursday, march, 2011, latest, syria, aid, qa...
2                                                [thx]
4    [friday, march, 2011, huma, abedin, latest, sy...
5    [pi, print, wednesday, septemb, 2012, russorv,...
7    [friday, march, 2011, huma, abedin, latest, sy...
Name: ExtractedBodyText, dtype: object

## Topic modeling (and choosing the number of topics)

In [124]:
from classifier import Classifier

# Instantiate the gensim-based classifier wrapper.
# Its implementation is available in classifier.py.
classifier = Classifier()
# Build the dictionary from the cleaned token lists (the call logs that the
# dictionary and the corpus were defined).
classifier.define_dictionary(clean_text_series)

[INFO] Dictionary defined
[INFO] Corpus defined


In [127]:
# Fit one LDA model per candidate topic count and collect what each
# fitted model returns, in the same order as `n_topics`.

# NOTE(review): `iter` shadows the Python builtin; the name is kept because
# a later cell reads it when re-training the final model.
iter = 10
n_topics = [5, 10, 15, 20, 30, 40, 50]

list_topics = []
for topic_count in n_topics:
    classifier.define_model(clean_text_series, topic_count, iter)
    list_topics.append(classifier.return_model())

[INFO] LDA model defined
[INFO] define_model took 71.624 s
[INFO] LDA model defined
[INFO] define_model took 82.687 s
[INFO] LDA model defined
[INFO] define_model took 87.315 s
[INFO] LDA model defined
[INFO] define_model took 89.621 s
[INFO] LDA model defined
[INFO] define_model took 99.242 s
[INFO] LDA model defined
[INFO] define_model took 107.404 s
[INFO] LDA model defined
[INFO] define_model took 118.791 s


In [128]:
# print these models (i.e. the most frequent words of each topic in each model)

def get_word(string):
    """Return the double-quoted fragments of `string`, joined by single spaces.

    The input is split on the `"` character; every second piece (the ones
    at odd positions) is kept, i.e. the text found between quote characters.
    A string without any quote yields an empty string.
    """
    # After splitting on '"', pieces at odd indices are the quoted parts.
    quoted_parts = string.split('"')[1::2]
    return " ".join(quoted_parts)

# For every fitted model, print a header with its number of topics, then one
# line per topic: the topic's first field followed by the quoted words
# extracted from its second field.
for model_topics in list_topics:
    print("===============")
    print("Modeling with ", len(model_topics), "topics")
    for entry in model_topics:
        print(f"{entry[0]}.", get_word(entry[1]))

Modeling with  5 topics
0. state doc 2015 benghazi case inform date depart subject dept
1. call fyi 2010 work talk pl tomorrow state gov today
2. state obama american govern presid peopl work year time nation
3. parti vote democrat republican senat elect voter conserv labour poll
4. secretari offic depart state meet room arriv rout privat confer
Modeling with  10 topics
0. richard email book cdm read ill wjc offic reach travel
1. iran print iranian pl taab border germani report revolutionari miliband
2. call 2010 talk tomorrow 2009 cheryl today work thx back
3. obama presid american republican polici polit democrat clinton hous state
4. settlement govern isra year israel peac parti polit day palestinian
5. fyi state secur unit govern militari effort forc afghanistan nation
6. secretari offic depart meet room state arriv rout privat confer
7. haiti haitian peopl work plan ingo latrin govern port committe
8. women work parti vote elect week year time health support
9. state gov doc 2015 

## Choosing the number of topics

There is no "formal" way to choose the right number of topics in a corpus. The best we can do, for a given model, is to look at the most important words of each topic.

* If we see some unrelated terms merged into one topic, for example `iran` and `haiti`, then it's probable that two topics were merged. Then we should increase the number of topics in our model.
* If, on the other hand, some related terms are split across two topics, for example `iran` and `Tehran`, then it's probable that one topic was split. Then we should, obviously, reduce the number of topics.

These two cases are idealized examples, almost *too good to be true*: in practice the signs are rarely this clear, and it's often hard to decide the exact number of topics.

Based on the results above, **around 15 topics seems to be a good approximation.**

## Visualization

In [129]:
# Train the model again with the chosen number of topics (15), reusing the
# `iter` value defined in the model-comparison cell above.
classifier.define_model(clean_text_series, 15, iter)

[INFO] LDA model defined
[INFO] define_model took 86.342 s


In [131]:
# Visualize the trained LDA model with pyLDAvis.

import pyLDAvis
try:
    # Newer pyLDAvis releases renamed the gensim helper module.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases still expose it under the original name.
    import pyLDAvis.gensim as gensimvis

# Prepare the visualization data from the classifier's model, corpus and
# dictionary, then render it inline in the notebook.
vis_data = gensimvis.prepare(classifier.lda_model, classifier.corpus, classifier.dictionary)
pyLDAvis.display(vis_data)