# Part 3 : Topic Modeling
http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html

In [26]:
%matplotlib inline
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from wordcloud import WordCloud
from PIL import Image

import nltk
import string

import re

import itertools
import gensim

import pyLDAvis.gensim as gensimvis
import pyLDAvis

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get the tokenized corpus

First of all, we load the corpus and apply it the preprocessing pipeline :

In [9]:
path = 'hillary-clinton-emails/'
emails = pd.read_csv(path + "Emails.csv")
print(emails.shape)
emails.head()

(7945, 22)


Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
2,3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
3,4,C05739550,CAIRO CONDEMNATION - FINAL,H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739550...,F-2015-04841,...,,"Mills, Cheryl D <MillsCD@state.gov>","Mitchell, Andrew B","Wednesday, September 12,2012 12:44 PM",F-2015-04841,C05739550,05/13/2015,RELEASE IN PART,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
4,5,C05739554,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"Abedin, Huma",H,80.0,2011-03-11T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739554...,F-2015-04841,...,,,,,F-2015-04841,C05739554,05/13/2015,RELEASE IN PART,"H <hrod17@clintonemail.com>\r\nFriday, March 1...",B6\r\nUNCLASSIFIED\r\nU.S. Department of State...


In [10]:
extracted_data = emails[["ExtractedSubject", "ExtractedBodyText"]]
extracted_data = extracted_data.dropna(how="all") # Remove mails without content (no subject, no body)
extracted_data = extracted_data.fillna('') # Replace NaN values by '' in order to concatenate body and subject
extracted_mails = extracted_data["ExtractedSubject"] + extracted_data["ExtractedBodyText"]

In [None]:
from preprocessing_pipeline import preprocessing
extracted_mails = extracted_mails.apply(lambda x: preprocessing(x)) # Pre-processing pipeline
extracted_mails.to_pickle("corpus_part3") # save it

In [70]:
# Or simply load it
extracted_mails = pd.read_pickle("corpus_part3")

## Dictionary

Then, we build the associated dictonary, which map into raw text tokens (strings) from their numerical ids (integers).

In [71]:
# Build dictonary
%time id2word_mail = gensim.corpora.Dictionary(extracted_mails)

Wall time: 2.13 s


## Vectorization

Given our corpus and our dictionary, we can create bag-of-words vectors :

In [72]:
def iter_emails(mails):
    for mail in mails:
        yield mail

class EmailsCorpus(object):
    
    def __init__(self, dump_file, dictionary, clip_docs=None):
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
    
    def __iter__(self):
        for tokens in itertools.islice(iter_emails(self.dump_file), self.clip_docs):
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

In [73]:
# create a stream of bag-of-words vectors
mails_corpus = EmailsCorpus(extracted_mails, id2word_mail)

Let's store all those bag-of-words vectors into a file :

In [74]:
gensim.corpora.MmCorpus.serialize('SerializedCorpus.mm', mails_corpus)

## Model

Let's proceed to the topic modeling for various number of topics (varying from 5 to 50) :

In [77]:
topics_number = [5, 10, 15, 20, 30, 40, 50]
lda_models = dict()

In [110]:
for n_topic in topics_number:
    lda_models[n_topic] = gensim.models.ldamodel.LdaModel(mails_corpus, num_topics=n_topic, id2word=id2word_mail)

Now, let's look at the topics for the different models :

In [112]:
def topic_display(model):
    topics = model.print_topics(num_topics=-1)
    for topic in topics:
        list_words = topic[1].split('"')
        extract_words = []
        for it, word in enumerate(list_words):
            if it % 2 == 1: # Remove probabilities
                extract_words.append(word)
        print(str(topic[0])+"."+" ".join(extract_words))

In [113]:
for n_topic, model in lda_models.items():
    print("************ MODELING WITH {} TOPICS ************\n".format(n_topic))
    topic_display(model)
    print()

************ MODELING WITH 50 TOPICS ************

0.reuters mubarak jack connect work back ny safe info travel
1.sarkozy tent jones deadline egyptian word nicolas michele '' length
2.yes get back happy thx see check ill holiday reach
3.'' `` tell official corporate think deny give thing sanction
4.statement au hrc aid nice pd blair syria '' spokesman
5.'' mr `` iraq clinton president american last even international
6.secretary office meeting room arrive house private white route depart
7.beck speak update complete weekend karl san work aipac glenn
8.hospital ask email talk minister blair fm russia news quote
9.email receive address system recipient try '' note fe wjc
10.'' `` mr speech get corporation american take gather secret
11.branch eikenberry missile gelb afghan strike pas insurgent convention army
12.confirm final agree lona get plane actually summary raj work
13.mcchrystal afghanistan tory soldier '' troop praise robinson uk russian
14.'' `` ie moscow status onto usa obama k

## Number of topics

To choose our number of topics, we looked at the coherence between words gathered in a same topic :
- if the words aren't linked, we should probably increase our number or topics
- if the words are linked, we may have a good number of topics
- if various topics deals with the same things, we should probably decrease our number of topics

Finally, it is hard to choose the right number of topics... <br/>
However, we choose to keep 30 topics : Even if some topics may be redondant, it will be interesting to see how their similarity is represented. 

## Visualization

In [114]:
SerializedCorpus = gensim.corpora.MmCorpus('SerializedCorpus.mm')
visualization = gensimvis.prepare(lda_models[30], SerializedCorpus, id2word_mail) # model, corpus, dic
pyLDAvis.display(visualization)

In [115]:
pyLDAvis.save_html(visualization,'LDA_Visualization_30_topics.html')