# Question 3

# Importing and Preprocessing

In [1]:
import pandas as pd

from gensim import corpora, models

import pprint as pp

import preprocess



In [2]:
# Location of the raw e-mail dump, relative to the notebook's working directory.
filename = './hillary-clinton-emails/Emails.csv'
# Read the whole CSV into a DataFrame.
df_emails = pd.read_csv(filepath_or_buffer=filename)
# List the available column names, then preview the first two rows.
print(df_emails.columns)
df_emails.head(n=2)

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')


Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...


In [3]:
# Run the project's text preprocessing over every raw e-mail and store the result
# in a new 'ProcessedText' column (the preview below shows one token list per e-mail).
# Series.apply hands each RawText value straight to processText; the previous
# DataFrame.apply(axis=1) built a full row object per e-mail only to read one column.
df_emails['ProcessedText'] = df_emails['RawText'].apply(preprocess.processText)
# Side-by-side preview of the processed tokens and the text they came from.
df_emails[['ProcessedText','RawText']].head()

Unnamed: 0,ProcessedText,RawText
0,"[wonderful, strong, moving, statement, bos, pl...",UNCLASSIFIED\nU.S. Department of State\nCase N...
1,"[thursday, march, latest, syria, aiding, qadda...",UNCLASSIFIED\nU.S. Department of State\nCase N...
2,"[thx, original, message, former, colleague, te...",UNCLASSIFIED\nU.S. Department of State\nCase N...
3,"[mill, cheryl, subject, cairo, condemnation, f...",UNCLASSIFIED\nU.S. Department of State\nCase N...
4,"[hrod17, clintonemail, com, friday, march, hum...",B6\nUNCLASSIFIED\nU.S. Department of State\nCa...


In [4]:
# One entry per e-mail, in row order, taken from the preprocessed column.
texts = df_emails['ProcessedText'].tolist()
# Vocabulary built from every processed e-mail.
dictionary = corpora.Dictionary(texts)
# Bag-of-words representation of each e-mail, expressed against that vocabulary.
corpus = list(map(dictionary.doc2bow, texts))

# Extracting topics with LdaMulticore

In [5]:
def extractTopics(nTopics=5, passes=20, workers=7, random_state=None):
    """Train an LDA model on the notebook-level corpus and print its topics.

    Uses the module-level ``corpus`` and ``dictionary`` built in an earlier
    cell, so that cell must have been run first.

    Parameters
    ----------
    nTopics : int, default 5
        Number of topics to fit; the same number of topics is printed.
    passes : int, default 20
        Number of training passes over the corpus, forwarded to gensim.
    workers : int or None, default 7
        Number of worker processes forwarded to ``LdaMulticore``. The default
        keeps the value this notebook was originally run with; lower it on
        machines with fewer cores.
    random_state : int, numpy.random.RandomState or None, default None
        Seed forwarded to gensim. ``None`` keeps the original unseeded
        behaviour; pass an int to make runs more repeatable.
        NOTE(review): multi-process training may still vary slightly from run
        to run even when seeded -- confirm before relying on exact weights.

    Returns
    -------
    The trained ``models.LdaMulticore`` instance.
    """
    ldamodel = models.LdaMulticore(
        corpus,
        num_topics=nTopics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

    # Print every fitted topic with its ten highest-weighted words.
    for topic in ldamodel.print_topics(num_topics=nTopics, num_words=10):
        print(topic)

    return ldamodel

In [6]:
# Fit a 5-topic model; extractTopics prints the topics and returns the model.
model5 = extractTopics(nTopics=5)

(0, '0.025*"message" + 0.023*"original" + 0.014*"will" + 0.011*"can" + 0.010*"call" + 0.007*"state" + 0.007*"know" + 0.005*"just" + 0.005*"secretary" + 0.005*"get"')
(1, '0.008*"israel" + 0.008*"state" + 0.006*"israeli" + 0.005*"palestinian" + 0.005*"http" + 0.005*"american" + 0.004*"will" + 0.004*"com" + 0.004*"said" + 0.003*"peace"')
(2, '0.010*"will" + 0.005*"state" + 0.005*"government" + 0.005*"country" + 0.004*"new" + 0.004*"people" + 0.004*"said" + 0.004*"can" + 0.004*"year" + 0.003*"also"')
(3, '0.009*"obama" + 0.008*"pm" + 0.008*"said" + 0.007*"president" + 0.006*"secretary" + 0.006*"house" + 0.005*"office" + 0.005*"state" + 0.005*"republican" + 0.004*"one"')
(4, '0.008*"will" + 0.007*"government" + 0.006*"libya" + 0.006*"source" + 0.006*"al" + 0.005*"minister" + 0.005*"libyan" + 0.005*"said" + 0.005*"security" + 0.004*"president"')


<gensim.models.ldamulticore.LdaMulticore at 0x7f4d6f8b2588>

In [7]:
# Same corpus with 10 topics, kept under its own name for comparison.
model10 = extractTopics(nTopics=10)

(0, '0.008*"will" + 0.006*"country" + 0.006*"china" + 0.004*"state" + 0.004*"disease" + 0.004*"said" + 0.004*"http" + 0.004*"year" + 0.004*"berlusconi" + 0.003*"min"')
(1, '0.008*"obama" + 0.006*"republican" + 0.006*"said" + 0.005*"will" + 0.005*"party" + 0.005*"president" + 0.004*"one" + 0.004*"house" + 0.004*"year" + 0.004*"time"')
(2, '0.010*"will" + 0.009*"haiti" + 0.008*"state" + 0.006*"development" + 0.005*"country" + 0.005*"need" + 0.004*"government" + 0.004*"united" + 0.004*"year" + 0.004*"international"')
(3, '0.010*"government" + 0.008*"president" + 0.008*"will" + 0.006*"state" + 0.005*"libyan" + 0.005*"libya" + 0.005*"new" + 0.005*"minister" + 0.005*"security" + 0.004*"said"')
(4, '0.007*"government" + 0.007*"will" + 0.006*"el" + 0.006*"deal" + 0.006*"minister" + 0.006*"said" + 0.006*"source" + 0.005*"keib" + 0.005*"party" + 0.005*"power"')
(5, '0.008*"state" + 0.006*"http" + 0.006*"com" + 0.005*"american" + 0.005*"said" + 0.005*"government" + 0.005*"security" + 0.004*"www" 

<gensim.models.ldamulticore.LdaMulticore at 0x7f4d6f8bec18>

In [None]:
# Same corpus with 20 topics, kept under its own name for comparison.
model20 = extractTopics(nTopics=20)

(0, '0.011*"magariaf" + 0.010*"source" + 0.008*"libya" + 0.007*"state" + 0.007*"will" + 0.007*"government" + 0.007*"libyan" + 0.006*"al" + 0.005*"security" + 0.005*"also"')
(1, '0.008*"president" + 0.008*"position" + 0.005*"one" + 0.005*"secretary" + 0.005*"filled" + 0.005*"time" + 0.005*"election" + 0.004*"hispanic" + 0.004*"will" + 0.004*"north"')
(2, '0.009*"message" + 0.009*"call" + 0.007*"will" + 0.007*"original" + 0.007*"embassy" + 0.006*"state" + 0.006*"email" + 0.005*"classified" + 0.005*"zelaya" + 0.005*"do"')
(3, '0.012*"http" + 0.012*"com" + 0.009*"www" + 0.007*"secretary" + 0.007*"amazon" + 0.007*"will" + 0.005*"image" + 0.005*"ref" + 0.005*"said" + 0.004*"film"')
(4, '0.011*"state" + 0.007*"pakistan" + 0.006*"percent" + 0.005*"woman" + 0.005*"will" + 0.005*"said" + 0.004*"year" + 0.004*"republican" + 0.004*"policy" + 0.004*"foreign"')
(5, '0.011*"will" + 0.008*"de" + 0.006*"military" + 0.005*"said" + 0.004*"war" + 0.004*"honduras" + 0.004*"blair" + 0.004*"government" + 0.0

<gensim.models.ldamulticore.LdaMulticore at 0x7f4d6f8aaac8>

In [None]:
# Same corpus with 50 topics, kept under its own name for comparison.
model50 = extractTopics(nTopics=50)