In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import gensim
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
%matplotlib inline

In [2]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models, matutils


Importing all Presidential speeches and appending them to a dataframe

In [3]:
from os import path
from pathlib import Path
import os
import glob
from collections import defaultdict
root=r'C:\Users\hartg\Desktop\Professional_Projects\Professional-Projects\MSF_Case_Study\Corpus of Presential Speeches\combined'
files=os.listdir(root)

#load speeches into a list

docs=list()
for file in files:
    with open(path.join(root,file),'r',encoding='utf-8') as fd:
        txt=fd.read()
        docs.append(txt)

Function that "cleans" the speeches. Re (regular expression) replaces special characters with, in this case, nothing. This will remove the <title=> characters, etc

In [4]:
import re
def clean(doc):
    doc = re.sub(r'[^\w\s]*','',doc)
    doc = re.sub(r'[\s]+', ' ', doc)
    doc = doc.lower().strip()
    return doc

clean the documents associated with the presidential speeches

In [5]:
clean_docs=list()
for doc in docs:
    doc = clean(doc)
    clean_docs.append(doc)

In [6]:
speeches_df=pd.DataFrame(docs)
speeches_df.columns=['text']

In [7]:
speeches_df.head()

Unnamed: 0,text
0,"<title=""Special Session Message to Congress"">\..."
1,"<title=""Inaugural Address"">\n<date=""March 4, 1..."
2,"<title=""Second Annual Message"">\n<date=""Decemb..."
3,"<title=""Proclamation of Day of Fasting, Humili..."
4,"<title=""Third Annual Message"">\n<date=""Decembe..."


In [8]:
new_df=speeches_df['text'].str.split("=",n=1,expand=True)
new_df.columns=['title_bad','title_actual']
new_df.head()

Unnamed: 0,title_bad,title_actual
0,<title,"""Special Session Message to Congress"">\n<date=..."
1,<title,"""Inaugural Address"">\n<date=""March 4, 1797"">\n..."
2,<title,"""Second Annual Message"">\n<date=""December 8, 1..."
3,<title,"""Proclamation of Day of Fasting, Humiliation a..."
4,<title,"""Third Annual Message"">\n<date=""December 3, 17..."


In [9]:
new_df2=new_df['title_actual'].str.split("=",n=1,expand=True)
new_df2.columns=['title','date_and_text']
new_df2.head()

Unnamed: 0,title,date_and_text
0,"""Special Session Message to Congress"">\n<date","""May 16, 1797"">\nThe personal inconveniences t..."
1,"""Inaugural Address"">\n<date","""March 4, 1797"">\nWhen it was first perceived,..."
2,"""Second Annual Message"">\n<date","""December 8, 1798"">\nGentlemen of the Senate a..."
3,"""Proclamation of Day of Fasting, Humiliation a...","""March 23, 1798"">\nAs the safety and prosperit..."
4,"""Third Annual Message"">\n<date","""December 3, 1799"">\nIt is with peculiar satis..."


In [10]:
new_df3=new_df2['date_and_text'].str.split(">",n=1,expand=True)
new_df3.columns=['date','text']

In [11]:
new_df3.head()

Unnamed: 0,date,text
0,"""May 16, 1797""",\nThe personal inconveniences to the members o...
1,"""March 4, 1797""","\nWhen it was first perceived, in early times,..."
2,"""December 8, 1798""",\nGentlemen of the Senate and Gentlemen of the...
3,"""March 23, 1798""",\nAs the safety and prosperity of nations ulti...
4,"""December 3, 1799""",\nIt is with peculiar satisfaction that I meet...


In [12]:
speeches=pd.merge(new_df2,new_df3,left_index=True,right_index=True)

In [13]:
speeches.head()

Unnamed: 0,title,date_and_text,date,text
0,"""Special Session Message to Congress"">\n<date","""May 16, 1797"">\nThe personal inconveniences t...","""May 16, 1797""",\nThe personal inconveniences to the members o...
1,"""Inaugural Address"">\n<date","""March 4, 1797"">\nWhen it was first perceived,...","""March 4, 1797""","\nWhen it was first perceived, in early times,..."
2,"""Second Annual Message"">\n<date","""December 8, 1798"">\nGentlemen of the Senate a...","""December 8, 1798""",\nGentlemen of the Senate and Gentlemen of the...
3,"""Proclamation of Day of Fasting, Humiliation a...","""March 23, 1798"">\nAs the safety and prosperit...","""March 23, 1798""",\nAs the safety and prosperity of nations ulti...
4,"""Third Annual Message"">\n<date","""December 3, 1799"">\nIt is with peculiar satis...","""December 3, 1799""",\nIt is with peculiar satisfaction that I meet...


In [14]:
speeches.drop(['date_and_text'],axis=1,inplace=True)

In [15]:
speeches['title']=speeches.title.str.replace('>\n<date','',regex=True)
speeches.head()

Unnamed: 0,title,date,text
0,"""Special Session Message to Congress""","""May 16, 1797""",\nThe personal inconveniences to the members o...
1,"""Inaugural Address""","""March 4, 1797""","\nWhen it was first perceived, in early times,..."
2,"""Second Annual Message""","""December 8, 1798""",\nGentlemen of the Senate and Gentlemen of the...
3,"""Proclamation of Day of Fasting, Humiliation a...","""March 23, 1798""",\nAs the safety and prosperity of nations ulti...
4,"""Third Annual Message""","""December 3, 1799""",\nIt is with peculiar satisfaction that I meet...


In [16]:
speeches['date']=speeches.date.str.replace('"','',regex=True)
speeches.head()

Unnamed: 0,title,date,text
0,"""Special Session Message to Congress""","May 16, 1797",\nThe personal inconveniences to the members o...
1,"""Inaugural Address""","March 4, 1797","\nWhen it was first perceived, in early times,..."
2,"""Second Annual Message""","December 8, 1798",\nGentlemen of the Senate and Gentlemen of the...
3,"""Proclamation of Day of Fasting, Humiliation a...","March 23, 1798",\nAs the safety and prosperity of nations ulti...
4,"""Third Annual Message""","December 3, 1799",\nIt is with peculiar satisfaction that I meet...


In [17]:
speeches['text']=speeches.text.str.replace("\n",'',regex=True)
speeches.head()

Unnamed: 0,title,date,text
0,"""Special Session Message to Congress""","May 16, 1797",The personal inconveniences to the members of ...
1,"""Inaugural Address""","March 4, 1797","When it was first perceived, in early times, t..."
2,"""Second Annual Message""","December 8, 1798",Gentlemen of the Senate and Gentlemen of the H...
3,"""Proclamation of Day of Fasting, Humiliation a...","March 23, 1798",As the safety and prosperity of nations ultima...
4,"""Third Annual Message""","December 3, 1799",It is with peculiar satisfaction that I meet t...


Dropping indeces 208,801, and 814 as they were not appropriately transformed upon import

In [18]:
speeches.drop([208,801,814],axis=0,inplace=True)

In [19]:
speeches['date']=pd.to_datetime(speeches['date'])

In [20]:
speeches['text']=speeches['text'].str.lower()

Loacing stopwords from NLTK

In [21]:
from nltk.corpus import stopwords

In [22]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags,strip_punctuation,strip_numeric,stem_text
from gensim.parsing.preprocessing import strip_multiple_whitespaces,strip_non_alphanum,remove_stopwords,strip_short

In [23]:
Filtering=[lambda x: x.lower(),strip_multiple_whitespaces,
          strip_numeric,
          remove_stopwords,
          strip_short,
          stem_text]

In [24]:

def preprocess(text):
    result=[]
    for token in preprocess_string(text,Filtering):
        result.append(token)
    return result
speeches['processed_speech']=speeches['text'].map(preprocess)

In [25]:
from gensim import corpora,models,similarities
dictionary=corpora.Dictionary(speeches.processed_speech)

In [26]:
corpus=[dictionary.doc2bow(text) for text in speeches.processed_speech]

Building a TF-IDF Model (Term Frequency Inverse Document Frequency)

In [28]:
tfidf=models.TfidfModel(corpus)
transformed_tfidf = tfidf[corpus]

LDA (Latent Dirichlet Allocation) on TF-IDF

In [30]:
%time lda = models.LdaMulticore(transformed_tfidf,num_topics=10,id2word=dictionary)

Wall time: 13.7 s


In [31]:
lda.show_topics()

[(0,
  '0.000*"shall" + 0.000*"law" + 0.000*"congress" + 0.000*"constitut" + 0.000*"author" + 0.000*"public" + 0.000*"offic" + 0.000*"militari" + 0.000*"war" + 0.000*"duti"'),
 (1,
  '0.000*"constitut" + 0.000*"shall" + 0.000*"treati" + 0.000*"indian" + 0.000*"it\'" + 0.000*"public" + 0.000*"health" + 0.000*"duti" + 0.000*"feder" + 0.000*"american"'),
 (2,
  '0.000*"<applause.>" + 0.000*"help" + 0.000*"it\'" + 0.000*"andwherea" + 0.000*"american" + 0.000*"law" + 0.000*"vessel" + 0.000*"shall" + 0.000*"treati" + 0.000*"congress"'),
 (3,
  '0.000*"treati" + 0.000*"shall" + 0.000*"constitut" + 0.000*"indian" + 0.000*"soviet" + 0.000*"public" + 0.000*"tax" + 0.000*"congress" + 0.000*"duti" + 0.000*"subject"'),
 (4,
  '0.000*"chines" + 0.000*"treati" + 0.000*"constitut" + 0.000*"tax" + 0.000*"soviet" + 0.000*"mr." + 0.000*"increas" + 0.000*"law" + 0.000*"feder" + 0.000*"congress"'),
 (5,
  '0.000*"territori" + 0.000*"treati" + 0.000*"shall" + 0.000*"parti" + 0.000*"congress" + 0.000*"busi" 

In [37]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis=pyLDAvis.gensim.prepare(lda,corpus,dictionary)
vis

In [41]:
count = 1
for doc in speeches['processed_speech']:
    vec=dictionary.doc2bow(doc)
    print("Speech # ",count, lda[vec])
    count=count+1

Speech #  1 [(3, 0.8838069), (7, 0.11553908)]
Speech #  2 [(3, 0.98998266)]
Speech #  3 [(3, 0.6831239), (7, 0.31594944)]
Speech #  4 [(3, 0.28671747), (4, 0.61291194), (7, 0.097839475)]
Speech #  5 [(3, 0.9986008)]
Speech #  6 [(3, 0.13738973), (7, 0.86167806)]
Speech #  7 [(3, 0.3401968), (5, 0.1517577), (7, 0.50653446)]
Speech #  8 [(7, 0.60146964), (9, 0.39213252)]
Speech #  9 [(3, 0.6009026), (7, 0.36237934), (9, 0.0354416)]
Speech #  10 [(3, 0.64022154), (4, 0.25543782), (7, 0.103747636)]
Speech #  11 [(0, 0.5563447), (3, 0.16721055), (7, 0.2729308)]
Speech #  12 [(3, 0.64785135), (7, 0.35195917)]
Speech #  13 [(3, 0.9980529)]
Speech #  14 [(3, 0.67021245), (6, 0.32737908)]
Speech #  15 [(0, 0.15727095), (3, 0.5560917), (7, 0.28560114)]
Speech #  16 [(3, 0.6456908), (7, 0.35410687)]
Speech #  17 [(3, 0.6269874), (4, 0.017786946), (7, 0.3550295)]
Speech #  18 [(3, 0.6734939), (7, 0.32460102)]
Speech #  19 [(3, 0.08429246), (7, 0.91013443)]
Speech #  20 [(3, 0.8835093), (7, 0.11545

Speech #  157 [(3, 0.1675559), (7, 0.8321787)]
Speech #  158 [(0, 0.11910989), (3, 0.059889425), (7, 0.81999564)]
Speech #  159 [(3, 0.07391066), (7, 0.76112133), (9, 0.16411966)]
Speech #  160 [(0, 0.17129016), (3, 0.39841947), (7, 0.42680016)]
Speech #  161 [(0, 0.18439125), (3, 0.08597957), (7, 0.7286759)]
Speech #  162 [(0, 0.068875425), (3, 0.0720162), (4, 0.010582029), (7, 0.8476134)]
Speech #  163 [(3, 0.1500754), (4, 0.010040801), (7, 0.8396564)]
Speech #  164 [(3, 0.07779041), (7, 0.921997)]
Speech #  165 [(3, 0.12878531), (5, 0.21256499), (7, 0.65716857)]
Speech #  166 [(3, 0.6962911), (7, 0.3034102)]
Speech #  167 [(1, 0.027246984), (3, 0.4368676), (7, 0.535091)]
Speech #  168 [(3, 0.06326365), (7, 0.9364019)]
Speech #  169 [(3, 0.4730551), (4, 0.06612619), (7, 0.46035945)]
Speech #  170 [(3, 0.605301), (4, 0.03358042), (7, 0.36087236)]
Speech #  171 [(3, 0.12748463), (7, 0.8720372)]
Speech #  172 [(3, 0.37984192), (4, 0.011502708), (7, 0.60826904)]
Speech #  173 [(3, 0.5909

Speech #  322 [(3, 0.197413), (7, 0.31096435), (9, 0.49013454)]
Speech #  323 [(3, 0.12554403), (7, 0.85455614), (9, 0.017819544)]
Speech #  324 [(7, 0.9914065)]
Speech #  325 [(0, 0.107869625), (3, 0.43030918), (7, 0.4612919)]
Speech #  326 [(3, 0.5547497), (6, 0.095751196), (7, 0.34711036)]
Speech #  327 [(3, 0.7875187), (7, 0.21214722)]
Speech #  328 [(3, 0.17313369), (7, 0.4762457), (8, 0.3471195)]
Speech #  329 [(7, 0.99619323)]
Speech #  330 [(1, 0.39389947), (3, 0.15193191), (7, 0.4511077)]
Speech #  331 [(6, 0.84903175), (7, 0.14758226)]
Speech #  332 [(3, 0.46892068), (5, 0.050313853), (6, 0.0170832), (7, 0.46332845)]
Speech #  333 [(3, 0.028635256), (5, 0.7046654), (7, 0.26377857)]
Speech #  334 [(2, 0.56932396), (3, 0.32577562), (7, 0.10183114)]
Speech #  335 [(3, 0.036402553), (6, 0.8413576), (7, 0.11935179)]
Speech #  336 [(3, 0.19913077), (5, 0.7595239), (7, 0.03846193)]
Speech #  337 [(2, 0.4460565), (3, 0.14986646), (7, 0.40101638)]
Speech #  338 [(2, 0.46067172), (3, 0

Speech #  482 [(2, 0.019813687), (3, 0.15805487), (6, 0.06802076), (7, 0.7539654)]
Speech #  483 [(3, 0.11484413), (6, 0.16631164), (7, 0.70624554), (8, 0.012445755)]
Speech #  484 [(3, 0.8018448), (7, 0.19574523)]
Speech #  485 [(3, 0.24857576), (5, 0.25534987), (7, 0.4948358)]
Speech #  486 [(3, 0.7083885), (7, 0.29125157)]
Speech #  487 [(2, 0.19272637), (3, 0.0766107), (7, 0.7287073)]
Speech #  488 [(1, 0.20490353), (3, 0.10580501), (7, 0.68555266)]
Speech #  489 [(3, 0.17944792), (7, 0.81978005)]
Speech #  490 [(3, 0.13203706), (5, 0.16177115), (7, 0.7049466)]
Speech #  491 [(3, 0.6757479), (7, 0.2901053), (9, 0.033940036)]
Speech #  492 [(3, 0.08257925), (7, 0.9165633)]
Speech #  493 [(3, 0.99653506)]
Speech #  494 [(3, 0.06635094), (7, 0.9329673)]
Speech #  495 [(3, 0.2985537), (7, 0.70110774)]
Speech #  496 [(2, 0.034092844), (3, 0.3013868), (7, 0.6642231)]
Speech #  497 [(4, 0.022106098), (7, 0.96703315)]
Speech #  498 [(3, 0.2430069), (4, 0.010365322), (7, 0.74617743)]
Speech

Speech #  649 [(3, 0.58849025), (7, 0.41089922)]
Speech #  650 [(3, 0.33708718), (7, 0.6624507)]
Speech #  651 [(3, 0.74742573), (7, 0.25213426)]
Speech #  652 [(3, 0.7680938), (7, 0.2315398)]
Speech #  653 [(3, 0.7240813), (7, 0.27559116)]
Speech #  654 [(3, 0.53560835), (7, 0.46413505)]
Speech #  655 [(3, 0.29073924), (7, 0.7087047)]
Speech #  656 [(3, 0.034625378), (7, 0.9648601)]
Speech #  657 [(3, 0.19526972), (7, 0.78928185), (8, 0.014484395)]
Speech #  658 [(3, 0.20537046), (6, 0.18479216), (7, 0.60943985)]
Speech #  659 [(3, 0.26958814), (7, 0.7299669)]
Speech #  660 [(3, 0.8946889), (7, 0.10458978)]
Speech #  661 [(3, 0.0746955), (7, 0.92486894)]
Speech #  662 [(3, 0.35813493), (6, 0.03982666), (7, 0.6013156)]
Speech #  663 [(3, 0.992984)]
Speech #  664 [(3, 0.23213579), (7, 0.7673594)]
Speech #  665 [(2, 0.05889808), (3, 0.34227017), (6, 0.08925593), (7, 0.50896066)]
Speech #  666 [(3, 0.19292076), (6, 0.40872845), (7, 0.39760575)]
Speech #  667 [(3, 0.9965665)]
Speech #  668

Speech #  815 [(3, 0.62939847), (7, 0.3698731)]
Speech #  816 [(3, 0.20281592), (6, 0.036423653), (7, 0.75461227)]
Speech #  817 [(3, 0.24860024), (7, 0.7438293)]
Speech #  818 [(3, 0.5419528), (7, 0.45759997)]
Speech #  819 [(3, 0.6058076), (7, 0.39380342)]
Speech #  820 [(7, 0.42873815), (8, 0.56417507)]
Speech #  821 [(3, 0.30649143), (7, 0.69320947)]
Speech #  822 [(3, 0.11252717), (7, 0.7546574), (9, 0.13224967)]
Speech #  823 [(3, 0.7430278), (7, 0.25675654)]
Speech #  824 [(3, 0.2547414), (7, 0.1868868), (9, 0.55306387)]
Speech #  825 [(3, 0.26578203), (4, 0.5582025), (7, 0.17462334)]
Speech #  826 [(3, 0.5671142), (7, 0.42313015)]
Speech #  827 [(2, 0.059815824), (3, 0.6957526), (7, 0.2437206)]
Speech #  828 [(3, 0.9911951)]
Speech #  829 [(3, 0.82663), (6, 0.071365476), (7, 0.10156222)]
Speech #  830 [(0, 0.3003256), (3, 0.43332908), (7, 0.26607713)]
Speech #  831 [(3, 0.48841968), (4, 0.43839842), (7, 0.072389)]
Speech #  832 [(7, 0.9976183)]
Speech #  833 [(3, 0.71497244), (

In [47]:
t=0
for i in lda.show_topics(num_topics=3, num_words=3, log=False, formatted=True):
    print ("Topic # ", t , i)
    t = t + 1

Topic #  0 (0, '0.000*"shall" + 0.000*"law" + 0.000*"congress"')
Topic #  1 (9, '0.000*"<applause.>" + 0.000*"tax" + 0.000*"it\'"')
Topic #  2 (2, '0.000*"<applause.>" + 0.000*"help" + 0.000*"it\'"')
