# Topic Model for POTUS Speech Corpus

In [14]:
import pandas as pd
import numpy as np
import sqlite3
import re
import random
import textman as tx
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [8]:
mallet_path = '/Users/williamLi/Documents/College/Semester\ 6/ds/mallet-2.0.6/bin/mallet'
COLUMNS=['doc_id','date','pres','title','speech']
docs = pd.DataFrame(columns=COLUMNS)

### Import Speeches

In [3]:
import os
import re
_id = 1
for filename in os.listdir('./speeches'):
    if filename == '.DS_Store':
        continue
    for speech in os.listdir('./speeches/' + filename):
        temp = open('./speeches/' + filename + '/' + speech, 'r', encoding='utf-8').readlines()
        obj = {}
        obj['doc_id'] = _id
        date = re.findall('"([^"]*)"', temp[1])
        obj['date'] = date[0] if len(date) > 0 else None
        obj['pres'] = filename
        obj['title'] = re.findall('"([^"]*)"', temp[0])[0]
        obj['speech']= "".join(temp[2:])
    
        obj = pd.DataFrame(obj, index=[0])
        docs = docs.append(obj, ignore_index=True)
        _id += 1
docs = docs.set_index("doc_id")
docs.head()

Unnamed: 0_level_0,date,pres,speech,title
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"August 10, 1927",coolidge,We have come here to dedicate a cornerstone th...,Address at the Opening of Work on Mount Rushmo...
2,"December 8, 1925",coolidge,Members of the Congress: In meeting the consti...,Third Annual Message
3,"December 6, 1923",coolidge,Since the close of the last Congress the Natio...,First Annual Message
4,"October 20, 1925",coolidge,"Mr. Moderator, Members Of The Council:\nIt is ...",Message Regarding Relationship of Church and S...
5,"March 4, 1925",coolidge,\nMy Countrymen:\n\nNo one can contemplate cur...,Inaugural Address


### Convert corpus to tokens and vocab
We use a function from TextMan, a bespoke library that incorporates the text processing routines used in earlier notebooks.

In [4]:
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='speech')
tokens['token_num'] = tokens.groupby(['doc_id']).cumcount()
tokens = tokens.reset_index()[['doc_id','token_num','term_id']]
tokens = tokens[tokens.term_id.isin(vocab[vocab.go].index)]
tokens = tokens.set_index(['doc_id','token_num'])

### Add term strings

In [5]:
tokens['term_str'] = tokens.term_id.map(vocab.term)
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,term_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,7179,come
1,1,9118,dedicate
1,2,8201,cornerstone
1,3,18308,laid
1,4,14872,hand


## Remove insignificant words

We use SKlearn's TFIDF vectorizor to quicky get a TFIDF vector space, which we use only to filter the words in our corpus.

In [6]:
vectorizer = TfidfVectorizer(use_idf=1, stop_words='english', token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+')
X = vectorizer.fit_transform(docs['speech'].values.tolist())
v = pd.DataFrame(vectorizer.get_feature_names(), columns=['term_str'])
v['idf'] = vectorizer.idf_

In [7]:
v.sort_values('idf', ascending=False).head(10)

Unnamed: 0,term_str,idf
0,aaa,7.176906
16271,laptop,7.176906
16230,landless,7.176906
16239,landreau,7.176906
16242,landscaping,7.176906
16244,landslided,7.176906
16246,landward,7.176906
16249,langdon,7.176906
16250,langen,7.176906
16251,langfang,7.176906


## Export corpus for MALLET 

In [8]:
corpus = tx.gather_tokens(tokens, level=0, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

In [9]:
corpus.head()

Unnamed: 0,doc_id,doc_content
0,1,come dedicate cornerstone laid hand almighty t...
1,2,members congress meeting constitutional requir...
2,3,since close last congress nation lost presiden...
3,4,moderator members council understanding purpos...
4,5,countrymen one contemplate current conditions ...


In [10]:
corpus.to_csv('speech-corpus.csv', index=False)

In [9]:
!{mallet_path}

usage: dirname path
usage: dirname path
Unrecognized command: 
Mallet 2.0 commands: 

  import-dir        load the contents of a directory into mallet instances (one per file)
  import-file       load a single file into mallet instances (one per line)
  import-svmlight   load SVMLight format data files into Mallet instances
  train-classifier  train a classifier from Mallet data files
  classify-dir      classify data from a single file with a saved classifier
  classify-file     classify the contents of a directory with a saved classifier
  train-topics      train a topic model from Mallet data files
  infer-topics      use a trained topic model to infer topics for new documents
  evaluate-topics   estimate the probability of new documents under a trained model
  hlda              train a topic model using Hierarchical LDA
  prune             remove features based on frequency or information gain
  split             divide data into testing, training, and validation po

In [10]:
!{mallet_path} import-file --input speech-corpus.csv --output speech-corpus.mallet --keep-sequence TRUE

usage: dirname path
usage: dirname path
Error: Could not find or load main class cc.mallet.classify.tui.Csv2Vectors
Caused by: java.lang.ClassNotFoundException: cc.mallet.classify.tui.Csv2Vectors


In [19]:
!{mallet_path} train-topics --input speech-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters}

/bin/sh: {mallet_path}: command not found
