# Topic Model for POTUS Speech Corpus

In [1]:
import os
import random
import re
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import textman as tx

In [2]:
# Location of the MALLET launcher script. Set the MALLET_PATH environment
# variable to point at a different install; when it is unset the original
# author's local path is used, so existing behaviour is unchanged.
mallet_path = os.environ.get(
    'MALLET_PATH',
    r'C:\Users\William\Documents\College\SPR2018\DS5559\mallet-2.0.8\mallet-2.0.8\bin\mallet',
)

# Column layout of the speech table.
COLUMNS = ['doc_id', 'date', 'pres', 'title', 'speech']

# Start from an empty frame carrying those columns; the import step populates `docs`.
docs = pd.DataFrame(columns=COLUMNS)

### Import Speeches

In [3]:
import os
import re
# Walk ./speeches/<president>/<speech file> and build one record per speech.
# Each file is read as: line 1 holds the quoted title, line 2 the quoted date
# (may be missing, in which case date is None), and the rest is the speech text.
#
# Records are collected in a list and turned into a DataFrame once at the end:
# growing a frame with DataFrame.append inside the loop is quadratic and the
# method no longer exists in pandas 2.x.
_id = 1
pres = []
records = []
# Sort directory listings so doc_id assignment is the same on every run and
# every filesystem (os.listdir order is otherwise arbitrary).
for filename in sorted(os.listdir('./speeches')):
    pres.append(filename)
    for speech in sorted(os.listdir('./speeches/' + filename)):
        # Context manager guarantees the file handle is closed.
        with open('./speeches/' + filename + '/' + speech, 'r', encoding='utf-8') as fh:
            temp = fh.readlines()
        date = re.findall('"([^"]*)"', temp[1])
        records.append({
            'doc_id': _id,
            'date': date[0] if len(date) > 0 else None,
            'pres': filename,
            'title': re.findall('"([^"]*)"', temp[0])[0],
            'speech': "".join(temp[2:]),
        })
        _id += 1

# Passing COLUMNS keeps the column order (and the columns themselves) stable
# even when no speech files were found.
docs = pd.DataFrame(records, columns=COLUMNS).set_index("doc_id")
docs.head()

Unnamed: 0_level_0,date,pres,title,speech
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"May 16, 1797",adams,Special Session Message to Congress,The personal inconveniences to the members of ...
2,"March 4, 1797",adams,Inaugural Address,"When it was first perceived, in early times, t..."
3,"December 8, 1798",adams,Second Annual Message,Gentlemen of the Senate and Gentlemen of the H...
4,"March 23, 1798",adams,"Proclamation of Day of Fasting, Humiliation an...",As the safety and prosperity of nations ultima...
5,"December 3, 1799",adams,Third Annual Message,It is with peculiar satisfaction that I meet t...


### Convert corpus to tokens and vocab
We use a function from TextMan, a bespoke library that incorporates the text processing routines used in earlier notebooks.

In [4]:
# Tokenise the speeches with TextMan, number the tokens within each document,
# and keep only tokens whose term is selected by the vocabulary's `go` column
# (presumably a keep/stop-word flag set by textman; confirm there).
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='speech')
tokens['token_num'] = tokens.groupby(['doc_id']).cumcount()
tokens = (
    tokens.reset_index()[['doc_id', 'token_num', 'term_id']]
    .loc[lambda frame: frame.term_id.isin(vocab[vocab.go].index)]
    .set_index(['doc_id', 'token_num'])
)

### Add term strings

In [5]:
# Look each token's term_id up in the vocabulary to attach the term text.
tokens['term_str'] = tokens['term_id'].map(vocab['term'])
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,term_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,23092,personal
1,1,16448,inconveniences
1,2,19972,members
1,3,27777,senate
1,4,15709,house


## Remove insignificant words

We use scikit-learn's TF-IDF vectorizer to quickly get a TF-IDF vector space, which we use only to filter the words in our corpus.

In [6]:
# Fit a TF-IDF model over the raw speeches. Tokens are runs of three or more
# ASCII letters, and scikit-learn's built-in English stop words are dropped.
# use_idf must be a real boolean: recent scikit-learn validates parameter
# types and rejects the integer 1.
vectorizer = TfidfVectorizer(use_idf=True, stop_words='english', token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+')
X = vectorizer.fit_transform(docs['speech'].values.tolist())

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary term with its inverse document frequency.
v = pd.DataFrame(feature_names, columns=['term_str'])
v['idf'] = vectorizer.idf_

In [7]:
# Show the ten terms with the highest IDF value.
v.sort_values(by='idf', ascending=False).iloc[:10]

Unnamed: 0,term_str,idf
0,aaa,7.176906
16271,laptop,7.176906
16230,landless,7.176906
16239,landreau,7.176906
16242,landscaping,7.176906
16244,landslided,7.176906
16246,landward,7.176906
16249,langdon,7.176906
16250,langen,7.176906
16251,langfang,7.176906


## Export corpus for MALLET 

In [8]:
# Collapse the token table back to one row per document (index level 0) and
# expose the gathered terms under the column name `doc_content`.
corpus = (
    tx.gather_tokens(tokens, level=0, col='term_str')
    .reset_index()
    .rename(columns={'term_str': 'doc_content'})
)

In [9]:
# Preview the first rows of the per-document corpus table.
corpus.head()

Unnamed: 0,doc_id,doc_content
0,1,personal inconveniences members senate house r...
1,2,first perceived early times middle course amer...
2,3,gentlemen senate gentlemen house representativ...
3,4,safety prosperity nations ultimately essential...
4,5,peculiar satisfaction meet congress united sta...


In [10]:
# Write the corpus to speech-corpus.csv without the DataFrame index; this file
# is the --input of the MALLET import-file step.
corpus.to_csv('speech-corpus.csv', index=False)

In [11]:
# Run MALLET with no arguments; it prints its list of available commands
# (presumably used here as a check that mallet_path resolves; confirm).
!{mallet_path}

Mallet 2.0 commands: 
  import-dir        load the contents of a directory into mallet instances (one per file)
  import-file       load a single file into mallet instances (one per line)
  import-svmlight   load a single SVMLight format data file into mallet instances (one per line)
  info              get information about Mallet instances
  train-classifier  train a classifier from Mallet data files
  classify-dir      classify data from a single file with a saved classifier
  classify-file     classify the contents of a directory with a saved classifier
  classify-svmlight classify data from a single file in SVMLight format
  train-topics      train a topic model from Mallet data files
  infer-topics      use a trained topic model to infer topics for new documents
  evaluate-topics   estimate the probability of new documents given a trained model
  prune             remove features based on frequency or information gain
  split             divide data into testing, training, and va

In [12]:
!{mallet_path} import-file --input speech-corpus.csv --output novels-corpus.mallet --keep-sequence TRUE

Picked up _JAVA_OPTIONS: -Xmx1024m


In [None]:
!{mallet_path} train-topics --input speech-corpus.mallet --num-topics {20} --num-iterations {1000} \
--output-doc-topics speech-topics.txt \
--output-topic-keys speech-topics-keys.txt \
--word-topic-counts-file speech-word-topic-counts-file.txt \
--topic-word-weights-file speech-topic-word-weights-file.txt \
--xml-topic-report speech-topic-report.xml \
--xml-topic-phrase-report speech-topic-phrase-report.xml \
--show-speech-interval {100} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file speech-diagnostics.xml