# Topic Model for POTUS Speech Corpus using Gensim

### Imports

In [43]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/williamLi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Import stopwords from NLTK

In [29]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

### Import corpus

In [30]:
COLUMNS=['doc_id','date','pres','title','speech']
docs = pd.DataFrame(columns=COLUMNS)

In [31]:
import os
import re
_id = 1
for filename in os.listdir('./speeches'):
    if filename == '.DS_Store':
        continue
    for speech in os.listdir('./speeches/' + filename):
        temp = open('./speeches/' + filename + '/' + speech, 'r', encoding='utf-8').readlines()
        obj = {}
        obj['doc_id'] = _id
        date = re.findall('"([^"]*)"', temp[1])
        obj['date'] = date[0] if len(date) > 0 else None
        obj['pres'] = filename
        obj['title'] = re.findall('"([^"]*)"', temp[0])[0]
        obj['speech']= "".join(temp[2:])
    
        obj = pd.DataFrame(obj, index=[0])
        docs = docs.append(obj, ignore_index=True)
        _id += 1
docs = docs.set_index("doc_id")
docs.head()

Unnamed: 0_level_0,date,pres,title,speech
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"August 10, 1927",coolidge,Address at the Opening of Work on Mount Rushmo...,We have come here to dedicate a cornerstone th...
2,"December 8, 1925",coolidge,Third Annual Message,Members of the Congress: In meeting the consti...
3,"December 6, 1923",coolidge,First Annual Message,Since the close of the last Congress the Natio...
4,"October 20, 1925",coolidge,Message Regarding Relationship of Church and S...,"Mr. Moderator, Members Of The Council:\nIt is ..."
5,"March 4, 1925",coolidge,Inaugural Address,\nMy Countrymen:\n\nNo one can contemplate cur...


### Remove newline characters

In [34]:
data = docs.speech.values.tolist()
data = [re.sub('\n+', ' ', sent) for sent in data]

### Tokenize

In [36]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
data_words = list(sent_to_words(data))

print(data_words[:1])

[['we', 'have', 'come', 'here', 'to', 'dedicate', 'cornerstone', 'that', 'was', 'laid', 'by', 'the', 'hand', 'of', 'the', 'almighty', 'on', 'this', 'towering', 'wall', 'of', 'rushmore', 'in', 'the', 'heart', 'of', 'the', 'black', 'hills', 'is', 'to', 'be', 'inscribed', 'memorial', 'which', 'will', 'represent', 'some', 'of', 'the', 'outstanding', 'features', 'of', 'four', 'of', 'our', 'presidents', 'laid', 'on', 'by', 'the', 'hand', 'of', 'great', 'artist', 'in', 'sculpture', 'this', 'memorial', 'will', 'crown', 'the', 'height', 'of', 'land', 'between', 'the', 'rocky', 'mountains', 'and', 'the', 'atlantic', 'seaboard', 'where', 'coming', 'generations', 'may', 'view', 'it', 'for', 'all', 'time', 'it', 'is', 'but', 'natural', 'that', 'such', 'design', 'should', 'begin', 'with', 'george', 'washington', 'for', 'with', 'him', 'begins', 'that', 'which', 'is', 'truly', 'characteristic', 'of', 'america', 'he', 'represents', 'our', 'independence', 'our', 'constitution', 'our', 'liberty', 'he', '

### Build the bigram and trigram models

In [41]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[data_words[0]]])



['we', 'have', 'come', 'here', 'to', 'dedicate', 'cornerstone', 'that', 'was', 'laid', 'by', 'the', 'hand', 'of', 'the', 'almighty', 'on', 'this', 'towering', 'wall', 'of', 'rushmore', 'in', 'the', 'heart', 'of', 'the', 'black_hills', 'is', 'to', 'be', 'inscribed', 'memorial', 'which', 'will', 'represent', 'some', 'of', 'the', 'outstanding', 'features', 'of', 'four', 'of', 'our', 'presidents', 'laid', 'on', 'by', 'the', 'hand', 'of', 'great', 'artist', 'in', 'sculpture', 'this', 'memorial', 'will', 'crown', 'the', 'height', 'of', 'land', 'between', 'the', 'rocky_mountains', 'and', 'the', 'atlantic_seaboard', 'where', 'coming', 'generations', 'may', 'view', 'it', 'for', 'all', 'time', 'it', 'is', 'but', 'natural', 'that', 'such', 'design', 'should', 'begin', 'with', 'george_washington', 'for', 'with', 'him', 'begins', 'that', 'which', 'is', 'truly', 'characteristic', 'of', 'america', 'he', 'represents', 'our', 'independence', 'our', 'constitution', 'our', 'liberty', 'he', 'formed', 'the

### Remove Stopwords, Make Bigrams and Lemmatize

In [13]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [38]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


[['come', 'dedicate', 'cornerstone', 'lay', 'hand', 'almighty', 'tower', 'wall', 'heart', 'black_hill', 'inscribe', 'memorial', 'represent', 'outstanding', 'feature', 'president', 'lay', 'hand', 'great', 'artist', 'sculpture', 'memorial', 'crown', 'height', 'land', 'rocky_mountain', 'come', 'generation', 'may', 'view', 'time', 'natural', 'design', 'begin', 'george_washington', 'begin', 'truly', 'characteristic', 'america', 'represent', 'independence', 'constitution', 'liberty', 'form', 'high', 'aspiration', 'entertain', 'people', 'permanent', 'institution', 'government', 'stand', 'foremost', 'disciple', 'order', 'liberty', 'statesman', 'inspire', 'vision', 'outrank', 'mortal', 'greatness', 'come', 'thomas_jefferson', 'whose', 'wisdom', 'insure', 'government', 'washington', 'form', 'entrust', 'administration', 'people', 'emphasize', 'element', 'self', 'government', 'enshrine', 'american', 'institution', 'way', 'demonstrate', 'practical', 'would', 'permanent', 'likewise', 'embody', 'spir

### Create the Dictionary and Corpus needed for Topic Modeling

In [21]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 3), (15, 7), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 4), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 4), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 5), (47, 1), (48, 1), (49, 1), (50, 2), (51, 2), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1), (57, 1), (58, 2), (59, 1), (60, 2), (61, 1), (62, 2), (63, 1), (64, 3), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 2), (74, 1), (75, 3), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 3), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 3), (104, 1), (105, 1), (106, 1), (107, 2), (108, 2), (109, 4), (110, 1)

In [23]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 2),
  ('abraham_lincoln', 1),
  ('accomplishment', 1),
  ('acquire', 1),
  ('action', 1),
  ('add', 2),
  ('adequate', 1),
  ('administration', 1),
  ('admiration', 1),
  ('advantage', 2),
  ('allegiance', 1),
  ('almighty', 1),
  ('altogether', 1),
  ('amazed', 1),
  ('america', 3),
  ('american', 7),
  ('ancient', 1),
  ('appreciate', 1),
  ('art', 1),
  ('artist', 1),
  ('aspiration', 1),
  ('begin', 4),
  ('beheld', 1),
  ('belov', 1),
  ('beneficence', 1),
  ('black_hill', 1),
  ('bring', 1),
  ('build', 2),
  ('carve', 1),
  ('certain', 1),
  ('characteristic', 1),
  ('citizenship', 1),
  ('civil', 1),
  ('close', 1),
  ('colony', 1),
  ('columbus', 1),
  ('combine', 1),
  ('come', 4),
  ('comprehend', 1),
  ('conception', 1),
  ('conclusion', 1),
  ('constitute', 1),
  ('constitution', 1),
  ('continent', 1),
  ('continue', 3),
  ('cornerstone', 1),
  ('country', 5),
  ('countryman', 1),
  ('courage', 1),
  ('crown', 1),
  ('dakota', 2),
  ('day', 2),
  ('decidedly', 

### Build LDA model

In [44]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

### Print the Keyword in the 20 topics

In [45]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"government" + 0.009*"make" + 0.007*"work" + 0.007*"great" + '
  '0.007*"would" + 0.006*"law" + 0.005*"american" + 0.005*"good" + '
  '0.005*"need" + 0.005*"country"'),
 (1,
  '0.062*"cuba" + 0.035*"cuban" + 0.029*"israel" + 0.028*"palestinian" + '
  '0.020*"region" + 0.014*"muslim" + 0.013*"lebanon" + 0.013*"kosovo" + '
  '0.012*"italian" + 0.012*"middle_east"'),
 (2,
  '0.020*"november" + 0.019*"hon" + 0.019*"provisional" + 0.018*"minister" + '
  '0.016*"queen" + 0.014*"colon" + 0.014*"island" + 0.012*"annexation" + '
  '0.011*"sailor" + 0.010*"vessel"'),
 (3,
  '0.015*"northern_ireland" + 0.014*"nafta" + 0.009*"belfast" + '
  '0.004*"unsteady" + 0.003*"loyalist" + 0.003*"kantor" + 0.002*"mack" + '
  '0.002*"frenzel" + 0.002*"encaptur" + 0.002*"doodle"'),
 (4,
  '0.018*"state" + 0.014*"government" + 0.010*"may" + 0.008*"united" + '
  '0.007*"country" + 0.007*"public" + 0.007*"make" + 0.007*"would" + '
  '0.007*"congress" + 0.006*"power"'),
 (5,
  '0.017*"world" + 0.014*

### Compute Model Perplexity and Coherence Score

In [46]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.65931963845362

Coherence Score:  0.4304970555225175


### Visualize the topics

In [47]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis