# Topic modeling
Unsupervised topic modeling using Latent Dirichlet Allocation (LDA).

In [1]:
%matplotlib inline
from collections import defaultdict

import os
import re

import pickle as pkl

import numpy as np

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

from gensim import matutils, corpora
from gensim.models import LdaModel, nmf
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.wrappers import LdaMallet
import scipy.sparse

from scipy.interpolate import splrep, splev

import pyLDAvis
from pyLDAvis.gensim import prepare

from matplotlib import pyplot as plt
import seaborn as sns

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


In [2]:
# MALLET is an optional Java library that has to be downloaded separately.
# Set `mallet` to True to tell Python and Gensim where the local install lives.
mallet = False
if mallet:
    os.environ['MALLET_HOME'] = 'mallet-2.0.8/'
    mallet_path = 'mallet-2.0.8/bin/mallet'

Load the pickled speeches dataframe (this takes a while). The pickle file is created by `cleaning and processing.ipynb`, so that notebook must be run first.

In [3]:
# Read the pickled speeches dataframe back from disk.
# NOTE: unpickling can run arbitrary code -- only load a file you produced yourself.
with open("spacy_speeches_df.pkl", mode="rb") as pickle_file:
    speeches_df = pkl.load(pickle_file)

In [5]:
# order the rows by the `date` column (ascending is the sort_values default)
speeches_df = speeches_df.sort_values("date")

### Vectorize

In [6]:
# scikit-learn's built-in English stop words plus extra terms chosen for this corpus
stop_words = text.ENGLISH_STOP_WORDS.union(["pron", "president", "year", "happen", "thing", "let", "shall", "say",
                                           "henceforth", "heretofore", "probably", "come", "ought", "shown",
                                           "whereof", "think"])

# count vectorizer on the lemmatized text with no named entities
# - stop_words is handed over as a (sorted) list: newer scikit-learn releases validate
#   this parameter and reject a frozenset, while older releases accept a list too
# - token_pattern only includes 3+ letter lowercase words
cv = CountVectorizer(stop_words=sorted(stop_words), min_df=3, max_df=0.8,
                     ngram_range=(1, 3), token_pattern=r"\b[a-z][a-z][a-z]+\b")
data_cv = cv.fit_transform(speeches_df["lemmatized_no_ents"])

# get_feature_names() was dropped from newer scikit-learn in favour of
# get_feature_names_out(); use whichever accessor this installation provides
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# dense document-term matrix (one row per speech, one column per term) and its transpose
dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
tdm = dtm.T

## Topic modeling

#### Latent Dirichlet Allocation (LDA)

In [7]:
## for the standard count vectorizer
# gensim expects a streamed corpus: wrap the sparse term-document matrix,
# whose columns are the documents, so it can be iterated one document at a time
sparse_tdm = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_tdm, documents_columns=True)

# gensim also needs an id -> term lookup; the CountVectorizer vocabulary maps
# term -> column index, so flip it around
id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}

In [8]:
def run_lda(corpus, id2word, num_topics, mallet=False, mallet_path='mallet-2.0.8/bin/mallet'):
    """Fit LDA for one or several topic counts and score each fit with u_mass coherence.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        Passed unchanged to the model constructor and to ``CoherenceModel``.
    id2word : dict
        Mapping from term id to term, passed to the model constructor.
    num_topics : int or list/tuple/range of int
        A single topic count, or a collection of counts to fit one after another.
    mallet : bool, default False
        When True, fit with ``LdaMallet`` instead of ``LdaModel``.
    mallet_path : str
        Path handed to ``LdaMallet``; only used when ``mallet`` is True.

    Returns
    -------
    model
        When ``num_topics`` is a single value. The coherence is printed, not returned.
    (models, coherences) : tuple of two lists
        When ``num_topics`` is a collection; both lists follow its order.
    """

    def fit_and_score(n_topics):
        # The one place a model is built and scored, shared by both calling modes.
        if mallet:
            model = LdaMallet(mallet_path, corpus=corpus, id2word=id2word, num_topics=n_topics)
        else:
            model = LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, passes=5, random_state=42)
        cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
        return model, cm.get_coherence()

    # if trying multiple number of topics
    # (isinstance so that tuples and ranges are swept too, not only plain lists)
    if isinstance(num_topics, (list, tuple, range)):
        models, coherences = [], []
        for nts in num_topics:
            print("running LDA for {} topics".format(nts))
            model, coherence = fit_and_score(nts)
            models.append(model)
            coherences.append(coherence)
        return models, coherences

    # only one number of topics to try
    model, coherence = fit_and_score(num_topics)
    print("U mass coherence:", coherence)
    return model

In [9]:
# Fit a single 20-topic model with gensim's own LDA implementation.
# Note: MALLET LDA only works with count vectorized documents, not TFIDF
lda = run_lda(corpus=corpus, id2word=id2word, num_topics=20, mallet=False)

U mass coherence: -0.47664649557986805


In [10]:
# Request (word, probability) pairs instead of the formatted string, so each term is
# read back exactly as stored in the vocabulary. Regex-parsing the formatted string
# would break multi-word n-gram terms into separate words.
# num_topics=-1 returns every topic rather than a capped number.
topics = lda.show_topics(num_topics=-1, num_words=10, formatted=False)
for topic_id, word_probs in topics:
    # topic ids are 0-based; show them 1-based
    print("{}:".format(topic_id + 1), [word for word, prob in word_probs])

1: ['treaty', 'republic', 'right', 'peace', 'work', 'article', 'war', 'canal', 'company', 'care']
2: ['power', 'law', 'right', 'war', 'peace', 'man', 'union', 'authority', 'world', 'work']
3: ['world', 'new', 'work', 'man', 'peace', 'know', 'war', 'help', 'need', 'force']
4: ['test', 'nuclear', 'work', 'war', 'world', 'treaty', 'new', 'power', 'know', 'peace']
5: ['law', 'election', 'person', 'act', 'officer', 'day', 'power', 'vote', 'duty', 'authority']
6: ['law', 'war', 'land', 'power', 'person', 'territory', 'day', 'duty', 'public', 'know']
7: ['know', 'right', 'world', 'work', 'war', 'power', 'law', 'man', 'far', 'force']
8: ['know', 'job', 'want', 'like', 'just', 'right', 'look', 'thank', 'new', 'deal']
9: ['right', 'work', 'know', 'job', 'world', 'want', 'child', 'just', 'new', 'thank']
10: ['world', 'peace', 'man', 'war', 'business', 'new', 'need', 'increase', 'far', 'law']
11: ['law', 'public', 'power', 'act', 'present', 'duty', 'subject', 'war', 'citizen', 'right']
12: ['world

In [11]:
# Scratch check: glue a list of topic words into one space-separated string.
s = [
    'law', 'power', 'public', 'act', 'present',
    'duty', 'subject', 'war', 'increase', 'citizen',
]
" ".join(s)

'law power public act present duty subject war increase citizen'

### Visualize topics over time

In [12]:
# Build a gensim Dictionary from the existing corpus and the id -> term mapping;
# the next cell uses it to turn tokenized speeches into bag-of-words vectors.
gensim_dict = corpora.Dictionary.from_corpus(corpus, id2word=id2word)

In [13]:
topic_ids = list(range(lda.num_topics))
presidents = speeches_df["speaker"].unique()

# Walk the speeches in the dataframe's own row order, so that entry i of every
# probability list describes the same speech as row i of speeches_df and therefore
# lines up with speeches_df["date"] when plotted. Iterating speaker by speaker
# reorders the series whenever two speakers' speeches interleave in time.
topics_over_time = defaultdict(list)
for speech in speeches_df["lemmatized_no_ents"].values:
    # NOTE(review): splitting on spaces yields single words only, so multi-word
    # n-gram terms produced by the vectorizer are never matched here -- confirm
    # this is intended.
    bow = gensim_dict.doc2bow(speech.split(" ")) # bag of words format
    speech_topics = lda.get_document_topics(bow) # get predicted topics
    speech_prob_by_topic_id = dict(speech_topics)
    for topic_id in topic_ids:
        # a topic missing from the prediction is recorded with probability 0
        topics_over_time[topic_id].append(speech_prob_by_topic_id.get(topic_id, 0))

In [14]:
dates = speeches_df["date"].values

# Three panels per row; the number of rows is derived from the number of topics so
# the grid never runs out of slots (20 topics still gives the original 7 x 3 layout).
n_cols = 3
n_rows = max(1, -(-len(topics_over_time) // n_cols))  # ceiling division

plt.figure(figsize=(15,16))
for idx, (topic, probs) in enumerate(topics_over_time.items()):
    plt.subplot(n_rows, n_cols, idx+1)
    plt.plot(dates, probs, label=topic)
    # topic ids are 0-based; titles are 1-based
    plt.title("Topic {}".format(topic+1))
plt.tight_layout()

### pyLDAvis

In [15]:
# pyLDAvis takes a gensim Dictionary alongside the model and corpus, so build it here.
gensim_dict = corpora.Dictionary.from_corpus(corpus, id2word=id2word)

# can't visualize MALLET LDA models, only standard LDA
lda_vis_data = prepare(
    lda,
    corpus,
    gensim_dict,
    sort_topics=False,
)
pyLDAvis.display(lda_vis_data)

KeyboardInterrupt: 

### Non-Negative Matrix Factorization (NMF)
NMF didn't work as well as LDA on this corpus.

In [None]:
# Fit a 20-topic NMF model with the same passes and seed that run_lda uses,
# then score it with u_mass coherence.
model = nmf.Nmf(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    passes=5,
    random_state=42,
)
cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
coherence = cm.get_coherence()

In [None]:
# Request (word, probability) pairs instead of the formatted string: regex-parsing the
# formatted string would break multi-word n-gram terms into separate words.
# num_topics=-1 returns every topic rather than a capped number.
topics = model.show_topics(num_topics=-1, num_words=10, formatted=False)
for topic_id, word_probs in topics:
    print([word for word, prob in word_probs])