# Semi-supervised topic modeling
In this notebook I try semi-supervised topic modeling: each topic is built around a set of seed ("anchor") words that I provide. The model is implemented with CorEx (the `corextopic` package).

In [1]:
%matplotlib inline
import os

from collections import defaultdict

import pickle as pkl

import numpy as np

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

import scipy.sparse

from scipy.interpolate import splrep, splev

from matplotlib import pyplot as plt
import seaborn as sns

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

from datetime import datetime

import umap

  import numba.targets


### Load speeches

In [2]:
# Load the pickled speeches DataFrame (large file, so this takes a while).
# Flip the flag to False to skip the load; later cells still need `speeches_df`
# to exist in the kernel.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
load_spacy_speeches_df = True
if load_spacy_speeches_df:
    with open("spacy_speeches_df.pkl", mode="rb") as pickle_file:
        speeches_df = pkl.load(pickle_file)

In [3]:
# Keep only the speeches given on or after 1 January 1900.
# The filter and the index reset are chained so the cell yields a fresh,
# 0..n-1 indexed frame without mutating a filtered slice in place.
# (A clean positional index is needed later, when topic columns are
# concatenated column-wise onto this frame.)
speeches_post_1900 = speeches_df[speeches_df["date"] >= datetime(1900, 1, 1)].reset_index(drop=True)

### Vectorize

In [4]:
# Extend scikit-learn's built-in English stop words with corpus-specific filler terms
# ("pron" is presumably the residue of a lemmatizer pronoun placeholder - TODO confirm).
stop_words = text.ENGLISH_STOP_WORDS.union(["pron", "president", "year", "happen", "thing", "let", "shall", "say",
                                           "henceforth", "heretofore", "probably", "come", "ought", "shown",
                                           "whereof"])

# Count vectorizer on the lemmatized text with no named entities.
# * stop_words is handed over as a (sorted) list: newer scikit-learn releases validate
#   this parameter and reject a frozenset, while a list works on old and new versions.
# * token_pattern only keeps lowercase alphabetic tokens of 3+ letters.
# * min_df=3 / max_df=0.8 drop terms seen in fewer than 3 documents or in more than 80% of them.
cv = CountVectorizer(stop_words=sorted(stop_words), min_df=3, max_df=0.8,
                     ngram_range=(1,3), token_pattern="\\b[a-z][a-z][a-z]+\\b")
data_cv = cv.fit_transform(speeches_post_1900["lemmatized_no_ents"])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
feature_name_getter = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names

# Dense document-term matrix (rows = speeches, columns = terms) and its transpose.
# NOTE(review): toarray() materialises the full dense matrix, and dtm/tdm are not read by
# the later cells visible here - consider dropping them if memory is tight.
dtm = pd.DataFrame(data_cv.toarray(), columns=feature_name_getter())
tdm = dtm.T

### Corex

In [5]:
# Vocabulary handed to CorEx: one entry per column of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# when it exists and fall back to the old accessor on older installs.
vocabulary_getter = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
words = list(np.asarray(vocabulary_getter()))

# Anchor (seed) words, one inner list per topic; the list position of each group
# is the index of the topic it seeds.
anchor_words = [["immigration", "border"], ["health", "care"], ["national", "security"],
               ["economy"], ["trade"], ["education"], ["war", "military"]]

In [6]:
# Build and fit the anchored CorEx model.
# n_hidden sets how many topics are learned; seed fixes the random state.
topic_model = ct.Corex(n_hidden=7, words=words, seed=42)

# anchors seeds the topics with the word groups defined above and anchor_strength
# sets how strongly they are weighted; docs receives the raw speech text.
topic_model.fit(
    data_cv,
    words=words,
    docs=speeches_post_1900["content"].values,
    anchors=anchor_words,
    anchor_strength=2,
)

<corextopic.corextopic.Corex at 0x1a1e64b8d0>

In [7]:
# Print the top 10 words of every topic in the CorEx model, one line per topic.
topics = topic_model.get_topics(n_words=10)
for topic_idx, topic_entries in enumerate(topics):
    # Every entry is a tuple whose first element is the word. Taking element 0 avoids
    # depending on the tuple length (newer corextopic releases add a sign field to the
    # (word, score) pair) and also copes with a topic that has no words at all.
    topic_words = [entry[0] for entry in topic_entries]
    print('{}: '.format(topic_idx) + ', '.join(topic_words))

0: immigration, border, email, bring job, obamacare, folk, repeal replace, deplete, everybody, isis
1: care, health, tax, taxis, money, regulation, business, cut, crime, percent
2: security, national, reduce, program, meet, proposal, problem, social, plan, provide
3: economy, budget, growth, economic, investment, market, cost, revenue, inflation, development
4: trade, manufacture, wall, protect, sell, away, world trade, partnership, open, small
5: education, low, deficit, rate, product, change, fund, worker, create, strong
6: military, war, policy, middle, north, use, win, turn, defense, political


In [8]:
# Bar chart of the total correlation captured by each topic (one bar per topic).
plt.figure(figsize=(10, 5))
plt.bar(
    x=range(len(topic_model.tcs)),
    height=topic_model.tcs,
    width=0.5,
    color='#4e79a7',
)
plt.xlabel('Topic', fontsize=16)
plt.ylabel('Total Correlation (nats)', fontsize=16)
# trailing semicolon keeps the Text repr out of the cell output
plt.title("topic correlation (higher is better)", fontsize=16);

### Visualizing the results

In [9]:
# Attach the CorEx topic assignments to the speeches.
# After the transpose each row of the label matrix belongs to one topic; it is cast
# to 0/1 integers and becomes one "topic N" column (N counted from 1).
topic_bool = topic_model.labels.T.astype(int)
topic_cols = dict(
    ("topic {}".format(position), flags)
    for position, flags in enumerate(topic_bool, start=1)
)

# Column-wise concat: aligns on the index, so both frames must share the same 0..n-1 index.
speeches_with_topics = pd.concat(
    [speeches_post_1900, pd.DataFrame(topic_cols)],
    axis=1,
)

In [10]:
# Give the generic "topic N" columns readable names, in the same order as the anchor groups.
# Renaming by column name (instead of overwriting positions 4..10) cannot mislabel
# unrelated columns if the speeches frame gains or loses a column, and re-running
# the cell is a harmless no-op because the "topic N" keys are then simply absent.
topic_display_names = ["immigration", "health care", "national security", "economy", "trade", "education",  "conflict"]
speeches_with_topics = speeches_with_topics.rename(
    columns={"topic {}".format(position): display_name
             for position, display_name in enumerate(topic_display_names, start=1)}
)
cols = list(speeches_with_topics.columns)

In [11]:
# Put the presidents in the right order for plotting.
# unique() lists speakers in order of first appearance in the frame; the hand-written
# positions below permute that listing.
# NOTE(review): the index list needs exactly the 21 speakers of the current data, in the
# current row order - revisit it whenever the speeches file changes.
speaker_positions = [0, 1, 2, 3, 4, 5, 6, 7, 8,
                     10, 11, 12, 9,
                     14, 15, 13,
                     16, 17, 18, 19, 20]
presidents = speeches_with_topics["speaker"].unique()[speaker_positions]

In [12]:
# Percentage (0-100) of each president's speeches that include each topic.
# The topic columns hold 0/1 flags, so the per-speaker mean is the share of flagged
# speeches. Multiplying by 100 turns that share into a real percentage, which is what
# the variable names and the "percent" axis labels of the plots say these values are
# (previously the stored values were fractions between 0 and 1).
topic_columns = list(speeches_with_topics.columns[4:])  # assumes the topic flags start at column 4
percent_table = speeches_with_topics.groupby("speaker")[topic_columns].mean() * 100

# Nested mapping: topic name -> {president: percent}, presidents kept in display order.
percents_by_president_by_topic = {
    topic_name: {pres: percent_table.at[pres, topic_name] for pres in presidents}
    for topic_name in topic_columns
}

In [13]:
# One bar chart per topic on a 3x3 grid: presidents on the x axis, the stored
# per-president value for that topic on the y axis.
plt.figure(figsize=(18, 16))
for panel_no, (topic_name, by_president) in enumerate(percents_by_president_by_topic.items(), start=1):
    plt.subplot(3, 3, panel_no)
    plt.bar(list(by_president.keys()), list(by_president.values()))
    plt.ylabel("percent of speeches incl. topic")
    plt.xticks(rotation=40)
    plt.title(topic_name, fontsize=18)
plt.tight_layout()
#plt.savefig("topics_by_pres.png", dpi=250)

In [17]:
# Larger stand-alone version of the per-president bar chart for a single topic.
topic = "economy"
plt.figure(figsize=(12, 8))
economy_by_president = percents_by_president_by_topic[topic]
plt.bar(list(economy_by_president), list(economy_by_president.values()))
# horizontal, multi-line y label; labelpad keeps it clear of the tick labels
plt.ylabel(
    "percent\nof\nspeeches\nincluding\ntopic",
    fontsize=16,
    rotation=0,
    labelpad=45,
)
plt.xticks(rotation=50, fontsize=16)
plt.title("'Economy' Topic by President", fontsize=26)
plt.tight_layout()
#plt.savefig("economy_topic.png", dpi=250)

## UMAP

In [15]:
# Fit a UMAP reducer (fixed random_state for reproducibility) on every column from
# position 4 onward, i.e. the topic indicator columns appended to the speeches frame.
# fit() returns the estimator itself, so construction and fitting are chained.
reducer = umap.UMAP(random_state=42).fit(speeches_with_topics.iloc[:, 4:].values)



KeyboardInterrupt: 

In [None]:
# The fitted reducer already stores the low-dimensional coordinates of the data it was
# fitted on in `embedding_`; calling transform() again on that same data is redundant
# and can be slow, so read the stored embedding instead.
embedding = reducer.embedding_

In [None]:
# Topic indicator columns only. "no topic" is dropped explicitly so that re-running this
# cell after the column already exists does not count the flag itself as a topic
# (which would wrongly reset every "no topic" value to 0).
topic_names = speeches_with_topics.columns[4:].drop("no topic", errors="ignore")

# add a column showing if a speech was not assigned a topic for visualization purposes
speeches_with_topics["no topic"] = (speeches_with_topics[topic_names].values.sum(axis=1) == 0).astype(int)

# number of speeches that were assigned no topic at all (shown as the cell output)
int(speeches_with_topics["no topic"].sum())

In [None]:
# One UMAP scatter per indicator column; each point is a speech, coloured by its 0/1 flag
# (palette colour 0 = flag off, palette colour 1 = flag on).
topic_names = speeches_with_topics.columns[4:].values

# Look the palette up once instead of once per speech per panel.
palette = sns.color_palette()

# Four panels per row; the row count follows the number of columns to plot
# (ceil division), so the grid no longer breaks if that number is not exactly 8.
panels_per_row = 4
n_rows = -(-len(topic_names) // panels_per_row)

plt.figure(figsize=(26,14))
for idx, topic in enumerate(topic_names):
    plt.subplot(n_rows, panels_per_row, idx+1)
    colors = [palette[flag] for flag in speeches_with_topics[topic]]
    plt.scatter(embedding[:, 0], embedding[:, 1], alpha=.5, c=colors)
    plt.title(topic, fontsize=16)
plt.tight_layout()
# plt.savefig("umaps.png", dpi=250)

In [None]:
# UMAP scatter of all speeches, highlighting the ones whose speaker is "trump"
# (palette colour 1) against every other speaker (palette colour 0).
palette = sns.color_palette()  # looked up once rather than once per speech
is_trump = (speeches_with_topics["speaker"]=="trump").astype(int)

plt.figure(figsize=(18,10))
colors = [palette[flag] for flag in is_trump]
plt.scatter(embedding[:, 0], embedding[:, 1], alpha=.5, c=colors)
# a title so the figure can be read on its own when the notebook is skimmed
plt.title("UMAP embedding: 'trump' speeches vs. all other speakers", fontsize=16);