In [1]:
import pandas as pd
import regex, re, sys, nltk

from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from gensim.models.hdpmodel import HdpModel
from gensim.corpora.dictionary import Dictionary

from collections import Counter

import matplotlib.pyplot as plt

from pathlib import Path

# Put the sibling "src" folder on the module search path; it holds the
# python file with all the helper functions used in this notebook.
src_path = str(Path.cwd().parent.joinpath("src"))
sys.path.append(src_path)
import topic_classification as tc

In [2]:
# Read the cleaned speeches file, keeping only the title and the speech body
filepath = Path.cwd().parent.joinpath("speeches_csv", "all_speeches_cleaned.txt")
df = pd.read_csv(filepath, usecols=["title", "content"])
df.head()

Unnamed: 0,title,content
0,CGI_2013,Hillary Clinton: Thank you very much. I have t...
1,Prayer_Breakfast_2016,"Well, good morning. Giving all praise and hono..."
2,Security_Team_Announcement,"Good morning, everybody. I hope you all had a ..."
3,Cairo_University,Thank you so much. Good afternoon. I am honore...
4,Umpqua_Community_College_Shootings,There's been another mass shooting in America ...


In [3]:
# Keep the original 'content' column untouched and store a lowercased copy in 'text'
lowercased_content = df['content'].str.lower()
df['text'] = lowercased_content
df.head()

Unnamed: 0,title,content,text
0,CGI_2013,Hillary Clinton: Thank you very much. I have t...,hillary clinton: thank you very much. i have t...
1,Prayer_Breakfast_2016,"Well, good morning. Giving all praise and hono...","well, good morning. giving all praise and hono..."
2,Security_Team_Announcement,"Good morning, everybody. I hope you all had a ...","good morning, everybody. i hope you all had a ..."
3,Cairo_University,Thank you so much. Good afternoon. I am honore...,thank you so much. good afternoon. i am honore...
4,Umpqua_Community_College_Shootings,There's been another mass shooting in America ...,there's been another mass shooting in america ...


In [4]:
# Split every lowercased speech into a list of sentences
df['tokenized'] = df['text'].apply(sent_tokenize)
df[['text', 'tokenized']].head()

Unnamed: 0,text,tokenized
0,hillary clinton: thank you very much. i have t...,"[hillary clinton: thank you very much., i have..."
1,"well, good morning. giving all praise and hono...","[well, good morning., giving all praise and ho..."
2,"good morning, everybody. i hope you all had a ...","[good morning, everybody., i hope you all had ..."
3,thank you so much. good afternoon. i am honore...,"[thank you so much., good afternoon., i am hon..."
4,there's been another mass shooting in america ...,[there's been another mass shooting in america...


In [5]:
# A token is a run of ASCII letters that begins at a word boundary, so an
# expression like '9th' produces no token (its first character is a digit)
tokenizer = RegexpTokenizer(r'\b(?!\d)[a-zA-Z]+')
lemmatizer = WordNetLemmatizer()

# Each row's sentence list is handed to tc.normalize_text together with the
# tokenizer and the lemmatizer (forwarded as extra positional arguments)
df['normalized'] = df['tokenized'].apply(tc.normalize_text, args=(tokenizer, lemmatizer))
df[['text', 'tokenized', 'normalized']].head()

Unnamed: 0,text,tokenized,normalized
0,hillary clinton: thank you very much. i have t...,"[hillary clinton: thank you very much., i have...",hillary clinton thank you very much i have th...
1,"well, good morning. giving all praise and hono...","[well, good morning., giving all praise and ho...",well good morning give all praise and honor t...
2,"good morning, everybody. i hope you all had a ...","[good morning, everybody., i hope you all had ...",good morning everybody i hope you all have a ...
3,thank you so much. good afternoon. i am honore...,"[thank you so much., good afternoon., i am hon...",thank you so much good afternoon i be honor t...
4,there's been another mass shooting in america ...,[there's been another mass shooting in america...,there s be another mass shooting in america t...


In [6]:
# First pass: drop the standard English stopwords shipped with NLTK
STOPWORDS = set(stopwords.words('english'))
df['fully_processed'] = df['normalized'].apply(lambda text: tc.remove_stopwords(text, STOPWORDS))

# Count, for every word, the number of speeches the word is in.
# Each speech contributes its distinct words exactly once. They are inserted
# in sorted order because most_common() breaks ties by insertion order, and the
# iteration order of a set of strings changes between interpreter runs (hash
# randomization) -- without sorting, the words tied at the cutoff below could
# differ from one kernel restart to the next.
cnt = Counter()
for text in df['fully_processed'].values:
    cnt.update(sorted(set(text.split())))

# words that are in most of the speeches
N_WIDESPREAD_WORDS = 155  # how many of the most widespread words are treated as stopwords
in_most_speeches = [word for word, n_speeches in cnt.most_common(N_WIDESPREAD_WORDS)]

# hand-picked additional words to drop
extra = ['mr', 'question', 'sure', 'obama', 'really', 'try', 'lot', 'important', 'million', 'talk', 'va', 'dr', 'romney',
        'folk', 'governor', 'republican', 'king', 'heart']

STOPWORDS_extra = set(in_most_speeches + extra)
# remove some words from the stopwords list that might be important
STOPWORDS_extra = STOPWORDS_extra - {'war', 'care', 'child', 'family', 'job', 'law', 'protect', 'security', 'power'}

# Second pass: drop the corpus-specific stopwords as well
df['fully_processed'] = df['fully_processed'].apply(lambda text: tc.remove_stopwords(text, STOPWORDS_extra))
df[['text', 'tokenized', 'normalized', 'fully_processed']].head()

Unnamed: 0,text,tokenized,normalized,fully_processed
0,hillary clinton: thank you very much. i have t...,"[hillary clinton: thank you very much., i have...",hillary clinton thank you very much i have th...,hillary clinton pleasure introduce speaker con...
1,"well, good morning. giving all praise and hono...","[well, good morning., giving all praise and ho...",well good morning give all praise and honor t...,morning praise honor god morning everyone orga...
2,"good morning, everybody. i hope you all had a ...","[good morning, everybody., i hope you all had ...",good morning everybody i hope you all have a ...,morning wonderful thanksgiving announce econom...
3,thank you so much. good afternoon. i am honore...,"[thank you so much., good afternoon., i am hon...",thank you so much good afternoon i be honor t...,afternoon honor timeless city cairo host remar...
4,there's been another mass shooting in america ...,[there's been another mass shooting in america...,there s be another mass shooting in america t...,mass shooting college oregon family mom dad ch...


In [7]:
# One list of word tokens per speech
texts = [text.split() for text in df['fully_processed'].values]

# Create a dictionary
# In gensim a dictionary is a mapping between words and their integer id
dictionary = Dictionary(texts)

# Filter out extremes to limit the number of features:
# keep tokens that appear in at least 3 speeches and in no more than 85% of
# them, then retain at most the 5000 most frequent of the remaining tokens
dictionary.filter_extremes(
    no_below=3,
    no_above=0.85,
    keep_n=5000
)

# Create the bag-of-words format (list of (token_id, token_count))
corpus = [dictionary.doc2bow(text) for text in texts]

# HDP training is stochastic; fix the seed so that the topics printed below
# are reproducible after "Restart Kernel -> Run All"
HDP_RANDOM_STATE = 42
Hdp_model = HdpModel(corpus=corpus, id2word=dictionary, random_state=HDP_RANDOM_STATE)

In [12]:
from pprint import pprint

# Fetch the topics reported by the HDP model, limited to the ten
# highest-weighted words per topic, then pretty-print them
hdp_topic_summaries = Hdp_model.print_topics(num_words=10)
pprint(hdp_topic_summaries)

[(0,
  '0.005*job + 0.004*family + 0.004*care + 0.003*economy + 0.003*business + '
  '0.003*security + 0.003*health + 0.003*war + 0.003*law + 0.003*child'),
 (1,
  '0.005*must + 0.005*security + 0.004*war + 0.004*nuclear + 0.004*peace + '
  '0.003*child + 0.003*job + 0.003*family + 0.003*freedom + 0.003*law'),
 (2,
  '0.005*job + 0.004*economy + 0.004*health + 0.004*business + 0.004*security '
  '+ 0.004*iran + 0.004*congress + 0.003*pay + 0.003*care + 0.003*family'),
 (3,
  '0.006*care + 0.006*job + 0.006*health + 0.004*business + 0.004*economy + '
  '0.004*system + 0.003*reform + 0.003*tax + 0.003*family + 0.003*pay'),
 (4,
  '0.004*war + 0.004*must + 0.003*security + 0.003*terrorist + 0.003*military '
  '+ 0.003*attack + 0.003*isil + 0.003*syria + 0.002*al + 0.002*protect'),
 (5,
  '0.008*gun + 0.005*job + 0.004*health + 0.004*care + 0.004*tax + 0.003*kid + '
  '0.003*law + 0.003*system + 0.003*percent + 0.003*family'),
 (6,
  '0.004*family + 0.003*job + 0.003*insurance + 0.003*care