In [41]:
import pandas as pd
import urllib
from textblob import TextBlob, Word, Blobber 
import numpy as np

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Visualization
import matplotlib.pyplot as plt
import matplotlib
% matplotlib inline
import matplotlib.patches as mpatches
import seaborn as sns

from gensim import corpora, models
import pyLDAvis.gensim

In [2]:
# Read the three debate transcripts, each as one string, decoding the files
# as ISO-8859-1. The results are bound to data1, data2 and data3 in file order.
transcripts = []
for debate_file in ('ClintonPresDebate1.txt',
                    'ClintonPresDebate2.txt',
                    'ClintonPresDebate3.txt'):
    with open(debate_file, 'r', encoding = "ISO-8859-1" ) as myfile:
        transcripts.append(myfile.read())

data1, data2, data3 = transcripts


In [3]:
# Give the raw transcript strings descriptive names for the rest of the notebook.
clinton_debate_1, clinton_debate_2, clinton_debate_3 = data1, data2, data3

In [5]:
def text_process(text):
    '''
    Turn a raw string into a list of cleaned, stemmed tokens.

    Steps, in order:
        1. Tokenize on runs of word characters (regex \\w+), which drops
           punctuation and splits on it (apostrophes included)
        2. Lowercase each token and drop NLTK English stopwords
        3. Stem each remaining token with the Porter stemmer

    Parameters
    ----------
    text : str or null
        The text to process. A null value (as judged by pd.isnull) is
        accepted and yields an empty list.

    Returns
    -------
    list of str
        Lowercased, stopword-free, stemmed tokens in their original order.
    '''
    if pd.isnull(text):
        return []

    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # removing any stopwords
    # Build the stopword collection once: calling stopwords.words() inside the
    # comprehension re-evaluated it for every token, and a set gives
    # constant-time membership checks.
    english_stopwords = set(stopwords.words('english'))
    lowered_tokens = (token.lower() for token in tokens)
    kept_tokens = [token for token in lowered_tokens
                   if token not in english_stopwords]

    # stemming
    porter_stemmer = PorterStemmer()
    return [porter_stemmer.stem(token) for token in kept_tokens]

In [28]:
# Wrap the cleaned debate-1 tokens in a list: `texts` is a list of documents
# (each a list of tokens), and here it holds exactly one document.
texts = [text_process(clinton_debate_1)]

In [29]:
# Turn on pyLDAvis's notebook integration so prepared visualisations are
# displayed in the cell output.
pyLDAvis.enable_notebook()

# Seed NumPy's global random number generator.
# NOTE(review): presumably intended to make the LDA fit below repeatable;
# that only holds if this cell is run right before the model is trained.
np.random.seed(42)

In [30]:
# Map every distinct token in `texts` to an integer id.
dictionary = corpora.Dictionary(texts)

# Encode each document as a bag of words: a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit a 2-topic LDA model with 10 passes over the corpus.
# minimum_probability=0 is passed so that no topic is filtered out for being
# too unlikely. random_state pins the model's own random number generator, so
# the fit is repeatable without relying on np.random.seed having been called
# in another cell immediately beforehand.
ldamodel = models.ldamodel.LdaModel(corpus,
                                    id2word = dictionary,
                                    num_topics = 2,
                                    passes = 10, minimum_probability=0,
                                    random_state = 42)

In [31]:
# Show the bag-of-words encoding of every document in `texts`.
for document_tokens in texts:
    bag_of_words = dictionary.doc2bow(document_tokens)
    print(bag_of_words)

[(0, 1), (1, 29), (2, 4), (3, 36), (4, 2), (5, 7), (6, 1), (7, 1), (8, 22), (9, 1), (10, 7), (11, 3), (12, 19), (13, 15), (14, 17), (15, 21), (16, 5), (17, 6), (18, 7), (19, 2), (20, 1), (21, 4), (22, 1), (23, 37), (24, 17), (25, 8), (26, 13), (27, 31), (28, 6), (29, 6), (30, 4), (31, 25), (32, 14), (33, 21), (34, 18), (35, 3), (36, 7), (37, 8), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 4), (44, 1), (45, 4), (46, 4), (47, 19), (48, 8), (49, 13), (50, 20), (51, 1), (52, 8), (53, 3), (54, 12), (55, 2), (56, 1), (57, 1), (58, 4), (59, 3), (60, 7), (61, 3), (62, 12), (63, 2), (64, 3), (65, 4), (66, 6), (67, 5), (68, 6), (69, 1), (70, 13), (71, 32), (72, 1), (73, 3), (74, 8), (75, 4), (76, 11), (77, 2), (78, 1), (79, 8), (80, 1), (81, 12), (82, 7), (83, 2), (84, 2), (85, 1), (86, 3), (87, 6), (88, 1), (89, 1), (90, 2), (91, 10), (92, 2), (93, 3), (94, 40), (95, 8), (96, 5), (97, 4), (98, 3), (99, 2), (100, 2), (101, 2), (102, 5), (103, 3), (104, 10), (105, 9), (106, 6), (107, 1), (1

In [32]:
# Build the pyLDAvis visualisation data for the fitted model. As the last
# expression in the cell, the prepared object is what the notebook displays.
# NOTE(review): the ".ix is deprecated" warning in the output points at a
# `topic_term_dists.ix[...]` line, which is not in this notebook, so it
# presumably comes from inside pyLDAvis.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [33]:
# List both topics with their 7 highest-weighted (stemmed) terms.
topic_summaries = ldamodel.print_topics(num_topics=2, num_words=7)
for topic_summary in topic_summaries:
    print(topic_summary)

(0, '0.012*"would" + 0.012*"go" + 0.011*"think" + 0.010*"well" + 0.009*"peopl" + 0.009*"work" + 0.008*"donald"')
(1, '0.002*"would" + 0.002*"know" + 0.002*"well" + 0.002*"think" + 0.002*"go" + 0.002*"work" + 0.002*"peopl"')


UPDATE WITH DEBATE 2 DATA:

In [34]:
# Clean, stopword-filter and stem the second debate transcript into a token list.
new_clinton_debate_2 = text_process(clinton_debate_2)

In [35]:
# Encode debate 2 with the existing dictionary, then continue training the
# model on it as one additional document.
# NOTE(review): `dictionary` was built from debate 1 only, so tokens that first
# appear in debate 2 presumably have no id and are left out of this
# bag-of-words - confirm against gensim's doc2bow behaviour.
debate_2_bow = dictionary.doc2bow(new_clinton_debate_2)
ldamodel.update([debate_2_bow])

In [36]:
# Visualise the model after it has been updated with debate 2.
# `corpus` still holds only the debate-1 document, so passing it unchanged
# would compute the visualisation's document and term statistics from debate 1
# alone while the topics reflect both debates. Build a corpus of everything the
# model has seen so far, without mutating `corpus`, so the cell can be re-run.
corpus_through_debate_2 = corpus + [dictionary.doc2bow(new_clinton_debate_2)]
pyLDAvis.gensim.prepare(ldamodel, corpus_through_debate_2, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [37]:
# List both topics with their 7 highest-weighted (stemmed) terms, as they
# stand at this point in the notebook.
topic_summaries = ldamodel.print_topics(num_topics=2, num_words=7)
for topic_summary in topic_summaries:
    print(topic_summary)

(0, '0.020*"peopl" + 0.018*"want" + 0.015*"donald" + 0.014*"get" + 0.013*"work" + 0.013*"go" + 0.013*"think"')
(1, '0.001*"would" + 0.001*"know" + 0.001*"well" + 0.001*"think" + 0.001*"go" + 0.001*"work" + 0.001*"peopl"')


UPDATE WITH DEBATE 3 DATA:

In [38]:
# Clean the third debate transcript and continue training the model on it.
new_clinton_debate_3 = text_process(clinton_debate_3)
debate_3_bow = dictionary.doc2bow(new_clinton_debate_3)
ldamodel.update([debate_3_bow])

# Visualise the updated model against every document it has been trained on.
# `corpus` holds only debate 1, so passing it unchanged would leave the
# visualisation's document and term statistics out of step with the model.
# A new list is built, leaving `corpus` untouched.
corpus_through_debate_3 = corpus + [
    dictionary.doc2bow(new_clinton_debate_2),
    debate_3_bow,
]
pyLDAvis.gensim.prepare(ldamodel, corpus_through_debate_3, dictionary)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [39]:
# List both topics with their 7 highest-weighted (stemmed) terms, as they
# stand at this point in the notebook.
topic_summaries = ldamodel.print_topics(num_topics=2, num_words=7)
for topic_summary in topic_summaries:
    print(topic_summary)

(0, '0.018*"go" + 0.018*"think" + 0.015*"peopl" + 0.013*"make" + 0.013*"donald" + 0.012*"said" + 0.012*"get"')
(1, '0.001*"stand" + 0.001*"debt" + 0.001*"opportun" + 0.001*"regul" + 0.001*"protect" + 0.001*"reform" + 0.001*"continu"')
