In [130]:
import pandas as pd
import urllib
from textblob import TextBlob, Word, Blobber 
import numpy as np

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns

In [131]:
with open('TrumpPresDebate1.txt', 'r', encoding = "ISO-8859-1" ) as myfile:
    data1=myfile.read()
with open('TrumpPresDebate2.txt', 'r', encoding = "ISO-8859-1" ) as myfile:
    data2=myfile.read()
with open('TrumpPresDebate3.txt', 'r', encoding = "ISO-8859-1" ) as myfile:
    data3=myfile.read()

In [132]:
trump_debate_1 = data1
trump_debate_2 = data2
trump_debate_3 = data3

PREPROCESS TEXT:

In [133]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
        1. Tokenizes and removes punctuation
        2. Removes stopwords
        3. Stems
        4. Returns a list of the cleaned text
    '''
    if pd.isnull(text):
        return []
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed=tokenizer.tokenize(text)
    
    # removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # steming
    porter_stemmer = PorterStemmer()
    
    text_processed = [porter_stemmer.stem(word) for word in text_processed]
    
#     try:
#         text_processed.remove('b')
#     except: 
#         pass

    return text_processed ## <-- one small tweak

In [157]:
text_process(trump_debate_1)

['ï',
 'thank',
 'lester',
 'job',
 'flee',
 'countri',
 'go',
 'mexico',
 'go',
 'mani',
 'countri',
 'look',
 'china',
 'countri',
 'term',
 'make',
 'product',
 'devalu',
 'currenc',
 'nobodi',
 'govern',
 'fight',
 'good',
 'fight',
 'win',
 'fight',
 'use',
 'countri',
 'piggi',
 'bank',
 'rebuild',
 'china',
 'mani',
 'countri',
 'thing',
 'lose',
 'good',
 'job',
 'mani',
 'look',
 'happen',
 'mexico',
 'friend',
 'mine',
 'build',
 'plant',
 'said',
 'eighth',
 'wonder',
 'world',
 'build',
 'biggest',
 'plant',
 'anywher',
 'world',
 'sophist',
 'best',
 'plant',
 'unit',
 'state',
 'said',
 'much',
 'ford',
 'leav',
 'see',
 'small',
 'car',
 'divis',
 'leav',
 'thousand',
 'job',
 'leav',
 'michigan',
 'leav',
 'ohio',
 'leav',
 'allow',
 'happen',
 'anymor',
 'far',
 'child',
 'care',
 'concern',
 'mani',
 'thing',
 'think',
 'hillari',
 'agre',
 'probabl',
 'disagre',
 'littl',
 'bit',
 'number',
 'amount',
 'go',
 'perhap',
 'talk',
 'later',
 'stop',
 'job',
 'stolen',
 

In [135]:
texts = [text_process(trump_debate_1)]

FIT LDA MODEL:

In [136]:
from gensim import corpora, models
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

np.random.seed(42)

In [140]:
dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

ldamodel = models.ldamodel.LdaModel(corpus, 
                                    id2word = dictionary, 
                                    num_topics = 2, minimum_probability =0)

In [141]:
for text in texts:
    print(dictionary.doc2bow(text))

[(0, 1), (1, 1), (2, 13), (3, 28), (4, 1), (5, 64), (6, 68), (7, 6), (8, 29), (9, 48), (10, 9), (11, 6), (12, 10), (13, 2), (14, 2), (15, 1), (16, 8), (17, 1), (18, 7), (19, 20), (20, 8), (21, 9), (22, 1), (23, 4), (24, 1), (25, 40), (26, 7), (27, 17), (28, 3), (29, 3), (30, 7), (31, 3), (32, 35), (33, 1), (34, 4), (35, 12), (36, 5), (37, 5), (38, 1), (39, 4), (40, 4), (41, 6), (42, 20), (43, 1), (44, 18), (45, 12), (46, 3), (47, 3), (48, 2), (49, 8), (50, 2), (51, 3), (52, 4), (53, 1), (54, 8), (55, 1), (56, 8), (57, 3), (58, 43), (59, 9), (60, 15), (61, 5), (62, 2), (63, 6), (64, 3), (65, 8), (66, 1), (67, 4), (68, 18), (69, 2), (70, 13), (71, 1), (72, 15), (73, 28), (74, 2), (75, 35), (76, 22), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 4), (83, 3), (84, 6), (85, 5), (86, 13), (87, 3), (88, 1), (89, 20), (90, 13), (91, 1), (92, 5), (93, 3), (94, 8), (95, 9), (96, 1), (97, 22), (98, 6), (99, 5), (100, 2), (101, 2), (102, 4), (103, 7), (104, 13), (105, 5), (106, 18), (107, 12),

VISUALIZE LDA MODEL

In [142]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [144]:
for i in ldamodel.print_topics(num_topics=2, num_words=9):
    print(i)

(0, '0.015*"go" + 0.012*"countri" + 0.011*"look" + 0.009*"thing" + 0.009*"say" + 0.008*"think" + 0.007*"peopl" + 0.007*"compani" + 0.007*"said"')
(1, '0.015*"countri" + 0.013*"go" + 0.010*"think" + 0.010*"look" + 0.009*"say" + 0.008*"said" + 0.008*"peopl" + 0.008*"know" + 0.008*"thing"')


UPDATE MODEL WITH DEBATE 2 DATA:

In [148]:
new_trump_debate_2 = text_process(trump_debate_2)

In [149]:
ldamodel.update([dictionary.doc2bow(new_trump_debate_2)])

In [150]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [151]:
for i in ldamodel.print_topics(num_topics=2, num_words=7):
    print(i)

(0, '0.019*"go" + 0.015*"peopl" + 0.012*"thing" + 0.012*"look" + 0.011*"like" + 0.010*"say" + 0.010*"countri"')
(1, '0.018*"go" + 0.018*"peopl" + 0.014*"know" + 0.014*"countri" + 0.012*"look" + 0.012*"say" + 0.011*"thing"')


UPDATE MODEL WITH DEBATE 3 DATA:

In [152]:
new_trump_debate_3 = text_process(trump_debate_3)

In [153]:
ldamodel.update([dictionary.doc2bow(new_trump_debate_3)])

In [154]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [155]:
for i in ldamodel.print_topics(num_topics=2, num_words=7):
    print(i)

(0, '0.025*"go" + 0.015*"peopl" + 0.011*"look" + 0.011*"want" + 0.010*"say" + 0.010*"countri" + 0.010*"thing"')
(1, '0.031*"go" + 0.021*"peopl" + 0.016*"countri" + 0.014*"want" + 0.013*"know" + 0.013*"say" + 0.012*"get"')
