## Imports

In [2]:
import pandas as pd
import numpy as np
import json
import nltk
import streamlit as st
import spacy
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.decomposition import LatentDirichletAllocation

import gensim
# Turn on pyLDAvis's notebook integration so that prepared visualizations are
# displayed inline when they are the last expression of a cell.
pyLDAvis.enable_notebook()

# local library
from preproc import *

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [3]:
# Load the raw speeches from disk. The encoding is pinned to UTF-8 (the standard
# JSON text encoding) so that decoding does not depend on the platform's locale
# default, which is what open() falls back to when no encoding is given.
with open('speeches.json', encoding='utf-8') as f:
    speeches = json.load(f)

  and should_run_async(code)


In [4]:
# Convert the loaded speeches with create_bow, a helper that is not defined in this
# notebook (presumably provided by the star import from the local `preproc` module).
# NOTE(review): the structure of `bow` is not visible from here — confirm in preproc.
bow = create_bow(speeches)

  and should_run_async(code)
2020-11-09 20:25:05.636 INFO    numexpr.utils: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-11-09 20:25:05.637 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


## Preprocess

In [5]:
# Tokenize the bag-of-words object with `tokenize` (not defined in this notebook;
# presumably from the local `preproc` module). Later cells read the columns
# `term_str`, `pos`, `id` and `speaker` from the result, so TOKEN is expected to be
# a DataFrame exposing at least those — TODO confirm against preproc.
TOKEN = tokenize(bow)

  and should_run_async(code)


In [6]:
# Build the vocabulary table: one row per distinct term in TOKEN.term_str, with its
# occurrence count in column `n`, rows sorted by term, and the resulting positional
# index exposed under the name `term_id`.
#
# The count Series and its index are named explicitly instead of relying on the
# default names produced by value_counts(): those defaults changed in pandas 2.0
# (the Series is named 'count' and the index carries the original column name),
# which would leave the count column called 'count' rather than 'n'.
VOCAB = (
    TOKEN.term_str
    .value_counts()
    .rename('n')               # name of the count column after reset_index()
    .rename_axis('term_str')   # name of the term column after reset_index()
    .sort_index()
    .reset_index()
    .rename_axis('term_id')
)

  and should_run_async(code)


In [7]:
# Attach each token's vocabulary id: look the token's term_str up in a
# term_str -> term_id Series derived from VOCAB's index.
TOKEN['term_id'] = TOKEN['term_str'].map(
    VOCAB.reset_index().set_index('term_str')['term_id']
)

  and should_run_async(code)


In [8]:
# Collapse TOKEN to one row per (id, speaker): keep only tokens whose `pos` value is
# exactly NN or NNS (presumably Penn Treebank common-noun tags), then join the
# surviving term strings with single spaces into a one-column frame named `speech`.
NOUNS = (
    TOKEN.loc[TOKEN['pos'].str.match(r'^NNS?$')]
    .groupby(['id', 'speaker'])['term_str']
    .apply(' '.join)
    .to_frame('speech')
)

  and should_run_async(code)


In [9]:
# Display NOUNS to inspect the per-(id, speaker) noun strings; pandas truncates the
# middle rows and long strings in the rendered output.
NOUNS

  and should_run_async(code)


Unnamed: 0_level_0,Unnamed: 1_level_0,speech
id,speaker,Unnamed: 2_level_1
-8KiohxTJ0Y,biden,wife sister folks things clothes count campaig...
-bHGb17kejM,trump,thank warning warning requests sir updates fra...
-oLFDfEAXa0,trump,county news conference service election day ye...
0pJnrHKoBHY,trump,music election tomorrow weve state cars gonna ...
17Xl75GwOaM,trump,thank vice president place village confidence ...
...,...,...
zIVdxsiu_r4,biden,reminder todays event distancing guideline gui...
zYz-QRYtR9U,trump,secretary carson secretary devos dr larry yarn...
zaaTZkqsaxY,trump,nothing thing closing rating everythin...
zb6x9zCKM3Q,pence,peace president juan carlos daughter carlina s...


In [10]:
# Extend scikit-learn's built-in English stop words with 'music' and 'applause'
# (presumably transcript cues rather than spoken content).
# The union of a frozenset is itself a frozenset, but CountVectorizer documents
# `stop_words` as {'english'}, a list, or None, and recent scikit-learn releases
# reject other container types during parameter validation — so materialize a list.
# sorted() is used only to make the list order deterministic.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(['music','applause']))

  and should_run_async(code)


In [11]:
# Split the noun strings into one `speech` Series per speaker by matching on the
# `speaker` level of NOUNS's index; the (id, speaker) index is kept on each result.
_speaker = NOUNS.index.get_level_values('speaker')
biden = NOUNS.loc[_speaker == 'biden', 'speech']
trump = NOUNS.loc[_speaker == 'trump', 'speech']
pence = NOUNS.loc[_speaker == 'pence', 'speech']
harris = NOUNS.loc[_speaker == 'harris', 'speech']

  and should_run_async(code)


## Scikit-Learn

In [12]:
def _make_tf_vectorizer():
    """Return a fresh, unfitted CountVectorizer with the settings shared by all speakers.

    Settings:
      * accents are stripped ('unicode') and text is lower-cased;
      * tokens are runs of at least three ASCII letters;
      * terms in `stop_words` are removed;
      * terms appearing in more than 75% of a speaker's documents (max_df) or in
        fewer than 10% of them (min_df) are dropped from the vocabulary.

    A new instance is returned on every call because each speaker's vectorizer is
    fitted separately and therefore learns its own vocabulary.
    """
    return CountVectorizer(strip_accents = 'unicode',
                           stop_words = stop_words,
                           lowercase = True,
                           token_pattern = r'\b[a-zA-Z]{3,}\b',
                           max_df = .75,
                           min_df = .1)


# One independently fitted vectorizer per speaker (previously four copy-pasted,
# identical constructor calls).
tf_vectorizer_biden = _make_tf_vectorizer()
tf_vectorizer_trump = _make_tf_vectorizer()
tf_vectorizer_pence = _make_tf_vectorizer()
tf_vectorizer_harris = _make_tf_vectorizer()

# Term-frequency document-term matrices: fit each vectorizer on its speaker's
# speeches and transform them in one step.
dtm_tf_biden = tf_vectorizer_biden.fit_transform(biden)
dtm_tf_trump = tf_vectorizer_trump.fit_transform(trump)
dtm_tf_pence = tf_vectorizer_pence.fit_transform(pence)
dtm_tf_harris = tf_vectorizer_harris.fit_transform(harris)

  and should_run_async(code)


In [13]:
# for TF DTM
def _fit_lda(dtm, n_topics=8, seed=0):
    """Fit and return a LatentDirichletAllocation model on one document-term matrix.

    Parameters
    ----------
    dtm : document-term matrix as produced by CountVectorizer.fit_transform.
    n_topics : number of topics (`n_components`); defaults to 8, the value used
        for every speaker in this notebook.
    seed : `random_state` passed to the estimator so repeated runs give the same
        topics; defaults to 0.

    Returns
    -------
    The fitted LatentDirichletAllocation estimator (fit() returns the estimator).
    """
    return LatentDirichletAllocation(n_components=n_topics, random_state=seed).fit(dtm)


# One topic model per speaker, each fitted on that speaker's own matrix
# (previously four copy-pasted construct-then-fit pairs).
lda_tf_biden = _fit_lda(dtm_tf_biden)
lda_tf_trump = _fit_lda(dtm_tf_trump)
lda_tf_pence = _fit_lda(dtm_tf_pence)
lda_tf_harris = _fit_lda(dtm_tf_harris)

# Last expression of the cell: echoes the fitted estimator's repr, as the original
# trailing .fit() call did.
lda_tf_harris

  and should_run_async(code)


LatentDirichletAllocation(n_components=8, random_state=0)

In [14]:
# Prepare the pyLDAvis visualization for Biden's topic model from the fitted LDA,
# its document-term matrix and the vectorizer that produced it; as the cell's last
# expression the result is displayed inline.
pyLDAvis.sklearn.prepare(lda_tf_biden, dtm_tf_biden, tf_vectorizer_biden)

  and should_run_async(code)


In [15]:
# Prepare the pyLDAvis visualization for Trump's topic model from the fitted LDA,
# its document-term matrix and the vectorizer that produced it; as the cell's last
# expression the result is displayed inline.
pyLDAvis.sklearn.prepare(lda_tf_trump, dtm_tf_trump, tf_vectorizer_trump)

  and should_run_async(code)


In [16]:
# Prepare the pyLDAvis visualization for Harris's topic model from the fitted LDA,
# its document-term matrix and the vectorizer that produced it; as the cell's last
# expression the result is displayed inline.
pyLDAvis.sklearn.prepare(lda_tf_harris, dtm_tf_harris, tf_vectorizer_harris)

  and should_run_async(code)


In [17]:
# Prepare the pyLDAvis visualization for Pence's topic model from the fitted LDA,
# its document-term matrix and the vectorizer that produced it; as the cell's last
# expression the result is displayed inline.
pyLDAvis.sklearn.prepare(lda_tf_pence, dtm_tf_pence, tf_vectorizer_pence)

  and should_run_async(code)


In [18]:
# Prepare Biden's visualization again, this time keeping the result, and write it
# to biden_viz.html in the current working directory.
# NOTE(review): this repeats the prepare() call of the earlier display cell with the
# same inputs; the prepared object could be computed once and reused.
biden_viz = pyLDAvis.sklearn.prepare(lda_tf_biden, dtm_tf_biden, tf_vectorizer_biden)
pyLDAvis.save_html(biden_viz, 'biden_viz.html')

  and should_run_async(code)


In [19]:
# Prepare Trump's visualization again, this time keeping the result, and write it
# to trump_viz.html in the current working directory.
# NOTE(review): this repeats the prepare() call of the earlier display cell with the
# same inputs; the prepared object could be computed once and reused.
trump_viz = pyLDAvis.sklearn.prepare(lda_tf_trump, dtm_tf_trump, tf_vectorizer_trump)
pyLDAvis.save_html(trump_viz, 'trump_viz.html')

  and should_run_async(code)


In [20]:
# Prepare Harris's visualization again, this time keeping the result, and write it
# to harris_viz.html in the current working directory.
# NOTE(review): this repeats the prepare() call of the earlier display cell with the
# same inputs; the prepared object could be computed once and reused.
harris_viz = pyLDAvis.sklearn.prepare(lda_tf_harris, dtm_tf_harris, tf_vectorizer_harris)
pyLDAvis.save_html(harris_viz, 'harris_viz.html')

  and should_run_async(code)


In [21]:
# Prepare Pence's visualization again, this time keeping the result, and write it
# to pence_viz.html in the current working directory.
# NOTE(review): this repeats the prepare() call of the earlier display cell with the
# same inputs; the prepared object could be computed once and reused.
pence_viz = pyLDAvis.sklearn.prepare(lda_tf_pence, dtm_tf_pence, tf_vectorizer_pence)
pyLDAvis.save_html(pence_viz, 'pence_viz.html')

  and should_run_async(code)
