In [9]:
import pandas as pd
import numpy as np
import json

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.linalg import norm

# local library
from preproc import *

In [2]:
# Stop-word set used by the tokenizer: NLTK's English list plus two bracketed transcript markers.
sw = set(stopwords.words('english')) | {"[applause]", "[music]"}

In [3]:
# Load the raw speech records and turn them into the `bow` table.
# `create_bow` is not defined in this notebook (presumably it comes from the local `preproc` module).
# JSON is UTF-8 by specification, so name the encoding explicitly instead of
# relying on the platform's default text encoding.
with open('speeches.json', encoding='utf-8') as f:
    speeches = json.load(f)
bow = create_bow(speeches)

In [33]:
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.snowball import SnowballStemmer
def my_tokenizer(document):
    """Yield cleaned tokens from ``document``, split on whitespace.

    Each token is lower-cased, stripped of surrounding whitespace, then of
    leading/trailing ``_`` and ``*`` characters, and has every double quote
    removed. Tokens that end up empty, consist only of punctuation, or are
    members of the module-level stop-word set ``sw`` are skipped.

    NOTE: punctuation attached to a word is kept (e.g. ``"again."``), so the
    same word with and without trailing punctuation yields different tokens.

    Parameters
    ----------
    document : str
        Raw text of one document.

    Yields
    ------
    str
        The next surviving token, in document order.
    """
    # Build the punctuation lookup once per document rather than once per character.
    punctuation = set(string.punctuation)

    for token in WhitespaceTokenizer().tokenize(document):
        token = token.lower().strip().strip('_').strip('*').replace('"', '')

        # An empty token is trivially a subset, so it is skipped here as well.
        if punctuation.issuperset(token):
            continue

        if token in sw:
            continue

        yield token

In [65]:
# Only the post identifier and the comment text are read from each comment dump.
comment_cols = ['post_id', 'comment']
fr_comments = pd.read_csv('fr_comments.csv', usecols=comment_cols)
mr_comments = pd.read_csv('mr_comments2.csv', usecols=comment_cols)
ml_comments = pd.read_csv('ml_comments2.csv', usecols=comment_cols)
fl_comments = pd.read_csv('fl_comments2.csv', usecols=comment_cols)

In [66]:
def process_comments(df):
    """Build one unit-length trigram TF-IDF vector for a group of comments.

    All comments that share a ``post_id`` are joined into a single document.
    Trigram counts per document are converted to TF-IDF weights, the weights
    are averaged over documents, and the averaged vector is scaled to unit
    L2 length.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``post_id`` and ``comment``.

    Returns
    -------
    pandas.Series
        Indexed by trigram; values are the normalized mean TF-IDF weights.
        If the mean vector has zero length it is returned unscaled.
    """
    # pandas reads an empty comment as NaN (a float), which str.join rejects.
    comments = df.dropna(subset=['comment'])
    docs = comments.groupby('post_id')['comment'].agg(
        lambda texts: ' '.join(map(str, texts))
    )

    # Document-term count matrix over trigrams (one document per post).
    vectorizer = CountVectorizer(
        min_df=2,             # a trigram must occur in at least 2 posts
        max_df=0.9,           # ...and in no more than 90% of posts
        stop_words=stopwords.words('english'),
        tokenizer=my_tokenizer,
        ngram_range=(3, 3),   # trigrams only
    )
    counts = vectorizer.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = vectorizer.get_feature_names()

    # Convert counts into TF-IDF weights.
    tfidf = TfidfTransformer(smooth_idf=False).fit_transform(counts)

    # Average over posts directly on the sparse matrix; a dense
    # posts-by-trigrams frame is not needed for a column mean.
    mean_weights = np.asarray(tfidf.mean(axis=0)).ravel()

    # Normalize the group vector to unit length (skip if it is all zeros,
    # which would otherwise produce NaNs).
    length = norm(mean_weights)
    if length > 0:
        mean_weights = mean_weights / length

    tfidf_group = pd.Series(mean_weights, index=terms)
    return tfidf_group


In [67]:
# One normalized trigram TF-IDF vector per comment group, in the order fr, mr, ml, fl.
fr_tfidf, mr_tfidf, ml_tfidf, fl_tfidf = (
    process_comments(comments)
    for comments in (fr_comments, mr_comments, ml_comments, fl_comments)
)

In [69]:
# Rank the ml group's trigrams from highest to lowest normalized mean TF-IDF weight.
ml_tfidf.sort_values(ascending=False)

[removed] [removed] [removed]                                         0.287505
please [contact moderators                                            0.148574
[contact moderators subreddit](/message/compose/?to=/r/democrats)     0.148574
moderators subreddit](/message/compose/?to=/r/democrats) questions    0.148574
bot, action performed                                                 0.148574
                                                                        ...   
middle class well,                                                    0.006755
no, i'm saying                                                        0.006735
ever going happen.                                                    0.006289
law says allowed                                                      0.006161
feel like would                                                       0.005795
Length: 1529, dtype: float64

In [70]:
# Vocabulary size (number of trigrams kept) for each comment group: fr, mr, ml, fl.
for group_vector in (fr_tfidf, mr_tfidf, ml_tfidf, fl_tfidf):
    print(group_vector.shape)

(5606,)
(1779,)
(1529,)
(1225,)


In [71]:
# (rows, columns) of each raw comment table: fr, mr, ml, fl.
for comment_table in (fr_comments, mr_comments, ml_comments, fl_comments):
    print(comment_table.shape)

(25522, 2)
(19222, 2)
(19236, 2)
(15566, 2)


In [None]:
# look at trigrams from speeches

In [100]:
# Trigram TF-IDF profile per speaker, built from the speech texts in `bow`.

# Document-term count matrix (one document per speech).
vectorizer = CountVectorizer(
    min_df=3,             # a trigram must occur in at least 3 speeches
    max_df=0.9,           # ...and in no more than 90% of speeches
    stop_words=stopwords.words('english'),
    tokenizer=my_tokenizer,
    ngram_range=(3, 3),   # trigrams only
)
counts = vectorizer.fit_transform(bow['speech'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

counts_df = pd.DataFrame(counts.toarray(), columns=terms)
counts_df.index.name = 'speech'

# Convert counts into TF-IDF weights.
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)

# Rows of `tfidf` are in the same order as the rows of `bow`, so the video id
# and speaker can be attached positionally instead of looked up row by row.
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=terms)
tfidf_df['videoId'] = bow.index.values
tfidf_df['_speaker'] = bow['speaker'].values
tfidf_df = tfidf_df.set_index(['videoId', '_speaker'])

# Collapse to one mean TF-IDF vector per speaker.
tfidf_speaker = tfidf_df.groupby(level='_speaker').mean()
# Scale each speaker's vector to unit L2 length.
tfidf_speaker = tfidf_speaker.apply(lambda row: row / norm(row), axis=1)
# Transpose so terms are rows and speakers are columns (easier filtering).
# rename_axis returns a new index, so tfidf_df's column index is not renamed.
tfidf_speaker = tfidf_speaker.T.rename_axis('term')

In [101]:
# Attach each comment group's trigram vector as a column of `tfidf_speaker`,
# aligned on term. Only terms already present in the speech vocabulary are
# kept (left join); terms a group never used get weight 0.
group_columns = [
    ('fr_tfidf', fr_tfidf),
    ('mr_tfidf', mr_tfidf),
    ('ml_tfidf', ml_tfidf),
    ('fl_tfidf', fl_tfidf),
]

# Drop columns left over from an earlier run so that re-executing this cell
# does not produce suffixed duplicates (`fr_tfidf_x`, `fr_tfidf_y`, ...).
tfidf_speaker = tfidf_speaker.drop(
    columns=[column for column, _ in group_columns], errors='ignore'
)

for column, weights in group_columns:
    tfidf_speaker = tfidf_speaker.join(weights.rename(column), how='left')

tfidf_speaker = tfidf_speaker.fillna(value=0)

In [136]:
# Show the 20 speech trigrams with the highest weight in the ml comment group,
# alongside their weights for every speaker and the other comment groups.
tfidf_speaker.sort_values('ml_tfidf', ascending=False).head(20)

Unnamed: 0_level_0,biden,harris,pence,trump,fr_tfidf,mr_tfidf,ml_tfidf,fl_tfidf
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
president united states,0.114247,0.100319,0.038685,0.053622,0.017452,0.010878,0.058226,0.023538
make america great,0.009783,0.006666,0.023989,0.090591,0.028879,0.024063,0.05679,0.0
social security medicare,0.01335,0.021886,0.0,0.008777,0.0,0.0,0.046712,0.0
supreme court justice,0.0,0.038663,0.0,0.019644,0.002016,0.0,0.043449,0.0
america great again.,0.016191,0.0,0.025334,0.002533,0.011739,0.0,0.035821,0.0
every single one,0.018445,0.0,0.003029,0.00842,0.009939,0.010084,0.033504,0.041587
quid pro quo,0.0,0.0,0.0,0.027212,0.004538,0.0,0.032137,0.0
every single day,0.041058,0.0,0.047543,0.018536,0.006958,0.026228,0.031832,0.0
black lives matter,0.023359,0.165029,0.0,0.054068,0.054169,0.171205,0.031238,0.026953
two years ago.,0.00861,0.0,0.008435,0.002307,0.0,0.0,0.029061,0.0


In [123]:
# Split the combined table into its speaker columns and its comment-group columns.
speaker_cols = ['biden', 'harris', 'pence', 'trump']
group_cols = ['fr_tfidf', 'mr_tfidf', 'ml_tfidf', 'fl_tfidf']
speakers = tfidf_speaker[speaker_cols]
ideo = tfidf_speaker[group_cols]

In [131]:
# Dot product of every comment-group vector with every speaker vector over the
# shared term index (rows: comment groups, columns: speakers).
# NOTE(review): the group vectors were unit-normalized over their own
# vocabularies before the left merge restricted them to speech trigrams, so
# these values are presumably not exact cosine similarities; confirm intent.
ideo.T.dot(speakers)

Unnamed: 0,biden,harris,pence,trump
fr_tfidf,0.014402,0.023,0.008926,0.021024
mr_tfidf,0.031793,0.048131,0.010873,0.03708
ml_tfidf,0.028751,0.029785,0.026056,0.030849
fl_tfidf,0.015091,0.012691,0.004009,0.010886
