# ML Final Project
**Rebecca Driever, Michael Chen, Rayna Ji**

In [97]:
import pandas as pd
import re
from datetime import datetime
import string
import random
import numpy as np
import time
from scipy.stats import entropy
import seaborn as sns
sns.set_style("darkgrid")

import pymongo
from pymongo import MongoClient
import json
from bson.code import Code
from bs4 import BeautifulSoup as bs 

# NLP
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
# distances
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
# PCA and plot
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import matplotlib.pyplot as pPlot
from wordcloud import WordCloud, STOPWORDS
# import dependencies

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict

## Inspect data in MongoDB

In [3]:
client = MongoClient('localhost', 27017) # connect to MongoDB
db = client['fec'] # connect to the database
db.list_collection_names()

['artical_metadata', 'artical_main_content', 'delegates', 'fec Filings']

In [3]:
# connect to artical_main_content collection
coll_main = db['artical_main_content']

main = pd.DataFrame(list(coll_main.find()))
main.head()

Unnamed: 0,_id,article_index,main_content
0,5e6d9dcde410a40d1c62ff02,10,Before the first votes are cast in the Democra...
1,5e6d9dcde410a40d1c62ff03,11,"DES MOINES — Once upon a time, the winner of t..."
2,5e6d9dcde410a40d1c62ff04,12,"SIOUX CITY, Iowa — After a long campaign of id..."
3,5e6d9dcde410a40d1c62ff05,13,"CENTERVILLE, Iowa — Jill Biden was torn. Three..."
4,5e6d9dcde410a40d1c62ff06,22,"DES MOINES — For a full year, Democrats owned ..."


In [4]:
# connect to artical_metadata collection
coll_metadata = db['artical_metadata']

metadata = pd.DataFrame(list(coll_metadata.find()))
metadata.head()

Unnamed: 0,_id,abstract,article_index,byline,document_type,headline,keywords,news_desk,print_page,print_section,pub_date,section_name,slideshow_credits,snippet,source,subsection_name,type_of_material,uri,web_url,word_count
0,5e6da02ab0e10f7cb59126af,The Vermont senator has made exaggerated claim...,10,"{'original': 'By Linda Qiu', 'person': [{'firs...",article,{'main': 'Fact-Checking Bernie Sanders Before ...,"[{'name': 'persons', 'value': 'Sanders, Bernar...",Washington,18,A,2020-02-01T16:00:55+0000,U.S.,,The Vermont senator has made exaggerated claim...,The New York Times,Politics,News,nyt://article/3cc0d96d-5174-57f1-b6bc-723368db...,https://www.nytimes.com/2020/02/01/us/politics...,1849
1,5e6da02ab0e10f7cb59126b0,"As if the caucusing wasn’t confusing enough, o...",11,{'original': 'By Maggie Astor and Matt Stevens...,article,{'main': 'How Will the Winner of the Iowa Cauc...,"[{'name': 'subject', 'value': 'Presidential El...",Politics,17,A,2020-02-01T10:00:21+0000,U.S.,,"As if the caucusing wasn’t confusing enough, o...",The New York Times,Politics,News,nyt://article/ed9f6baf-a5b5-5bd1-a24e-d24e79e3...,https://www.nytimes.com/2020/02/01/us/politics...,1112
2,5e6da02ab0e10f7cb59126b1,With the Iowa caucuses set to kick off the pri...,12,{'original': 'By Jonathan Martin and Alexander...,article,{'main': 'Democratic Candidates Focus on All-C...,"[{'name': 'organizations', 'value': 'Democrati...",Politics,1,A,2020-02-01T17:00:09+0000,U.S.,,With the Iowa caucuses set to kick off the pri...,The New York Times,Politics,News,nyt://article/98ef5f2a-10d6-5a01-bd35-f675b58e...,https://www.nytimes.com/2020/02/01/us/politics...,1782
3,5e6da02ab0e10f7cb59126b2,She is a forceful surrogate for Joe Biden’s ca...,13,"{'original': 'By Katie Glueck and Steve Eder',...",article,{'main': 'Why Jill Biden Is Taking Time Off to...,"[{'name': 'persons', 'value': 'Biden, Joseph R...",Politics,16,A,2020-02-01T10:00:21+0000,U.S.,,She is a forceful surrogate for Joe Biden’s ca...,The New York Times,Politics,News,nyt://article/64ec07df-12ed-512e-8611-2ebed78d...,https://www.nytimes.com/2020/02/01/us/politics...,2065
4,5e6da02ab0e10f7cb59126b3,Iowa voted strongly for Barack Obama twice and...,22,{'original': 'By Trip Gabriel and Jeremy W. Pe...,article,{'main': 'Can Democrats Beat Trump in Iowa in ...,"[{'name': 'subject', 'value': 'Presidential El...",Politics,18,A,2020-02-02T17:30:08+0000,U.S.,,Iowa voted strongly for Barack Obama twice and...,The New York Times,Politics,News,nyt://article/aad55278-31e9-5ba0-8b6c-cca34be7...,https://www.nytimes.com/2020/02/02/us/politics...,1703


In [4]:
# connect to delegates collection
coll_del = db['delegates']

delegates = pd.DataFrame(list(coll_del.find()))
delegates.head()

Unnamed: 0,_id,biden_delegates,biden_win,sanders_delegates,sanders_win,state
0,5e6db6ebb0e10f7cb59129f6,6,0,12,0,Iowa
1,5e6db6ebb0e10f7cb59129f7,0,0,9,1,New Hampshire
2,5e6db6ebb0e10f7cb59129f8,9,0,24,1,Nevada
3,5e6db6ebb0e10f7cb59129f9,39,1,15,0,South Carolina
4,5e6db6ebb0e10f7cb59129fa,44,1,8,0,Alabama


In [5]:
# connect to fec Filings collection
coll_fec = db['fec Filings']

fec = pd.DataFrame(list(coll_fec.find()))
fec

Unnamed: 0,_id,cand_id,cand_nm,contb_receipt_amt,contb_receipt_dt,contbr_city,contbr_employer,contbr_nm,contbr_occupation,contbr_st,contbr_zip,election_tp,file_num,form_tp,memo_cd,memo_text,receipt_desc,tran_id
0,5e6e86ebb0e10f7cb5912a10,P60007168,"Sanders, Bernard",7.50,25-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16262901
1,5e6e86ebb0e10f7cb5912a11,P60007168,"Sanders, Bernard",2.70,24-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16282750
2,5e6e86ebb0e10f7cb5912a12,P60007168,"Sanders, Bernard",2.70,25-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16321511
3,5e6e86ebb0e10f7cb5912a13,P60007168,"Sanders, Bernard",2.70,24-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16328758
4,5e6e86ebb0e10f7cb5912a14,P60007168,"Sanders, Bernard",2.70,24-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16336803
5,5e6e86ebb0e10f7cb5912a15,P60007168,"Sanders, Bernard",2.70,26-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16350642
6,5e6e86ebb0e10f7cb5912a16,P60007168,"Sanders, Bernard",2.70,27-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16404128
7,5e6e86ebb0e10f7cb5912a17,P60007168,"Sanders, Bernard",2.70,27-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16407779
8,5e6e86ebb0e10f7cb5912a18,P60007168,"Sanders, Bernard",2.70,28-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16473894
9,5e6e86ebb0e10f7cb5912a19,P60007168,"Sanders, Bernard",2.70,28-JAN-20,DPO,UNITED STATES,"LEWIS, ANTHONY",MILITARY,AA,340110068.0,P2020,1384482,SA17A,,* EARMARKED CONTRIBUTION: SEE BELOW,,16475196


## Regression on delegates won

In [109]:
st_del = delegates["state"].value_counts()
len(st_del)

26

In [107]:
st = fec["contbr_st"].value_counts()
len(st)

60

In [6]:
# create the state abbreation dictionary dataframe
states_dict = {"AL":"Alabama","AK":"Alaska","AZ":"Arizona","AR":"Arkansas","CA":"California","CO":"Colorado","CT":"Connecticut","DE":"Delaware","FL":"Florida","GA":"Georgia","HI":"Hawaii","ID":"Idaho","IL":"Illinois","IN":"Indiana","IA":"Iowa","KS":"Kansas","KY":"Kentucky","LA":"Louisiana","ME":"Maine","MD":"Maryland","MA":"Massachusetts","MI":"Michigan","MN":"Minnesota","MS":"Mississippi","MO":"Missouri","MT":"Montana","NE":"Nebraska","NV":"Nevada","NH":"New Hampshire","NJ":"New Jersey","NM":"New Mexico","NY":"New York","NC":"North Carolina","ND":"North Dakota","OH":"Ohio","OK":"Oklahoma","OR":"Oregon","PA":"Pennsylvania","RI":"Rhode Island","SC":"South Carolina","SD":"South Dakota","TN":"Tennessee","TX":"Texas","UT":"Utah","VT":"Vermont","VA":"Virginia","WA":"Washington","WV":"West Virginia","WI":"Wisconsin","WY":"Wyoming"}
state_items = states_dict.items()
state_list = list(state_items)

state_df = pd.DataFrame(state_list,columns=['st_abb','state'])

In [55]:
# merge 
delegates_st = pd.merge(delegates,state_df,on="state")

# get the unique 26 states occuring in the delegates df.
states_26 = list(delegates_st["st_abb"].unique())

# filter out the funding file based on these 26 states
fec_26 = fec[fec.contbr_st.isin(states_26)] 

In [43]:
# get the state sum contribution amount
fec_biden_gb = fec_biden[['contbr_st', 'contb_receipt_amt']].groupby('contbr_st', as_index=False).sum()
fec_sanders_gb = fec_sanders[['contbr_st', 'contb_receipt_amt']].groupby('contbr_st', as_index=False).sum()

In [41]:
# a = fec_biden.groupby('contbr_nm')['contbr_st'].unique()
# fec_biden_gb_nm = pd.DataFrame.from_records(a.values.tolist()).stack().value_counts() # a series
# fec_biden_gb_nm.to_frame()

In [48]:
# get the state unique contributor numbers
fec_biden_gb_nm = fec_biden.groupby('contbr_st')['contbr_nm'].nunique()
fec_sanders_gb_nm = fec_sanders.groupby('contbr_st')['contbr_nm'].nunique()

fec_biden_gb_nm = fec_biden_gb_nm.reset_index()
fec_sanders_gb_nm = fec_sanders_gb_nm.reset_index()

In [52]:
# get the independent features ready
x_biden = pd.merge(fec_biden_gb_nm,fec_biden_gb,on='contbr_st')
x_sanders = pd.merge(fec_sanders_gb_nm,fec_sanders_gb,on='contbr_st')

In [53]:
x_biden

Unnamed: 0,contbr_st,contbr_nm,contb_receipt_amt
0,AL,356,201308.37
1,AR,159,72624.75
2,CA,9219,7654039.55
3,CO,1068,701015.58
4,IA,397,181933.93
5,ID,258,190933.5
6,ME,199,70132.8
7,MI,1048,475071.01
8,MN,488,204336.67
9,MO,532,385733.77


In [54]:
x_sanders

Unnamed: 0,contbr_st,contbr_nm,contb_receipt_amt
0,AL,521,158053.7
1,AR,430,131595.6
2,CA,28021,10847990.0
3,CO,3617,1155217.0
4,IA,1011,301911.5
5,ID,502,148652.6
6,ME,1040,321016.5
7,MI,3456,1092488.0
8,MN,2496,788358.0
9,MO,1607,484136.5


In [58]:
# get dependent variable ready
y_sanders = delegates_st[["sanders_win","st_abb"]]
y_biden = delegates_st[["biden_win","st_abb"]]

# rename for merging dataframes
y_sanders.rename(columns={'st_abb': 'contbr_st'})
y_biden.rename(columns={'st_abb': 'contbr_st'})

Unnamed: 0,biden_win,contbr_st
0,0,IA
1,0,NH
2,0,NV
3,1,SC
4,1,AL
5,1,AR
6,0,CA
7,0,CO
8,1,ME
9,1,MN


In [60]:
# merge the x and y
biden = pd.merge(y_biden,x_biden,on="contbr_st")
sanders = pd.merge(y_sanders,x_sanders,on="contbr_st")

In [61]:
biden

Unnamed: 0,biden_win,contbr_st,contbr_nm,contb_receipt_amt
0,0,IA,397,181933.93
1,0,NH,373,218623.72
2,0,NV,694,794405.86
3,1,SC,674,521078.64
4,1,AL,356,201308.37
5,1,AR,159,72624.75
6,0,CA,9219,7654039.55
7,0,CO,1068,701015.58
8,1,ME,199,70132.8
9,1,MN,488,204336.67


**Biden**

In [98]:
# split train and test data

# randomly assign true or false to each row so that we have 70% true.
random.seed(444)
pts = np.random.rand(len(biden)) < 0.7

biden_train = biden[pts]
biden_test = biden[~pts]

biden_train_x = biden_train.copy()
biden_train_x = biden_train_x.drop('biden_win', 1)
biden_train_x = biden_train_x.drop('contbr_st', 1)
biden_train_y = biden_train['biden_win']

In [99]:
# train logistic regression model
logreg = LogisticRegression()
logreg.fit(biden_train_x, biden_train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [100]:
# use test data to predict
biden_test_x = biden_test.copy()
biden_test_x = biden_test_x.drop('biden_win', 1)
biden_test_x = biden_test_x.drop('contbr_st', 1)
biden_test_y = biden_test['biden_win']

biden_pred = logreg.predict(biden_test_x)

In [101]:
# get accuracy
print('Accuracy : {:.3f}'.format(logreg.score(biden_test_x, biden_test_y)))

Accuracy : 0.571


**Sanders**

In [103]:
# split train and test data

# randomly assign true or false to each row so that we have 70% true.
random.seed(44)
pts = np.random.rand(len(sanders)) < 0.7

sanders_train = sanders[pts]
sanders_test = sanders[~pts]

sanders_train_x = sanders_train.copy()
sanders_train_x = sanders_train_x.drop('sanders_win', 1)
sanders_train_x = sanders_train_x.drop('contbr_st', 1)
sanders_train_y = sanders_train['sanders_win']

# train logistic regression model
logreg = LogisticRegression()
logreg.fit(sanders_train_x, sanders_train_y)

# use test data to predict
sanders_test_x = sanders_test.copy()
sanders_test_x = sanders_test_x.drop('sanders_win', 1)
sanders_test_x = sanders_test_x.drop('contbr_st', 1)
sanders_test_y = sanders_test['sanders_win']

sanders_pred = logreg.predict(sanders_test_x)

# get accuracy
print('Accuracy : {:.3f}'.format(logreg.score(sanders_test_x, sanders_test_y)))


Accuracy : 0.778


**Calculate the x variables based on diff of Biden and sanders**

In [73]:
whole = pd.merge(biden,sanders,on="contbr_st")

In [75]:
whole["contrb_nm_diff"] = whole["contbr_nm_x"] - whole["contbr_nm_y"] 
whole["contrb_amt_diff"] = whole["contb_receipt_amt_x"] - whole["contb_receipt_amt_y"] 

In [76]:
biden_diff = whole[["biden_win","contrb_nm_diff","contrb_amt_diff"]]

In [79]:
biden_diff

Unnamed: 0,biden_win,contrb_nm_diff,contrb_amt_diff
0,0,-614,-119977.5
1,0,-743,-142089.4
2,0,-465,426763.4
3,1,-137,277034.4
4,1,-165,43254.71
5,1,-271,-58970.84
6,0,-18802,-3193953.0
7,0,-2549,-454201.0
8,1,-841,-250883.7
9,1,-2008,-584021.4


In [105]:
# split train and test data

# randomly assign true or false to each row so that we have 70% true.
random.seed(4444)
pts = np.random.rand(len(biden_diff)) < 0.7

biden_diff_train = biden_diff[pts]
biden_diff_test = biden_diff[~pts]

biden_diff_train_x = biden_diff_train.copy()
biden_diff_train_x = biden_diff_train_x.drop('biden_win', 1)
biden_diff_train_y = biden_diff_train['biden_win']

# train logistic regression model
logreg = LogisticRegression()
logreg.fit(biden_diff_train_x, biden_diff_train_y)

# use test data to predict
biden_diff_test_x = biden_diff_test.copy()
biden_diff_test_x = biden_diff_test_x.drop('biden_win', 1)
biden_diff_test_y = biden_diff_test['biden_win']

biden_diff_pred = logreg.predict(biden_diff_test_x)

# get accuracy
print('Accuracy : {:.3f}'.format(logreg.score(biden_diff_test_x, biden_diff_test_y)))

Accuracy : 0.667


In [95]:
# CV
biden_diff_x = biden_diff.drop('biden_win', 1)
biden_diff_y = biden_diff['biden_win']

cv_predictions = cross_val_predict(logreg, biden_diff_x, biden_diff_y, cv=9)
cv_r2 = r2_score(biden_diff_y,cv_predictions)
print(cv_r2)

-0.2777777777777779


## Conjoint the main and metadata based on artical index

In [11]:
main_reduced = main[["main_content","article_index"]]

In [12]:
meta_reduced = metadata[["abstract","article_index","headline","keywords","pub_date"]]

In [22]:
# in case the valueerror: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat
main_reduced.iloc[:]['article_index'] = main_reduced['article_index'].astype(int)
meta_reduced.iloc[:]['article_index'] = meta_reduced['article_index'].astype(int)

articles = pd.merge(main_reduced, meta_reduced, on='article_index')

In [23]:
articles.head()

Unnamed: 0,main_content,article_index,abstract,headline,keywords,pub_date
0,Before the first votes are cast in the Democra...,10,The Vermont senator has made exaggerated claim...,{'main': 'Fact-Checking Bernie Sanders Before ...,"[{'name': 'persons', 'value': 'Sanders, Bernar...",2020-02-01T16:00:55+0000
1,Before the first votes are cast in the Democra...,10,"Joe Biden is “very much alive,” but the race i...",{'main': '5 Takeaways From the South Carolina ...,"[{'name': 'subject', 'value': 'Presidential El...",2020-03-01T10:00:10+0000
2,Before the first votes are cast in the Democra...,10,The Vermont senator has made exaggerated claim...,{'main': 'Fact-Checking Bernie Sanders Before ...,"[{'name': 'persons', 'value': 'Sanders, Bernar...",2020-02-01T16:00:55+0000
3,Before the first votes are cast in the Democra...,10,"Joe Biden is “very much alive,” but the race i...",{'main': '5 Takeaways From the South Carolina ...,"[{'name': 'subject', 'value': 'Presidential El...",2020-03-01T10:00:10+0000
4,"There was no drama this time, and no delay: Wi...",10,The Vermont senator has made exaggerated claim...,{'main': 'Fact-Checking Bernie Sanders Before ...,"[{'name': 'persons', 'value': 'Sanders, Bernar...",2020-02-01T16:00:55+0000


## Clean Data

In [65]:
# tokenize
articles["main_tokenized"] = articles["main_content"].apply(lambda x: word_tokenize(x))
articles["abstract_tokenized"] = articles["abstract"].apply(lambda x: word_tokenize(x))

In [82]:
# remove punctuations
articles["main_tokenized"] = articles["main_tokenized"].apply(lambda x: " ".join([word for word in x if word not in string.punctuation]))
articles["abstract_tokenized"] = articles["abstract_tokenized"].apply(lambda x: " ".join([word for word in x if word not in string.punctuation]))

In [83]:
# load stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

# remove stopwords
articles["main_tokenized"] = articles["main_tokenized"].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))
articles["abstract_tokenized"] = articles["abstract_tokenized"].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))

[nltk_data] Downloading package stopwords to /Users/rayna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
# replace negations with not
articles["main_tokenized"] = articles["main_tokenized"].apply(lambda x: ' '.join(["not" if item.endswith("n't") else item for item in x.split()]))
articles["abstract_tokenized"] = articles["abstract_tokenized"].apply(lambda x: ' '.join(["not" if item.endswith("n't") else item for item in x.split()]))

In [85]:
# Attach all 'not's to the subsequent word.
def neg_concat(lst):
    e=""
    for item in re.split('(not )',lst):
        if item !='not ':
            e=e+item 
        else: e=e+"not-"
    return e

articles["main_tokenized"] = articles["main_tokenized"].apply(neg_concat)
articles["abstract_tokenized"] = articles["abstract_tokenized"].apply(neg_concat)

In [86]:
# stem
ps = PorterStemmer() 

# stem words
articles["main_tokenized"] = articles["main_tokenized"].apply(lambda x: ' '.join([ps.stem(item) for item in x.split()]))
articles["abstract_tokenized"] = articles["abstract_tokenized"].apply(lambda x: ' '.join([ps.stem(item) for item in x.split()]))

In [87]:
# tokenize again
articles["main_tokenized"] = articles["main_tokenized"].apply(lambda x: word_tokenize(x))
articles["abstract_tokenized"] = articles["abstract_tokenized"].apply(lambda x: word_tokenize(x))

## Wordcloud

In [42]:
abstract = str(articles["abstract_tokenized"])

cloud = WordCloud(background_color = "white", max_words = 200, stopwords = set(STOPWORDS))
cloud.generate(abstract)
cloud.to_file("wordCloud_tokenized.png")

<wordcloud.wordcloud.WordCloud at 0x12b1494a8>

In [43]:
abstract = str(articles["abstract"])

cloud = WordCloud(background_color = "white", max_words = 200, stopwords = set(STOPWORDS))
cloud.generate(abstract)
cloud.to_file("wordCloud.png")

<wordcloud.wordcloud.WordCloud at 0x1258c3518>

## Bad of word


In [56]:
# concat all words in all reviews
words_abstract = articles["abstract_tokenized"].str.cat(sep=' ')
words_main = articles["main_tokenized"].str.cat(sep=' ')
# words is a string

In [57]:
# frequency count
word_freq_abstract = nltk.FreqDist(words_abstract.split())
word_freq_main = nltk.FreqDist(words_main.split())

In [58]:
# most frequent 500 words
most_freq_abstract = dict(word_freq_abstract.most_common(500))
most_freq_main = dict(word_freq_main.most_common(500))

In [59]:
# bag-of-word vector
vectorizer_abstract = CountVectorizer(most_freq_abstract, max_features = 500)
BoW_vector_abstract = vectorizer_abstract.fit_transform(articles["abstract_tokenized"]).toarray()

vectorizer_main = CountVectorizer(most_freq_main, max_features = 500)
BoW_vector_main = vectorizer_main.fit_transform(articles["main_tokenized"]).toarray()

In [61]:
BoW_vector_main.size
# 500*len (number of reviews)

832500

# LDA model

In [89]:
def train_lda(data,column):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    We also do 2 passes of the data since this is a small dataset, so we want the distributions to stabilize
    """
    num_topics = 10
    chunksize = 300
    dictionary = corpora.Dictionary(data[column])
    corpus = [dictionary.doc2bow(doc) for doc in data[column]]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=2)
    t2 = time.time()
    print("Time to train LDA model on ", len(articles), "articles: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [92]:
# LDA modeling on abstract
dictionary,corpus,lda = train_lda(articles,"abstract_tokenized")

Time to train LDA model on  1665 articles:  0.013164166609446208 min


In [93]:
# LDA modeling on main_content
dictionary_main,corpus_main,lda_main = train_lda(articles,"main_tokenized")

Time to train LDA model on  1665 articles:  0.07948104937871298 min


In [94]:
# for abstract
# show_topics method shows the the top num_words contributing to num_topics number of random topics
lda.show_topics(num_topics=10, num_words=20)

[(0,
  '0.143*"’" + 0.049*"here" + 0.046*"need" + 0.042*"end" + 0.042*"know" + 0.037*"day" + 0.035*"trump" + 0.035*"attack" + 0.034*"like" + 0.029*"presid" + 0.024*"possibl" + 0.024*"speaker" + 0.021*"american" + 0.021*"So" + 0.019*"elizabeth" + 0.018*"system" + 0.018*"thi" + 0.016*"the" + 0.014*"morn" + 0.014*"tip"'),
 (1,
  '0.084*"democrat" + 0.072*"’" + 0.063*"biden" + 0.063*"win" + 0.056*"joe" + 0.052*"2020" + 0.048*"tuesday" + 0.047*"voter" + 0.047*"race" + 0.047*"may" + 0.036*"A" + 0.034*"view" + 0.034*"presidenti" + 0.027*"far" + 0.026*"much" + 0.022*"candid" + 0.021*"support" + 0.020*"finish" + 0.014*"how" + 0.014*"while"'),
 (2,
  '0.059*"bloomberg" + 0.057*"’" + 0.057*"democrat" + 0.046*"the" + 0.035*"and" + 0.034*"super" + 0.034*"michael" + 0.034*"parti" + 0.029*"challeng" + 0.028*"tuesday" + 0.027*"take" + 0.027*"candid" + 0.023*"major" + 0.021*"want" + 0.020*"social" + 0.019*"outbreak" + 0.018*"—" + 0.017*"moder" + 0.016*"plan" + 0.016*"ad"'),
 (3,
  '0.103*"mani" + 0.060

In [96]:
# for abstract
# select and article at random from train_df
random_article_index = np.random.randint(len(articles))
bow = dictionary.doc2bow(articles.iloc[random_article_index,7])

# get the topic contributions for the document chosen at random above
doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=bow)])

# print the top 5 contributing topics and their words
for i in doc_distribution.argsort()[-5:][::-1]:
    print(i, lda.show_topic(topicid=i, topn=10), "\n")

1 [('democrat', 0.08362304), ('’', 0.07182661), ('biden', 0.06315168), ('win', 0.06313797), ('joe', 0.056095697), ('2020', 0.051645026), ('tuesday', 0.047896482), ('voter', 0.047116693), ('race', 0.04686646), ('may', 0.046717227)] 

9 [('south', 0.06736529), ('carolina', 0.06535404), ('presid', 0.062474508), ('’', 0.039399035), ('primari', 0.036983643), ('trump', 0.035761513), ('week', 0.03350238), ('democrat', 0.03282517), ('face', 0.031010348), ('the', 0.02820315)] 

8 [('need', 0.10178944), ('know', 0.07860957), ('’', 0.068862416), ('here', 0.06874848), ('administr', 0.04975968), ('first', 0.036713727), ("'s", 0.03587094), ('A', 0.033693418), ('pete', 0.032548673), ('coronaviru', 0.032338865)] 

7 [('”', 0.16018157), ('“', 0.16014685), ('call', 0.04254393), ('’', 0.041449428), ('presid', 0.037621513), ('campaign', 0.031342376), ('respons', 0.025049286), ('.', 0.022664975), ('fund', 0.02092896), ('trump', 0.019389745)] 

6 [('candid', 0.07070525), ('could', 0.068363994), ('nation', 0