### transform_data_tfidf_topic_modelers

* transform tokenized text to word matrix using tfidf vectorizer
* build 3 topic modelers: LSA, LDA and NMF



In [2]:
import pymongo
from pymongo import MongoClient
import datetime
import pickle
import nltk
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


%matplotlib inline
plt.style.use('seaborn')

#### generate a sample from tokenized presidential document collection

In [3]:
# Connect to the local MongoDB instance and pull every stemmed document
# out of the `presidency_docs_stem` collection.
client = MongoClient()
db = client.test_database
presidency_docs_stem = db.presidency_docs_stem
cursor = presidency_docs_stem.find({})

# Materialise the cursor once, then split the records into parallel lists
# (same position in each list refers to the same document).
records = list(cursor)
docs = [record['stem_text'] for record in records]
authors = [record['author'] for record in records]
titles = [record['title'] for record in records]
dates = [record['date'] for record in records]

In [101]:
# unique authors in the presidential document collection
# includes presidents, vice presidents, presidential candidates,
# house/senate speakers, state secretaries, first ladies, etc.

print(set(authors)) 

{'William Henry Harrison', 'Wendell Willkie', 'Michael S. Dukakis', 'Lindsey Graham', 'Abraham Lincoln', 'John Anderson', 'Robert Gibbs', 'Republican Party Platforms', 'Bernie Sanders', 'Herbert Hoover', 'Scott McClellan', 'Scott Walker', 'John Tyler', 'Walter F. Mondale', 'George Bush', 'Harry S. Truman', 'Laura Bush', 'Chris Christie', 'John Kasich', 'Andrew Johnson', 'John McCain', 'Sean Spicer', 'Donald J. Trump', 'Joe Biden', 'Franklin D. Roosevelt', 'Jack Kemp', 'Newt Gingrich', 'Bill Richardson', 'Tony Snow', 'Christopher Dodd', 'Barack Obama', 'Jay Carney', 'Franklin Pierce', 'Gerald R. Ford', 'Lincoln Chafee', 'George W. Bush', 'Sarah Huckabee Sanders', 'Herman Cain', 'Melania Trump', 'Ari Fleischer', 'Bobby Jindal', 'H. Ross Perot', 'Marco Rubio', 'U.S. Congress', 'George Washington', 'Ulysses S. Grant', 'Michelle Obama', 'Theodore Roosevelt', 'Hillary Clinton', 'Thomas Jefferson', 'James K. Polk', 'Andrew Jackson', 'George Pataki', 'Grover Cleveland', 'Barry Goldwater', 'Ric

In [4]:
# Authors from the collection that are treated as presidents in the
# per-president analysis below (order is significant: later cells index
# their results by position in this list).
presidents = [
    'Donald J. Trump',
    'Richard Nixon',
    'John Adams',
    'Lyndon B. Johnson',
    'Dwight D. Eisenhower',
    'Woodrow Wilson',
    'Abraham Lincoln',
    'John F. Kennedy',
    'George W. Bush',
    'Grover Cleveland',
    'William J. Clinton',
    'Franklin D. Roosevelt',
    'James Madison',
    'Ulysses S. Grant',
    'Barack Obama',
    'William Howard Taft',
    'Andrew Jackson',
    'George Bush',
    'Ronald Reagan',
    'Harry S. Truman',
    'Gerald R. Ford',
    'Herbert Hoover',
    'George Washington',
    'William Henry Harrison',
    'Jimmy Carter',
    'Warren G. Harding',
    'Thomas Jefferson',
    'Theodore Roosevelt',
]

In [7]:
# latest presidents: keep only the documents written by the three most
# recent presidents, building the author and document lists in parallel
latest_presidents = ['Donald J. Trump', 'Barack Obama', 'George W. Bush']
_latest_pairs = [
    (author, doc)
    for author, doc in zip(authors, docs)
    if author in latest_presidents
]
latest_presidents_authors = [author for author, _ in _latest_pairs]
latest_presidents_docs = [doc for _, doc in _latest_pairs]

In [8]:
# generate a reproducible sample of all docs
import random

sample_size = 30000
sample_seed = 42

# A dedicated, seeded generator makes the sample repeatable after a kernel
# restart (the sampled indices are pickled further down and must match the
# fitted models) without disturbing the global `random` state.
sample_rng = random.Random(sample_seed)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the collection size.
sample_docs_indices = sorted(
    sample_rng.sample(range(len(docs)), min(sample_size, len(docs)))
)
sample_docs = [docs[i] for i in sample_docs_indices]
sample_authors = [authors[i] for i in sample_docs_indices]

#### transform tokenized text to word matrix. 
#### choose TF-IDF over a count vectorizer so that earlier presidential documents get a fair weight, since modern presidents have far more documents than the earlier ones.

In [12]:

# The stored documents were cleaned, lowercased and tokenized before being
# written to the database, so none of that preprocessing is repeated here;
# this keeps the TF-IDF fit fast.
sample_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),                    # unigrams and bigrams
    max_features=5000,                     # cap on vocabulary size
    token_pattern="\\b[a-z][a-z]+\\b",     # tokens of 2+ lowercase letters
    lowercase=False,                       # input is already lowercased
    max_df=0.6,                            # drop terms in >60% of documents
)

In [13]:
# Hand the sampled documents to the vectorizer one at a time through an
# iterator, then learn the vocabulary and build the TF-IDF matrix.
gen = iter(sample_docs)
sample_tfidf_data = sample_tfidf_vectorizer.fit_transform(gen)

#### build topic modelers (NMF, LSA, LDA); save each model and the data transformed into the 200-topic space in pickle files

In [17]:
from sklearn.decomposition import NMF, TruncatedSVD

# Number of topics; the LSA and LDA cells below reuse this value.
n_comp = 200

# Pin the seed (same value the LDA model uses) so that refitting after a
# kernel restart yields the same factorisation as the pickled one.
sample_nmf_tfidf = NMF(n_components=n_comp, random_state=42)

# Rows are the sampled documents, columns are the n_comp topic weights.
sample_nmf_tfidf_data = sample_nmf_tfidf.fit_transform(sample_tfidf_data)


In [24]:
# Persist the fitted NMF model together with the transformed sample.
filename = '/home/ubuntu/proj4/data/sample_30000_nmf_200_tfidf_data_5000.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as pkl_file:
    pickle.dump((sample_nmf_tfidf, sample_nmf_tfidf_data), pkl_file)


In [65]:
# LSA via truncated SVD. The default solver is randomized, so the seed is
# pinned (same value the LDA model uses) to make the fit repeatable.
sample_lsa_tfidf = TruncatedSVD(n_components=n_comp, random_state=42)
sample_lsa_tfidf_data = sample_lsa_tfidf.fit_transform(sample_tfidf_data)

In [71]:
# Persist the fitted LSA model together with the transformed sample.
filename = '/home/ubuntu/proj4/data/sample_30000_lsa_200_tfidf_data_5000.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as pkl_file:
    pickle.dump((sample_lsa_tfidf, sample_lsa_tfidf_data), pkl_file)


In [83]:
from sklearn.decomposition import LatentDirichletAllocation

n_iter = 10

# `n_topics` was deprecated in scikit-learn 0.19 and later removed;
# `n_components` is the supported name for the number of topics.
sample_lda_tfidf = LatentDirichletAllocation(n_components=n_comp,
                                             max_iter=n_iter,
                                             random_state=42,
                                             learning_method='online')
sample_lda_tfidf_data = sample_lda_tfidf.fit_transform(sample_tfidf_data)





In [100]:
# Persist the fitted LDA model together with the transformed sample.
filename = '/home/ubuntu/proj4/data/sample_30000_lda_200_tfidf_data_5000.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as pkl_file:
    pickle.dump((sample_lda_tfidf, sample_lda_tfidf_data), pkl_file)

In [102]:
#sample_lda_tfidf.transform(sample_tfidf_data[100:102,])

In [19]:
sample_nmf_tfidf_data.size

6000000

In [20]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-feature weights.
    feature_names : sequence of str
        Feature name for each column of ``components_``.
    no_top_words : int
        How many of the top-weighted features to list per topic.
    topic_names : sequence of str, optional
        Human-readable topic labels. Topics whose label is missing or falsy
        are printed with their numeric index instead.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(", ".join(top_terms))

In [21]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use the new accessor when it exists and fall
# back to the old one on earlier releases.
_feature_names_getter = (
    getattr(sample_tfidf_vectorizer, 'get_feature_names_out', None)
    or sample_tfidf_vectorizer.get_feature_names
)
display_topics(sample_nmf_tfidf, _feature_names_getter(), 10)


Topic  0
know, tell, come, friend, time, great, know know, understand, want know, like

Topic  1
united states, states, united, president united, states government, people united, citizen united, person, states america, jurisdiction

Topic  2
assistant, assistant secretary, affairs, deputy, assistant president, deputy assistant, special assistant, special, serve, legislative

Topic  3
program, assistance, development, grant, assistance program, training, provide, increase, develop, area

Topic  4
problem, present, conference, question, make, time, power, matter, action, great

Topic  5
administration support, administration, statement administration, administration policy, statement, enactment, support, policy, policy administration, passage

Topic  6
house representatives, representatives, house, senate house, letter, copy, communicate, andrew, subject, transmit

Topic  7
funding available, federal, share basis, cost share, fema, federal funding, funding, recovery, damage, storm

Top

In [22]:
def get_topic_index(doc, vectorizer, topic_model):
    """Return the index of the strongest topic for a single document.

    The document is vectorized with ``vectorizer.transform``, projected with
    ``topic_model.transform``, and the position of the largest topic weight
    in the resulting row is returned.
    """
    doc_term_row = vectorizer.transform([doc])
    topic_weights = topic_model.transform(doc_term_row)
    return topic_weights.argmax(axis=1)[0]


#get_topic_index(docs[73159], tfidf_vectorizer, lsa_tfidf)

In [104]:


# Group the sampled documents by president. Every president gets an entry,
# even when none of their documents landed in the sample.
sample_president_docs = {name: [] for name in presidents}
for author, doc in zip(sample_authors, sample_docs):
    if author in sample_president_docs:
        sample_president_docs[author].append(doc)

In [58]:
sample_president_docs.keys()

dict_keys(['Donald J. Trump', 'Richard Nixon', 'John Adams', 'Lyndon B. Johnson', 'Dwight D. Eisenhower', 'Woodrow Wilson', 'Abraham Lincoln', 'John F. Kennedy', 'George W. Bush', 'Grover Cleveland', 'William J. Clinton', 'Franklin D. Roosevelt', 'James Madison', 'Ulysses S. Grant', 'Barack Obama', 'William Howard Taft', 'Andrew Jackson', 'George Bush', 'Ronald Reagan', 'Harry S. Truman', 'Gerald R. Ford', 'Herbert Hoover', 'George Washington', 'William Henry Harrison', 'Jimmy Carter', 'Warren G. Harding', 'Thomas Jefferson', 'Theodore Roosevelt'])

In [59]:
num_topics = 200
sample_president_topic_vecs = []

# For each president (in `presidents` order) count how many of their sampled
# documents have each NMF topic as the dominant topic.
for president in presidents:
    topic_counts = [0] * num_topics
    for doc in sample_president_docs[president]:
        topic_label = get_topic_index(doc, sample_tfidf_vectorizer, sample_nmf_tfidf)
        topic_counts[topic_label] += 1
    sample_president_topic_vecs.append(topic_counts)


In [60]:
sample_president_topic_vecs[0][:10]

[35, 0, 12, 0, 11, 0, 0, 42, 2, 0]

In [61]:
sample_president_topic_vecs[1][:10]

[88, 0, 1, 46, 9, 0, 0, 2, 1, 9]

In [64]:
# Persist the per-president dominant-topic counts.
filename = '/home/ubuntu/proj4/data/sample_30000_nmf_200_tfidf_president_topics_vecs.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as pkl_file:
    pickle.dump(sample_president_topic_vecs, pkl_file)

In [63]:
# Persist the indices (into the full document list) that make up the sample.
filename = '/home/ubuntu/proj4/data/sample_30000_nmf_200_tfidf_sample_indices_.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as pkl_file:
    pickle.dump(sample_docs_indices, pkl_file)

In [106]:
# Persist the sampled authors and documents as a pair of parallel lists.
filename = '/home/ubuntu/proj4/data/sample_30000_nmf_200_tfidf_sample_authors_docs_.pkl'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as pkl_file:
    pickle.dump((sample_authors, sample_docs), pkl_file)

In [32]:
def normalize_vec(vec):
    """Scale a vector of numbers so that its entries sum to 1.

    Parameters
    ----------
    vec : sequence of numbers
        The values to normalise. It is left unmodified.

    Returns
    -------
    list of float
        A new list with each entry divided by the sum of ``vec``. When the
        entries sum to zero (e.g. an all-zero count vector) there is nothing
        to scale by, so a list of ``0.0`` of the same length is returned
        instead of raising ``ZeroDivisionError``.
    """
    total = sum(vec)
    if total == 0:
        return [0.0] * len(vec)
    # Build a fresh list: writing the results back into `vec` would silently
    # overwrite the caller's raw counts.
    return [value / total for value in vec]

In [35]:
# Normalise every president's topic-count vector, keeping the same order.
normalized_sample_president_topic_vecs = [
    normalize_vec(topic_vec) for topic_vec in sample_president_topic_vecs
]


In [None]:
# Number of distinct normalised topic vectors. Lists are unhashable and
# cannot be placed in a set directly, so each vector is converted to a
# tuple first.
len(set(map(tuple, normalized_sample_president_topic_vecs)))