In [1]:
from vox_articles_to_db import create_db 

from numpy.random import rand, RandomState
from numpy import array, matrix, linalg
from scipy.spatial.distance import pdist, squareform
from scipy import spatial
import pandas as pd
import numpy as np
import math

from sklearn.decomposition import TruncatedSVD, NMF

#Stemming and Lemmatizing packages
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize   
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
import Stemmer

In [2]:
english_stemmer = Stemmer.Stemmer('en')
snowball = SnowballStemmer('english')      

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

In [3]:
vox_raw_df = create_db()
vox_df = vox_raw_df[vox_raw_df[1]!='']
vox_df.shape
doc_bodies = vox_df[1]

In [200]:
def reconst_mse(target, left, right):
    return (array(target - left.dot(right))**2).mean()

def describe_nmf_results(document_term_mat, W, H, n_top_words = 20):
    
    print("Reconstruction error: %f") %(reconst_mse(document_term_mat, W, H))
    topics = []
    for topic_num, topic in enumerate(H):
        curr_topic = [feature_words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(curr_topic)
        print("Topic %d:" % topic_num)
        print(" ".join(curr_topic))   
    return topics


def calc_cosine_similarity(word_matrix, topic_vector):
    cosine_similarities = [1 - spatial.distance.cosine(article.todense(), topic_vector) for article in word_matrix]
    return cosine_similarities

def calc_average_cosine_similarity(H_matrix):
    print H_matrix.shape
    h_2 = H_matrix
    indices = [0,1,2,3,4]
    similarities = [calc_cosine_similarity_topics(h_2, primary_topic, index) for primary_topic, index in zip(H_matrix, indices)]
    averages = [reduce(lambda x, y: x + y, topic) / len(topic) for topic in similarities]
    return averages

def calc_cosine_similarity_topics(H_matrix, primary_topic, index):
    print primary_topic.shape
    results = [1 - spatial.distance.cosine(primary_topic, topic) for topic in H_matrix]
    results.pop(index)
    print results
    return results

def convert_corpus_to_matrix(doc_bodies, n_features = 5000):
    vectorizer = StemmedTfidfVectorizer(max_features=n_features, stop_words='english', analyzer='word', ngram_range=(1,1))
    document_term_mat = vectorizer.fit_transform(doc_bodies)
    return vectorizer, document_term_mat

def calculate_word_dictionary(vectorizer):
    word_dictionary  = {}
    for index, word in enumerate(vectorizer.get_feature_names()):
        word_dictionary[word] = index 
    return word_dictionary

def generate_latent_topics(subset_matrix, tolerance, n_components = 5):
    print("\n\n---------\nsklearn decomposition")
    disimilar = False
    while not disimilar:
        print "check"
        nmf = NMF(n_components,tol=tolerance)
        W_sklearn = nmf.fit_transform(subset_matrix)
        H_sklearn = nmf.components_
        averages = calc_average_cosine_similarity(H_sklearn)
        print averages
        disimilar = all(i <= .45 for i in averages)
        n_components -= 1  
    return W_sklearn, H_sklearn

def generate_latent_topics_2(subset_matrix, tolerance, n_components = 5):
    print("\n\n---------\nsklearn decomposition")
    nmf = NMF(n_components,tol=tolerance)
    W_sklearn = nmf.fit_transform(subset_matrix)
    H_sklearn = nmf.components_
    return W_sklearn, H_sklearn

#find indices of documents with consine similarity > threshold 
#                  *** threshold needs to be more intentional ***
def create_matrix_subset(raw_query, document_term_mat, threshold):
    vectorized_query = vectorizer.transform([raw_query])
    results = calc_cosine_similarity(document_term_mat, vectorized_query.todense())
    num_positive_values = len([result for result in results if result > threshold])
    positive_indices = np.argsort(np.abs(results))[-num_positive_values:-1]
    return document_term_mat[positive_indices]

def segment_and_categorize(query, current_matrix, tolerance, threshold= .05, n_components = 5):
    matrix_subset = create_matrix_subset(query,current_matrix, threshold)
    W_sklearn, H_sklearn = generate_latent_topics(matrix_subset, tolerance, n_components)
    topics = describe_nmf_results(matrix_subset, W_sklearn, H_sklearn)
    return matrix_subset, topics
#find indices of documents with consine similarity > threshold 
#                  *** threshold needs to be more intentional ***
def create_matrix_subset(raw_query, document_term_mat, threshold):
    vectorized_query = vectorizer.transform([raw_query])
    results = calc_cosine_similarity(document_term_mat, vectorized_query.todense())
    num_positive_values = len([result for result in results if result > threshold])
    positive_indices = np.argsort(np.abs(results))[-num_positive_values:-1]
    return document_term_mat[positive_indices]

def segment_and_categorize(query, current_matrix, tolerance, threshold= .05, n_components = 5):
    matrix_subset = create_matrix_subset(query,current_matrix, threshold)
    W_sklearn, H_sklearn = generate_latent_topics(matrix_subset, tolerance, n_components)
    topics = describe_nmf_results(matrix_subset, W_sklearn, H_sklearn)
    return matrix_subset, topics

In [173]:
tolerance = .01
vectorizer, document_term_mat = convert_corpus_to_matrix(doc_bodies)
feature_words = vectorizer.get_feature_names()
word_dictionary = calculate_word_dictionary(vectorizer)
W_sklearn, H_sklearn = generate_latent_topics(document_term_mat, tolerance)
describe_nmf_results(document_term_mat, W_sklearn, H_sklearn)



---------
sklearn decomposition
check
(5, 5000)
(5000,)
[0.31951504392442787, 0.24453949640072226, 0.13849699886340561, 0.39554392358508084]
(5000,)
[0.31951504392442787, 0.20511430042692536, 0.19228475435750148, 0.33757848111889877]
(5000,)
[0.24453949640072226, 0.20511430042692536, 0.15064274290686852, 0.24005581905307416]
(5000,)
[0.13849699886340561, 0.19228475435750148, 0.15064274290686852, 0.20710525938894864]
(5000,)
[0.39554392358508084, 0.33757848111889877, 0.24005581905307416, 0.20710525938894864]
[0.27452386569340914, 0.26362314495693839, 0.21008808969689757, 0.17213243887918106, 0.29507087078650063]
Reconstruction error: 0.000185
Topic 0:
season charact film episod movi like stori seri just book time tv peopl make thing women watch way men play
Topic 1:
republican obama democrat clinton immigr presid parti trump senat polit bush vote elect iran candid campaign polici isi poll hous
Topic 2:
polic black offic ferguson protest shoot brown kill depart report justic arrest rac

[[u'season',
  u'charact',
  u'film',
  u'episod',
  u'movi',
  u'like',
  u'stori',
  u'seri',
  u'just',
  u'book',
  u'time',
  u'tv',
  u'peopl',
  u'make',
  u'thing',
  u'women',
  u'watch',
  u'way',
  u'men',
  u'play'],
 [u'republican',
  u'obama',
  u'democrat',
  u'clinton',
  u'immigr',
  u'presid',
  u'parti',
  u'trump',
  u'senat',
  u'polit',
  u'bush',
  u'vote',
  u'elect',
  u'iran',
  u'candid',
  u'campaign',
  u'polici',
  u'isi',
  u'poll',
  u'hous'],
 [u'polic',
  u'black',
  u'offic',
  u'ferguson',
  u'protest',
  u'shoot',
  u'brown',
  u'kill',
  u'depart',
  u'report',
  u'justic',
  u'arrest',
  u'racial',
  u'white',
  u'crime',
  u'crimin',
  u'gun',
  u'peopl',
  u'forc',
  u'cop'],
 [u'court',
  u'marriag',
  u'sex',
  u'marijuana',
  u'state',
  u'suprem',
  u'law',
  u'legal',
  u'rule',
  u'feder',
  u'ban',
  u'decis',
  u'lgbt',
  u'drug',
  u'case',
  u'gay',
  u'discrimin',
  u'right',
  u'equal',
  u'coupl'],
 [u'health',
  u'percent',
  u'peo

In [201]:
main_query = 'China'
subset_matrix = document_term_mat

new_matrix, new_topics = segment_and_categorize(main_query, subset_matrix, tolerance)
print new_matrix.shape




---------
sklearn decomposition
check
(5, 5000)
(5000,)
[0.12422366705130061, 0.11505251886581225, 0.182773512588704, 0.39197911298165511]
(5000,)
[0.12422366705130061, 0.045055348037094989, 0.051310028623774984, 0.20542477057389452]
(5000,)
[0.11505251886581225, 0.045055348037094989, 0.22046071137272305, 0.31614009528789122]
(5000,)
[0.182773512588704, 0.051310028623774984, 0.22046071137272305, 0.30667965291524701]
(5000,)
[0.39197911298165511, 0.20542477057389452, 0.31614009528789122, 0.30667965291524701]
[0.20350720287186799, 0.10650345357151628, 0.17417716839088038, 0.19030597637511226, 0.30505590793967197]
Reconstruction error: 0.000138
Topic 0:
stock chines market china margin trade govern price economi invest borrow compani econom growth investor index percent money crash buy
Topic 1:
hong kong protest beij chines china democraci polic crackdown govern communist movement protestor 1989 pro demonstr unrest freedom student peac
Topic 2:
emiss climat carbon global coal warm energ

In [202]:
new_query = main_query + ' '.join(new_topics[0])
new_matrix, new_topics = segment_and_categorize(new_query, new_matrix, tolerance, threshold = .2)
print new_matrix.shape




---------
sklearn decomposition
check
(5, 5000)
(5000,)
[0.39324304130873733, 0.66757581885351702, 0.301153300270944, 0.43441995381396192]
(5000,)
[0.39324304130873733, 0.35209789101001854, 0.4036091244827491, 0.37967802025576669]
(5000,)
[0.66757581885351702, 0.35209789101001854, 0.3262168205112348, 0.30735850475594573]
(5000,)
[0.301153300270944, 0.4036091244827491, 0.3262168205112348, 0.24609207303287839]
(5000,)
[0.43441995381396192, 0.37967802025576669, 0.30735850475594573, 0.24609207303287839]
[0.4490980285617901, 0.38215701926431789, 0.41331225878267902, 0.31926782957445154, 0.34188713796463821]
Reconstruction error: 0.000027
Topic 0:
stock market chines price govern china economi share declin econom investor peopl invest borrow crash boom fall week compani percent
Topic 1:
china chines economi growth econom invest market consumpt polit model reform countri crash elit export communist driven problem transit realli
Topic 2:
stock margin chines trade market china borrow percent 

In [203]:
new_query += ' '.join(new_topics[2])
new_matrix, new_topics = segment_and_categorize(new_query, new_matrix, tolerance, threshold = .3)
print new_matrix.shape





---------
sklearn decomposition
check


KeyboardInterrupt: 

In [194]:
new_query += ' '.join(new_topics[4])
new_matrix, new_topics = segment_and_categorize(new_query, new_matrix, tolerance, threshold = .4)
print new_matrix.shape



---------
sklearn decomposition
check
(5, 5000)
(5000,)
[0.45853811071760586, 0.39543875899215042, 0.44131807699232173, 0.42641500163915846]
(5000,)
[0.45853811071760586, 0.52497786825997539, 0.51101730709077253, 0.40476792647590487]
(5000,)
[0.39543875899215042, 0.52497786825997539, 0.41434156057731408, 0.46751363039769744]
(5000,)
[0.44131807699232173, 0.51101730709077253, 0.41434156057731408, 0.37876651746795065]
(5000,)
[0.42641500163915846, 0.40476792647590487, 0.46751363039769744, 0.37876651746795065]
[0.43042748708530909, 0.47482530313606464, 0.45056795455678433, 0.43636086553208975, 0.41936576899517786]
Reconstruction error: 0.000029
Topic 0:
enrol healthcar gov obamacar coverag million sign year peopl open shopper plan insur administr select health marketplac launch consum applic
Topic 1:
insur obamacar health enrolle sign exchang predict coverag peopl care law premium data fix octob gov million up healthcar expans
Topic 2:
premium shopper insur plan marketplac health obamac

In [195]:
new_query += ' '.join(new_topics[2])
new_matrix, new_topics = segment_and_categorize(new_query, new_matrix, tolerance, threshold = .4)
print new_matrix.shape



---------
sklearn decomposition
check
(5, 5000)
(5000,)
[0.31351396065816428, nan, nan, nan]
(5000,)
[0.31351396065816428, nan, nan, nan]
(5000,)
[nan, nan, nan, nan]
(5000,)
[nan, nan, nan, nan]
(5000,)
[nan, nan, nan, nan]
[nan, nan, nan, nan, nan]
check
(4, 5000)
(5000,)
[0.31351396065816417, nan, nan]
(5000,)
[0.31351396065816417, nan, nan]
(5000,)
[nan, nan, nan]
(5000,)
[nan, nan, nan]
[nan, nan, nan, nan]
check
(3, 5000)
(5000,)
[0.31351396065816406, nan]
(5000,)
[0.31351396065816406, nan]
(5000,)
[nan, nan]
[nan, nan, nan]
check
(2, 5000)
(5000,)
[0.31351396065816428]
(5000,)
[0.31351396065816428]
[0.31351396065816428, 0.31351396065816428]
Reconstruction error: 0.000000
Topic 0:
cruz healthcar plan gov insur famili emerg shop coverag health care option doctor obamacar charg premium 000 network cost deduct
Topic 1:
shopper insur subsidi subsid marketplac competit averag premium health plan pay generous month obamacar healthcar feder report cost price low
(15, 5000)


In [12]:
def generate_latent_topics_2(subset_matrix, n_components = 5):
    print("\n\n---------\nsklearn decomposition")
    nmf = NMF(n_components)
    W_sklearn = nmf.fit_transform(subset_matrix)
    H_sklearn = nmf.components_
    return W_sklearn, H_sklearn

def calc_cosine_similarity_2(word_matrix, topic_vector):
    cosine_similarities = [1 - spatial.distance.cosine(article.todense(), topic_vector) for article in word_matrix]
    return cosine_similarities

#find indices of documents with consine similarity > threshold 
#                  *** threshold needs to be more intentional ***
def create_matrix_subset_2(raw_query, document_term_mat, threshold):
    results = calc_cosine_similarity(document_term_mat, raw_query.todense())
    num_positive_values = len([result for result in results if result > threshold])
    positive_indices = np.argsort(np.abs(results))[-num_positive_values:-1]
    return document_term_mat[:document_term_mat.shape[0]/2]

def segment_and_categorize_2(query, current_matrix, threshold= .05, n_components = 5):
    matrix_subset = create_matrix_subset_2(query,current_matrix, threshold)
    W_sklearn, H_sklearn = generate_latent_topics_2(matrix_subset, n_components)
    topics = describe_nmf_results(matrix_subset, W_sklearn, H_sklearn)
    return matrix_subset, topics

In [13]:
random_vector = document_term_mat[1]
half_mat, half_topics = segment_and_categorize_2(random_vector, document_term_mat)
print half_mat.shape

for x in xrange(1,20):
    print "\n\n Iteration ",x
    half_mat, half_topics = segment_and_categorize_2(random_vector, half_mat)
    print half_mat.shape



---------
sklearn decomposition
Reconstruction error: 0.000185
Topic 0:
film season charact episod movi stori like seri just tv book time peopl make thing watch way play women best
Topic 1:
percent health tax insur state peopl rate year student price obamacar incom colleg cost care job market compani pay plan
Topic 2:
ebola health outbreak diseas patient virus hospit infect africa vaccin care liberia doctor west peopl case epidem medic leon sierra
Topic 3:
polic offic black ferguson shoot protest brown justic kill arrest white depart report racial gun law death crimin crime peopl
Topic 4:
obama republican clinton democrat immigr trump presid isi parti senat polit bush iran vote state polici campaign candid elect support
(5343, 5000)


 Iteration  1


---------
sklearn decomposition
Reconstruction error: 0.000184
Topic 0:
season film episod charact movi stori like seri tv just book watch thing time make peopl play love men way
Topic 1:
obama republican immigr democrat clinton presid p

IndexError: index 2 is out of bounds for axis 1 with size 2

In [561]:
document_term_mat[0]

<1x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 70 stored elements in Compressed Sparse Row format>