In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import pickle

In [8]:
# Load the preprocessed dataset from the local pickle file into `data`.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (presumably an earlier preprocessing step - confirm).
with open('preprocess_data.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [9]:
# Turn the comment text into a bag-of-words document-term count matrix,
# ignoring scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words="english")
doc_word = vectorizer.fit_transform(data.comment_text)

In [10]:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for
#  Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP).
#
# Project the document-term matrix onto 6 latent components. random_state is
# pinned because TruncatedSVD uses a randomized solver by default, so an
# unseeded fit is not reproducible from one run to the next.
lsa = TruncatedSVD(n_components=6, random_state=42)
doc_topic = lsa.fit_transform(doc_word)

In [None]:
# Scratch cell: a bare name as the last expression only displays the object's
# repr (here, the CountVectorizer class); it has no effect on the analysis.
CountVectorizer

In [22]:
# Inspect the document-by-component matrix produced by lsa.fit_transform
# (one row per document, one column per LSA component).
doc_topic

array([[ 0.54835222, -0.09420743, -0.19779496, -0.49278545,  0.73468204,
        -0.17121666],
       [ 0.37868436, -0.09196782, -0.1179289 , -0.04671575, -0.13856657,
        -0.08015258],
       [ 0.0746036 , -0.02692589, -0.01835017,  0.01356787, -0.01372297,
        -0.00953306],
       ...,
       [ 0.17940367, -0.00383929, -0.04304589, -0.06485946, -0.11664095,
        -0.09302545],
       [ 0.12072848,  0.00921115, -0.02272837,  0.00190658, -0.04602469,
        -0.05118958],
       [ 0.70836077, -0.21339635, -0.33750188, -0.2828425 , -0.34001704,
         0.65461358]])

In [23]:
#dir(lsa)

In [19]:
# Inspect the fitted component-by-term weight matrix
# (one row per LSA component, one column per vocabulary term).
lsa.components_

array([[ 6.60612775e-04,  1.70666869e-04,  2.77136745e-06, ...,
         1.72826514e-06,  3.63400046e-06,  1.91586500e-06],
       [-6.88591870e-05, -5.32831559e-05,  1.32182161e-06, ...,
         7.05179456e-06,  1.39604938e-05,  7.05624036e-06],
       [-2.50090174e-04, -1.87614789e-04, -2.79858270e-06, ...,
         6.15579057e-07,  1.26264871e-06,  4.73239973e-07],
       [-1.66595359e-04, -2.35460793e-05,  8.83627033e-07, ...,
         1.26704428e-07,  5.83307035e-07,  1.49094487e-08],
       [-2.76442550e-04, -9.97296788e-06, -3.78539164e-06, ...,
         2.05043582e-06,  4.90828566e-06,  1.74812723e-06],
       [-3.38234812e-04, -7.10224692e-05, -4.40770104e-06, ...,
        -1.45465775e-06, -2.22447709e-06, -1.71069640e-06]])

In [12]:
# Tabulate the LSA term weights: one row per component, one column per
# vocabulary term, rounded to 5 decimals for display.
#
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when this install has it
# and fall back to the old method otherwise.
lsa_vocab = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Derive the row labels from the fitted model so they always match the number
# of components instead of being hard-coded for exactly six.
lsa_component_labels = [
    "component_{}".format(i + 1) for i in range(lsa.components_.shape[0])
]
topic_word = pd.DataFrame(lsa.components_.round(5),
                          index=lsa_component_labels,
                          columns=lsa_vocab)
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaa,aaaaaaaaa,aaaaaaaaaa,aaaaaaaaaaa,aaaaaaaaaaaaaa,...,𝙬𝙖𝙨,𝙬𝙖𝙩𝙚𝙧𝙨,𝙬𝙚𝙧𝙚,𝙬𝙞𝙡𝙡,𝙬𝙤𝙧𝙠𝙞𝙣𝙜,𝙬𝙤𝙧𝙡𝙙,𝙬𝙧𝙤𝙣𝙜,𝙮𝙚𝙖𝙧𝙨,𝙮𝙤𝙪,𝙮𝙤𝙪𝙧
component_1,0.00066,0.00017,0.0,0.0,0.0,1e-05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-7e-05,-5e-05,0.0,-0.0,-0.0,6e-05,0.0,-0.0,-0.0,0.0,...,-0.0,-0.0,-0.0,1e-05,-0.0,-0.0,-0.0,1e-05,1e-05,1e-05
component_3,-0.00025,-0.00019,-0.0,0.0,-0.0,1e-05,-0.0,-0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
component_4,-0.00017,-2e-05,0.0,0.0,-0.0,2e-05,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0
component_5,-0.00028,-1e-05,-0.0,-0.0,-0.0,1e-05,-0.0,0.0,-0.0,0.0,...,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0
component_6,-0.00034,-7e-05,-0.0,-0.0,-0.0,1e-05,-0.0,-0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


In [13]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; iterating it must yield one weight
        vector per topic that supports ``argsort()`` (e.g. a NumPy array row).
    feature_names : sequence of str
        Term for each column position of ``model.components_``.
    no_top_words : int
        Number of top terms to print per topic.
    topic_names : sequence, optional
        Human-readable labels, indexed by topic position. A missing, empty or
        ``None`` entry (or a list shorter than the number of topics) falls
        back to the numeric "Topic  <ix>" heading.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    for ix, topic in enumerate(model.components_):
        # Guard the lookup: a label list shorter than the number of topics
        # previously raised IndexError instead of using the numeric heading.
        label = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")
        # argsort is ascending, so walk it backwards to take the largest
        # `no_top_words` weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

In [14]:
# Print the 10 highest-weighted terms of each LSA component.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when this install provides it.
lsa_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
display_topics(lsa, lsa_terms, 10)


Topic  0
people, trump, like, just, did, does, time, think, know, right

Topic  1
trump, president, obama, donald, clinton, hillary, election, did, media, news

Topic  2
people, trump, white, black, president, donald, racist, homeless, hate, guns

Topic  3
tax, state, government, oil, pay, money, trump, income, taxes, alaska

Topic  4
like, tax, trump, oil, income, state, pay, taxes, people, sounds

Topic  5
just, tax, trump, people, income, pay, taxes, oil, money, sales


## NMF

In [None]:
# Rebuild the bag-of-words document-term matrix for the NMF section
# (this rebinds `vectorizer` and `doc_word`).
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(data.comment_text)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when this install provides it.
nmf_vocab = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Preview the first 10 documents only. Slicing the sparse matrix *before*
# calling toarray() avoids materialising the whole corpus as a dense array.
# The original passed index=ex_label, but `ex_label` is not defined in any
# cell shown in this notebook (NameError on a fresh kernel), so the default
# positional index is used instead.
pd.DataFrame(doc_word[:10].toarray(), columns=nmf_vocab)

In [None]:
# Factorise the document-term matrix into 6 non-negative topics.
# random_state is pinned so initialisation and the solver give repeatable
# results across runs.
# NOTE: this rebinds `doc_topic`, replacing the LSA result computed earlier.
nmf_model = NMF(n_components=6, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)

In [None]:
# Tabulate the NMF term weights: one row per component, one column per
# vocabulary term, rounded to 3 decimals for display.
# NOTE: this rebinds `topic_word`, replacing the LSA table built earlier.
#
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when this install has it
# and fall back to the old method otherwise.
nmf_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Derive the row labels from the fitted model so they always match the number
# of components instead of being hard-coded for exactly six.
nmf_component_labels = [
    "component_{}".format(i + 1) for i in range(nmf_model.components_.shape[0])
]
topic_word = pd.DataFrame(nmf_model.components_.round(3),
                          index=nmf_component_labels,
                          columns=nmf_terms)
topic_word

In [None]:
# Print the 10 highest-weighted terms of each NMF topic.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when this install provides it.
nmf_topic_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
display_topics(nmf_model, nmf_topic_terms, 10)

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import random
# data processing, CSV file I/O (e.g. pd.read_csv)

from nltk.tokenize import TweetTokenizer,sent_tokenize, word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb
from sklearn import metrics
import os
import torch
import warnings 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, add
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks,Sequential
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam


import gensim 
from gensim.models import Word2Vec