In [None]:
# Version 1

In [None]:
#the module 'sys' allows istalling module from inside Jupyter
import sys

!{sys.executable} -m pip install numpy
import numpy as np

!{sys.executable} -m pip install pandas
import pandas as pd

#Natrual Language ToolKit (NLTK)
!{sys.executable} -m pip install nltk
import nltk

!{sys.executable} -m pip install sklearn
from sklearn import metrics
#from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import  CountVectorizer #bag-of-words vectorizer 
from sklearn.decomposition import LatentDirichletAllocation #package for LDA

# Plotting tools

from pprint import pprint
!{sys.executable} -m pip install pyLDAvis #visualizing LDA
import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt
%matplotlib inline

#define text normalization function
%run ./Text_Normalization_Function.ipynb #defining text normalization function

#ignore warnings about future changes in functions as they take too much space
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Dataset Preparation
# Read the raw review data and preview its first five rows.
# NOTE(review): the path is absolute and machine-specific; it has to be adapted to run elsewhere.
data = pd.read_csv(filepath_or_buffer='/Users/Mercer/Desktop/origin_data.csv')
data.head()

In [None]:
# Keep only the three columns used below: product id, review text and star rating.
dataslice = data.loc[:, ["asin", "reviewText", "overall"]]

In [None]:
# Positions 0..n-1, one per row of dataslice.
index_tem = range(dataslice.shape[0])

In [None]:
# Collect the review text of every row whose rating ("overall") equals 1.
# A boolean mask replaces the original row-by-row .loc[i, ...] loop: it is
# vectorized and does not depend on the frame having the default 0..n-1 index.
is_rating_one = dataslice["overall"] == 1 # rating
corpus_overall_1_tem = dataslice.loc[is_rating_one, "reviewText"].tolist()

# Keep only entries that are real strings (presumably this drops NaN for missing
# review text - confirm against the data).
corpus_overall_1 = [review for review in corpus_overall_1_tem if isinstance(review, str)]
len(corpus_overall_1)


#End of dataset preparation 

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print, for every topic of a fitted model, its highest-weighted words.

    Parameters
    ----------
    model : object exposing ``components_`` (one row of word weights per topic)
    feature_names : sequence mapping a column position to its word
    no_top_words : int, how many words to print per topic

    Prints a "Topic <n>:" header followed by the words, heaviest first,
    separated by single spaces. Returns None.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic %d:" % topic_number)
        # argsort is ascending: reverse it, then keep the leading positions.
        heaviest_first = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[position] for position in heaviest_first]
        print(" ".join(top_words))
        
def get_topic_words(vectorizer, lda_model, n_words):
    """Return the ``n_words`` highest-weighted words of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()``
        (scikit-learn >= 1.0) or the older ``get_feature_names()``
    lda_model : fitted model exposing ``components_`` (one row of word weights per topic)
    n_words : int, number of words to return per topic

    Returns
    -------
    list of lists of str, one inner list per topic, heaviest word first.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    topic_words = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes the ascending argsort list the heaviest words first.
        top_word_locs = (-topic_weights).argsort()[:n_words]
        topic_words.append(keywords.take(top_word_locs).tolist())
    return topic_words

In [None]:
# Normalize the rating-1 review texts.
# NOTE(review): normalize_corpus is not defined in this notebook; presumably it is
# provided by Text_Normalization_Function.ipynb loaded via %run - confirm.
normalized_corpus_overall_1 = normalize_corpus(corpus_overall_1) 

In [None]:
# Bag-of-words vectorizer with default settings.
bow_vectorizer = CountVectorizer()

# Learn the vocabulary from the normalized reviews and build their document-term matrix.
bow_corpus_overall_1 = bow_vectorizer.fit_transform(
    raw_documents=normalized_corpus_overall_1
)

In [None]:
# Show the document-term counts as a table with one column per vocabulary word.
# Note: this materializes the sparse matrix as a dense array.
pd.DataFrame(
    bow_corpus_overall_1.toarray(),
    columns=bow_vectorizer.get_feature_names(),
)

In [None]:
# Fit a 2-topic LDA model on the bag-of-words matrix.
# random_state is fixed so that a Restart & Run All reproduces the same topics;
# without it every fit starts from a different random initialization.
lda_corpus_overall_1 = LatentDirichletAllocation(
    n_components=2,
    max_iter=500,
    doc_topic_prior=0.9,
    topic_word_prior=0.9,
    random_state=42,
).fit(bow_corpus_overall_1)

In [None]:
# Print the 50 highest-weighted words of each topic of the 2-topic model.
no_top_words = 50
display_topics(
    model=lda_corpus_overall_1,
    feature_names=bow_vectorizer.get_feature_names(),
    no_top_words=no_top_words,
)

In [None]:
# Divide each topic's row of components_ by its row sum so every row sums to 1.
word_weights = (
    lda_corpus_overall_1.components_
    / lda_corpus_overall_1.components_.sum(axis=1, keepdims=True)
)
# One row per topic, one column per vocabulary word.
pd.DataFrame(word_weights, columns=bow_vectorizer.get_feature_names())

In [None]:
# Divide each topic's row of components_ by its row sum so every row sums to 1.
word_weights = lda_corpus_overall_1.components_ / lda_corpus_overall_1.components_.sum(axis=1)[:, np.newaxis]

# One row per vocabulary word, one "Topic_<k>" column per topic. The number of
# columns is taken from the weight matrix itself instead of a hard-coded 2, so the
# cell keeps working if the model's number of topics is changed.
word_weights_df = pd.DataFrame(word_weights.T,
                               index = bow_vectorizer.get_feature_names(),
                               columns = ["Topic_" + str(i) for i in range(word_weights.shape[0])])
word_weights_df.head(10)

In [None]:
# Ten rows with the largest Topic_0 weight, in descending order.
word_weights_df.sort_values('Topic_0', ascending=False).iloc[:10]

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive visualization for the 2-topic model
# [mds selects the method used to lay out the "distance" between topics].
pyLDAvis.sklearn.prepare(
    lda_model=lda_corpus_overall_1,
    dtm=bow_corpus_overall_1,
    vectorizer=bow_vectorizer,
    mds='tsne',
)

In [None]:
# Fit a 3-topic LDA model on the same bag-of-words matrix.
# random_state is fixed so that a Restart & Run All reproduces the same topics;
# without it every fit starts from a different random initialization.
lda_corpus_overall_1_3_topics = LatentDirichletAllocation(
    n_components=3,
    max_iter=100,
    doc_topic_prior=0.25,
    topic_word_prior=0.25,
    random_state=42,
).fit(bow_corpus_overall_1)


In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive visualization for the 3-topic model
# [mds selects the method used to lay out the "distance" between topics].
pyLDAvis.sklearn.prepare(
    lda_model=lda_corpus_overall_1_3_topics,
    dtm=bow_corpus_overall_1,
    vectorizer=bow_vectorizer,
    mds='tsne',
)

In [None]:
# Topic weights of every document under the 2-topic model (one row per document).
lda_corpus_overall_1_topic_weights = lda_corpus_overall_1.transform(
    X=bow_corpus_overall_1
)

In [None]:
#lists of document "names" and topic "names" ("names" are just indices)
doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus_overall_1))]
#the topic count is read from the weight matrix instead of a hard-coded 2, so the
#labels always match the number of columns
topic_names = ["Topic_" + str(i) for i in range(lda_corpus_overall_1_topic_weights.shape[1])]

#convert to dataframe (weights rounded to 4 decimals)
df_document_topic = pd.DataFrame(np.round(lda_corpus_overall_1_topic_weights, 4), columns=topic_names, index=doc_names)
df_document_topic.head(5)

In [None]:
#vector of indices of the column with the highest value in each row of df_document_topic.
#A 'dominant_topic' column left by an earlier run of this cell is excluded first, so
#re-running the cell does not let that label column take part in the argmax.
topic_weight_columns = df_document_topic.drop(columns='dominant_topic', errors='ignore')
dominant_topic = np.argmax(topic_weight_columns.values, axis=1)

#add dominant_topic as a column to df_document_topic
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic.head(5)

In [None]:
!{sys.executable} -m pip install gensim
import gensim

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

In [None]:
# Tokenize every normalized review.
# NOTE(review): tokenize_text is not defined in this notebook; presumably it comes
# from Text_Normalization_Function.ipynb - confirm.
corpus_overall_1_tokenized = [tokenize_text(document) for document in normalized_corpus_overall_1]

# gensim dictionary built from the tokenized reviews.
news_dictionary = Dictionary(documents=corpus_overall_1_tokenized)

# Bag-of-words representation of each tokenized review.
corpus_overall_1_bow = list(map(news_dictionary.doc2bow, corpus_overall_1_tokenized))

# Top 20 words of each topic of the 2-topic model (via get_topic_words defined above).
topic_topwords = get_topic_words(bow_vectorizer, lda_corpus_overall_1, 20)

In [None]:
# u_mass coherence of the top-word lists, evaluated on the gensim bag-of-words corpus.
cm = CoherenceModel(
    topics=topic_topwords,
    corpus=corpus_overall_1_bow,
    dictionary=news_dictionary,
    coherence='u_mass',
)
# Overall coherence, rounded to 4 decimals.
model_coherence = np.round(cm.get_coherence(), 4)
print("Coherence score for the model: ", model_coherence)

In [None]:
# Per-topic coherence values, rounded to 4 decimals.
coherence_per_topic = np.round(cm.get_coherence_per_topic(), 4)
print("Coherence score by topic (higher values are better): ", coherence_per_topic)

In [None]:
# Score of the 2-topic model on the matrix it was fitted on, as returned by score().
log_likelihood = lda_corpus_overall_1.score(bow_corpus_overall_1)
print("Log-Likelihood (higher values are better): ", log_likelihood)

In [None]:
# Perplexity of the 2-topic model on the matrix it was fitted on.
perplexity = lda_corpus_overall_1.perplexity(bow_corpus_overall_1)
print("Perplexity (lower values are better): ", perplexity)