In [26]:
#install packages
#!python -m pip install -U nltk
#!python -m pip install -U spacy
#!python -m spacy download en_core_web_sm
#!pip install pyLDAvis==3.2.2

# for text preprocessing
import re
import spacy
import pandas as pd

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

import pyLDAvis.sklearn
import os


# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

In [27]:
nlp = spacy.load('en_core_web_sm')

In [28]:
df = pd.read_csv('cleaned_data.csv')
df.columns

Index(['Unnamed: 0', 'review_id', 'business_id', 'stars', 'text',
       'clean_text'],
      dtype='object')

In [29]:
# combining all the documents into a list:
from nltk.tokenize import word_tokenize

corpus =  df['clean_text'].apply(word_tokenize)

# Convert Text into Numerical Representation

In [36]:
# Converting text into numerical representation
n_components = 5
n_top_words = 20

tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

# Converting text into numerical representation. bagofwords
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

In [37]:
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(corpus)

# Array from Count Vectorizer 
cv_arr = cv_vectorizer.fit_transform(corpus)

In [38]:
# this is our converted text to numerical representation from the Tf-IDF vectorizer
tf_idf_arr

<351x3345 sparse matrix of type '<class 'numpy.float64'>'
	with 19396 stored elements in Compressed Sparse Row format>

In [39]:
# this is our converted text to numerical representation from the Count vectorizer
cv_arr

<351x3345 sparse matrix of type '<class 'numpy.int64'>'
	with 19396 stored elements in Compressed Sparse Row format>

In [40]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf[0:20]

['06',
 '1',
 '10',
 '100',
 '101',
 '1010',
 '1015',
 '1030',
 '10min',
 '10pm',
 '11',
 '111215',
 '12',
 '120',
 '127',
 '13',
 '130',
 '14',
 '145',
 '15']

In [41]:
# Creating vocabulary array which will represent all the corpus 
vocab_cv = cv_vectorizer.get_feature_names()

# get the vocb list
vocab_cv[0:20]

['06',
 '1',
 '10',
 '100',
 '101',
 '1010',
 '1015',
 '1030',
 '10min',
 '10pm',
 '11',
 '111215',
 '12',
 '120',
 '127',
 '13',
 '130',
 '14',
 '145',
 '15']

In [42]:
display(len(vocab_tf_idf))
display(len(vocab_cv))

3345

3345

# Implementation of LDA

In [73]:
 # Implementation of LDA: To implement LDA, pass the corpus: document-term matrix to the model. 
# We had above obtained the unique words of vocabulary using both TF-IDF and Count Vectorizer. We can continue with either as have the same unique words in both the obtained vocabularies.
    
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 10, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

In [74]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 15

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['wonder' 'pasqual' 'sweet' 'pastri' 'courtesi' 'sampler' 'person' 'top'
 'thank' 'werent' 'goto' 'cafe' 'fondu' 'tea']
Topic 2 ['wonder' 'world' 'flavorless' 'tip' 'suck' 'lasagn' 'ideal' 'fancier'
 'bravissimo' 'antipasti' 'highli' 'stuf' 'sidewalk' 'beauti']
Topic 3 ['food' 'good' 'great' 'servic' 'place' 'restaur' 'order' 'pasta'
 'italian' 'love' 'tabl' 'one' 'menu' 'nice']
Topic 4 ['wonder' 'mood' 'consist' 'potion' 'lasagna' 'voucher' 'tuesday'
 'eggplant' 'addit' 'thank' 'owner' 'mom' 'buffet' 'quantiti']
Topic 5 ['spectacular' 'salti' 'usual' 'solid' 'lightli' 'sister' 'chill' 'knew'
 'fettuccin' 'justifi' 'add' 'alfredo' 'call' 'fix']
Topic 6 ['closest' 'itali' 'tea' 'joe' 'ambienc' 'name' 'rodent' 'gather' 'call'
 'wouldnt' 'piec' 'graduat' 'five' 'forget']
Topic 7 ['wonder' 'appl' 'lasagna' 'beauti' 'addit' 'believ' 'creat' 'pesto'
 'whole' 'ownership' 'quickli' 'treasur' 'person' 'old']
Topic 8 ['conveni' 'call' 'popular' 'tea' 'ten' 'stuf' 'ad' 'establish' 'yogurt

# Measure Performance

In [45]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(tf_idf_arr))

Log Likelihood:  -20335.209594866752


In [47]:
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)

print("Perplexity: ", lda_model.perplexity(tf_idf_arr))

print(lda_model.get_params())




Perplexity:  8006.357309421285
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'batch', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 20, 'mean_change_tol': 0.001, 'n_components': 5, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': 20, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


# Find the best topic model and its parameters

In [53]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(tf_idf_arr)

GridSearchCV(cv=None, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 tota

In [68]:

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(tf_idf_arr))


Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -7167.163540588184
Model Perplexity:  25076.317889019807


# Visualizing the data 

In [75]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, tf_idf_arr, tfidf_vectorizer, mds='tsne',R = 20)
panel

# Finding the dominant topic for each doc and labelling docs

In [61]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(tf_idf_arr)  
print(doc_topic)


[[0.02821179 0.02821075 0.8871556  0.02821098 0.02821088]
 [0.01746981 0.0174713  0.93010308 0.0174697  0.01748611]
 [0.03081084 0.03081068 0.87675683 0.03081109 0.03081055]
 ...
 [0.02003747 0.0200374  0.91985023 0.02003768 0.02003723]
 [0.01637581 0.01637551 0.93449719 0.0163763  0.01637519]
 [0.02813372 0.0281336  0.88746529 0.0281339  0.0281335 ]]


In [64]:

# iterating over every value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    #print ("Document", n+1, " -- Topic:" ,topic_doc)
    
df['topic'] = topic_doc

In [65]:
topic_dict = {0:'Service', 1:'Food',2:'Ambience',
             3:'Wait time',4:'Food',5:'Food',6:'Food',7:'Food'}

df = df.replace({'topic':topic_dict})

In [None]:
df.head(10)

In [66]:
df.to_csv("sklearn_labelled_data.csv")