<a href="https://colab.research.google.com/github/kelvinfoo123/Natural-Language-Processing/blob/main/Topic_Modelling_on_Trip_Advisor_Reviews_(LDA_and_Gensim).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 

import re 
import gensim 
import spacy 
from gensim.models import CoherenceModel 
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

import nltk
from nltk.corpus import stopwords 

In [5]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 
                   'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                   'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 
                   'right', 'line', 'even', 'also', 'may', 'take', 'come'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
df = pd.read_csv("review.csv")
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [9]:
print("Number of reviews: ", df.shape[0])

Number of reviews:  20491


In [10]:
# Just consider 5000 reviews due to time constraints 
df = df[0:5000]

## **Tokenize and clean sentences**

In [11]:
def tokenize(sentence): 
  for sent in sentence: 
    sent = re.sub('\S*@\S*\s?', '', sent) # remove emails 
    sent = re.sub('\s+', '', sent) # remove newline chars 
    sent = re.sub("\'", '', sent) # remove single quotes 
    sent = gensim.utils.simple_preprocess(str(sent), deacc = True) # Convert into a list of tokens 
    yield(sent)

In [16]:
data = df.Review.values.tolist()
data_words = list(tokenize(data))
print(data_words[:3]) # Tokens for first 3 reviews 

[['checkquickeasy', 'parkingnight'], ['gotkidding', 'hours', 'bedscomfortable', 'notgoodac', 'heatcontrol', 'thisnot'], ['niceroomsnot', 'level', 'missed', 'nightgot', 'gotroomnightno', 'stdropdesk', 'askedwakeup', 'calledlater']]


## **Lemmatize and build bigram, trigram models**

- min_count ignore all words and bigrams with total collected count lower than this.
- threshold represents a threshold for forming the phrases (higher means fewer phrases). A phrase of words a and b is accepted if (cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold, where N is the total vocabulary size.

In [18]:
bigram = gensim.models.Phrases(data_words, min_count = 5, threshold = 100)
trigram = gensim.models.Phrases(bigram[data_words], threshold = 100)



In [21]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [28]:
nlp = spacy.load('en_core_web_sm')

def process(text, stop_words = stop_words, allowed_posttags = ['NOUN', 'ADJ', 'VERB', 'ADV']): 
  text = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text]
  text = [bigram_mod[doc] for doc in text]
  text = [trigram_mod[bigram_mod[doc]] for doc in text]

  text_out = []
  for sent in text: 
    doc = nlp(" ".join(sent))
    text_out.append([token.lemma_ for token in doc if token.pos_ in allowed_posttags])

  text_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text_out]
  return text_out 

In [29]:
data_ready = process(data_words)

## **Latent Dirichlet Allocation**

In [30]:
# Create dictionary 
dictionary = corpora.Dictionary(data_ready)

In [32]:
# Create term document frequency 
corpus = [dictionary.doc2bow(text) for text in data_ready]

In [33]:
# Build LDA model 
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [34]:
from pprint import pprint 
pprint(lda_model.print_topics())

[(0,
  '0.024*"star" + 0.019*"room" + 0.010*"starhotel" + 0.008*"spend" + '
  '0.008*"lovedplace" + 0.008*"min" + 0.008*"time" + 0.006*"highlyrecommend" + '
  '0.005*"morning" + 0.005*"travel"'),
 (1,
  '0.013*"hotel" + 0.012*"love" + 0.011*"rd" + 0.009*"com" + '
  '0.009*"stafffriendly" + 0.009*"check" + 0.007*"defintelystay" + '
  '0.006*"nightstay" + 0.005*"excellent" + 0.005*"fun"'),
 (2,
  '0.016*"euro" + 0.008*"minutewalkaway" + 0.007*"enjoy" + 0.007*"pay" + '
  '0.007*"euros" + 0.007*"arrive" + 0.006*"nicehotel" + 0.006*"breakfast" + '
  '0.005*"roomservice" + 0.005*"hour"'),
 (3,
  '0.043*"night" + 0.029*"stay" + 0.013*"perfectlocation" + 0.012*"minute" + '
  '0.012*"day" + 0.009*"wonderful" + 0.009*"fabulous" + 0.009*"great" + '
  '0.009*"pm" + 0.008*"definitelystay"')]


## **Visualization of topic**

In [39]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis 
pyLDAvis.enable_notebook()

In [40]:
vis = gensimvis.prepare(lda_model, corpus, dictionary = lda_model.id2word)
vis

  default_term_info = default_term_info.sort_values(
