In [28]:
import mysql.connector
import pandas as pd
import os
from sqlalchemy import create_engine
import re
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
import yaml
import nltk
from textblob import TextBlob

In [7]:
with open('../data/db_info.yml', 'r') as file:
    data = yaml.safe_load(file)

db_host = data["DATABASE_ENDPOINT"]
db_user = data["DATABASE_USERNAME"]
db_pw = data["DATABASE_PASSWORD"]
db_name = data["DATABASE_NAME"]
db_port = data["DATABASE_PORT"]

dwh_host = data["DATAWH_ENDPOINT"]
dwh_user = data["DATAWH_USERNAME"]
dwh_pw = data["DATAWH_PASSWORD"]
dwh_name = data["DATAWH_NAME"]
dwh_port = data["DATAWH_PORT"]

db_datawarehouse = mysql.connector.connect(
	host=dwh_host,
	user=dwh_user,
	passwd=dwh_pw,
	database=dwh_name,
    auth_plugin=dwh_pw
)


In [6]:
engine = create_engine(f'mysql+pymysql://{dwh_user}:{dwh_pw}@{dwh_host}:{dwh_port}/{dwh_name}', echo=False)

dwh = engine.connect()

In [8]:
review_Sql = '''
SELECT * FROM review ORDER BY OverallID
'''
df = pd.read_sql(sql=review_Sql, con=db_datawarehouse)

df

  df = pd.read_sql(sql=review_Sql, con=db_datawarehouse)


Unnamed: 0,index,ReviewID,ReviewTitle,ReviewText,DateOfStay,AuthorContribution,Rating,OverallID,CleanReviewTitle,CleanReviewText,TextBlob_Title,TextBlob_Review
0,0,1f504b36-835,Must see in Singapore,A must not miss place for tourists to visit wh...,2024-03-01,73,5,1,must see in singapore,a must not miss place for tourists to visit wh...,0.000,0.266667
1,1,8f5e71d9-ee0,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,2024-03-01,3,5,2,i recently had the most rejuvenating spa,i recently had the most rejuvenating spa exper...,0.250,0.357143
2,2,08156b89-c28,Professional service,Visited the hotel for some drinks and what I r...,2024-03-01,2,5,3,professional service,visited the hotel for some drinks and what i r...,0.100,0.450833
3,3,c0c6d6e4-432,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,2024-03-01,1,5,4,marina bay world class,amazing hotel and loved the facilities being s...,0.000,0.650000
4,4,cf4da540-6e0,Nice touch.,While the initial check in experience was not ...,2024-03-01,2,5,5,nice touch,while the initial check in experience was not ...,0.600,0.227857
...,...,...,...,...,...,...,...,...,...,...,...,...
11227,11227,1e80010b-5d5,A good hotel,Nice location and very near to a shopping mall...,2014-08-01,97,5,11228,a good hotel,nice location and very near to a shopping mall...,0.700,0.209479
11228,11228,57720122-2c0,Swim in heaven,Everytime I check into the Sand I love going u...,2015-03-01,13,4,11229,swim in heaven,everytime i check into the sand i love going u...,0.000,0.177381
11229,11229,a536577c-ef8,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,2015-01-01,68,5,11230,unmatcheable property,i stayed here for nights in november hotel...,0.000,0.438095
11230,11230,6f876fdb-48a,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,2015-01-01,59,5,11231,beautiful experience and amazing architecture,this is a great place to visit even quickly if...,0.725,0.338333


In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/gaoheng/nltk_data...


True

In [22]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return words

In [23]:
df['ProcessedText'] = df['ReviewText'].apply(preprocess_text)
df["ProcessedText"]

0        [must, miss, place, tourist, visit, singapore,...
1        [recently, rejuvenating, spa, experience, hour...
2        [visited, hotel, drink, really, wanted, say, w...
3        [amazing, hotel, loved, facility, someone, enj...
4        [initial, check, experience, expected, nicole,...
                               ...                        
11227    [nice, location, near, shopping, mall, big, sw...
11228    [everytime, check, sand, love, going, pool, en...
11229    [stayed, night, november, hotel, real, superb,...
11230    [great, place, visit, even, quickly, hour, lay...
11231    [stayed, one, night, great, time, allowed, che...
Name: ProcessedText, Length: 11232, dtype: object

In [24]:
texts = df["ProcessedText"].to_numpy()
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [19]:
best_coherence = -1
best_lda = None
for num_topics in range(5, 26, 5):
    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=5,
                                                alpha='auto',
                                                per_word_topics=True)
    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

Number of topics: 5, Coherence Score: 0.37338312924840966
Number of topics: 10, Coherence Score: 0.36918309979364455
Number of topics: 15, Coherence Score: 0.34393120747935874
Number of topics: 20, Coherence Score: 0.3466008785233366
Number of topics: 25, Coherence Score: 0.32312251854360974
Best no of topic: 5 and Best Coherence Score: 0.37338312924840966


In [26]:
best_topic = 5
lda_model = LdaModel(corpus=corpus,
                    id2word=dictionary,
                    num_topics=best_topic,
                    random_state=42,
                    update_every=1,
                    chunksize=100,
                    passes=5,
                    alpha='auto',
                    per_word_topics=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis