In [1]:
import mysql.connector
import pandas as pd
import os
from sqlalchemy import create_engine
import re
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
from nltk import pos_tag

In [2]:
db_host = os.getenv("DATABASE_ENDPOINT")
db_user = os.getenv("DATABASE_USERNAME")
db_pw = os.getenv("DATABASE_PASSWORD")
db_name = os.getenv("DATABASE_NAME")
db_port = os.getenv("DATABASE_PORT")

dwh_host = os.getenv("DATAWH_ENDPOINT")
dwh_user = os.getenv("DATAWH_USERNAME")
dwh_pw = os.getenv("DATAWH_PASSWORD")
dwh_name = os.getenv("DATAWH_NAME")
dwh_port = os.getenv("DATAWH_PORT")

# housekeeping
db_datawarehouse = mysql.connector.connect(
	host=dwh_host,
	user=dwh_user,
	passwd=dwh_pw,
	database=dwh_name,
    auth_plugin=dwh_pw
)


In [3]:
engine = create_engine(f'mysql://{dwh_user}:{dwh_pw}@{dwh_host}:{dwh_port}/{dwh_name}', echo=False)

dwh = engine.connect()

In [4]:
review_Sql = '''
SELECT * FROM review ORDER BY OverallID
'''
df = pd.read_sql(sql=review_Sql, con=db_datawarehouse)

df

  df = pd.read_sql(sql=review_Sql, con=db_datawarehouse)


Unnamed: 0,index,ReviewID,DateOfStay,AuthorContribution,Rating,OverallID,CleanReviewTitle,CleanReviewText,WeightedTitleScore,WeightedTextScore
0,0,0bc5b29b-a26,2024-03-01,73,5,1,must see singapore,must miss place tourist visit singapore pretty...,0.333333,0.719524
1,1,11298eff-57d,2024-03-01,3,5,2,recently rejuvenating spa,recently rejuvenating spa experience hour flig...,0.333333,0.759911
2,2,a7be9e0d-05f,2024-03-01,2,5,3,professional service,visited hotel drink really wanted say wonderfu...,0.366667,0.778333
3,3,51f46163-eb7,2024-03-01,1,5,4,marina bay world class,amazing hotel loved facility someone enjoys gy...,0.333333,0.872200
4,4,84473656-36f,2024-03-01,2,5,5,nice touch,initial check experience expected nicole manag...,0.673833,0.715333
...,...,...,...,...,...,...,...,...,...,...
11227,11227,6deec101-4d4,2014-08-01,97,5,11228,good hotel,nice location near shopping mall big swimming ...,0.713467,0.594843
11228,11228,40d92b8c-f92,2015-03-01,13,4,11229,swim heaven,everytime check sand love going pool enjoy lon...,0.336867,0.587251
11229,11229,9760339b-2e0,2015-01-01,68,5,11230,unmatcheable property,stayed night hotel real superb beauty jewel si...,0.333333,0.777078
11230,11230,75c344fd-edf,2015-01-01,59,5,11231,beautiful experience amazing architecture,great place visit even quickly hour layover ai...,0.850700,0.801139


In [5]:
def preprocess_text(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    text = text.split()
    nouns = [word for (word, pos) in pos_tag(text) if is_noun(pos)]
    return nouns

In [7]:
df['ProcessedText'] = df['CleanReviewText'].apply(preprocess_text)
df["ProcessedText"]

0        [place, tourist, visit, place, walk, people, t...
1        [experience, hour, flight, jfk, massage, thera...
2        [hotel, service, staff, sazali, nicholas, indi...
3        [hotel, facility, someone, gym, chanuk, oddie,...
4        [check, experience, management, offer, solutio...
                               ...                        
11227    [location, mall, swimming, pool, service, snac...
11228    [check, sand, love, pool, enjoy, pool, sand, f...
11229    [night, hotel, beauty, jewel, singapore, rooft...
11230    [place, visit, hour, airport, shuttle, taxi, c...
11231    [night, time, room, layer, check, reception, c...
Name: ProcessedText, Length: 11232, dtype: object

In [9]:
pos_df = df[df["WeightedTextScore"] >= 0]
neg_df = df[df["WeightedTextScore"] < 0]

Positive Class

In [10]:
texts = pos_df["ProcessedText"].to_numpy()
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

best_coherence = -1
best_lda = None
for num_topics in range(1, 6):
    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=5,
                                                alpha='auto',
                                                per_word_topics=True)
    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

Number of topics: 1, Coherence Score: 0.39327521752087435
Number of topics: 2, Coherence Score: 0.45033229173573747
Number of topics: 3, Coherence Score: 0.45260813802580396
Number of topics: 4, Coherence Score: 0.37163069924626346
Number of topics: 5, Coherence Score: 0.3383550657636355
Best no of topic: 3 and Best Coherence Score: 0.45260813802580396


In [11]:
lda_model = LdaModel(corpus=corpus,
                    id2word=dictionary,
                    num_topics=best_topic,
                    random_state=42,
                    update_every=1,
                    chunksize=100,
                    passes=5,
                    alpha='auto',
                    per_word_topics=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

Negative Class

In [13]:
texts = neg_df["ProcessedText"].to_numpy()
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

best_coherence = -1
best_lda = None
for num_topics in range(1, 10):
    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=5,
                                                alpha='auto',
                                                per_word_topics=True)
    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

Number of topics: 1, Coherence Score: 0.3397387210464569
Number of topics: 2, Coherence Score: 0.3541933847437697
Number of topics: 3, Coherence Score: 0.366942586961511
Number of topics: 4, Coherence Score: 0.3445709769810961
Number of topics: 5, Coherence Score: 0.4012635139034738
Number of topics: 6, Coherence Score: 0.3884080223318213
Number of topics: 7, Coherence Score: 0.43608676601706203
Number of topics: 8, Coherence Score: 0.42578424501106005
Number of topics: 9, Coherence Score: 0.36766230203645617
Best no of topic: 7 and Best Coherence Score: 0.43608676601706203


In [14]:
lda_model = LdaModel(corpus=corpus,
                    id2word=dictionary,
                    num_topics=best_topic,
                    random_state=42,
                    update_every=1,
                    chunksize=100,
                    passes=5,
                    alpha='auto',
                    per_word_topics=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis