In [6]:
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer
import gensim 
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
# NOTE(review): machine-specific absolute path -- this cell only runs where
# that exact file exists; consider a configurable data directory.
CLEANED_REVIEWS_CSV = '/Users/kelvinfoo/Desktop/AI Masters/TripAdvisor NLP/Data/cleaned_reviews.csv'

# Load the reviews table and preview its first rows.
df = pd.read_csv(CLEANED_REVIEWS_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,Review,Rating,cleaned_reviews
0,0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...
1,1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...
2,2,nice rooms not 4* experience hotel monaco seat...,3,nice room not experi hotel monaco seattl good ...
3,3,"unique, great stay, wonderful time hotel monac...",5,uniqu great stay wonder time hotel monaco loca...
4,4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...


In [3]:
# Separate positive reviews (rating of 4 or more) from negative reviews
# (rating of 2 or less); rows with any other rating end up in neither frame.
df_positive = df.loc[df['Rating'].ge(4)]
df_negative = df.loc[df['Rating'].le(2)]

### **Topic Modelling with LDA**

In [9]:
def compute_topic_modelling_metrics(dictionary, corpus, texts, start, end, step): 
    """Fit one LDA model per candidate topic count and report its metrics.

    For every ``num_topics`` in ``range(start, end, step)`` (``end`` itself is
    excluded) an ``LdaModel`` is trained with ``random_state = 42`` and
    ``passes = 10``. Each model is scored with ``c_v`` coherence and with
    ``log_perplexity`` evaluated on the same ``corpus`` it was trained on.
    One summary line is printed per model.

    Args:
        dictionary: passed to ``LdaModel`` as ``id2word`` and to
            ``CoherenceModel`` as ``dictionary``.
        corpus: passed to ``LdaModel`` as ``corpus`` and to ``log_perplexity``.
        texts: passed to ``CoherenceModel`` as ``texts``.
        start: first topic count to try.
        end: stop value for the topic counts (exclusive).
        step: increment between topic counts.

    Returns:
        list[dict]: one record per topic count, in evaluation order, with the
        keys ``'num_topics'``, ``'coherence'`` and ``'perplexity'``. The list
        is empty when the range is empty. In a notebook, assign the result
        (or end the call with ``;``) if the echoed list is not wanted.
    """
    results = []
    for num_topics in range(start, end, step): 
        model = LdaModel(corpus = corpus, id2word = dictionary, num_topics = num_topics, random_state = 42, passes = 10)

        # Coherence score 
        coherence_model = CoherenceModel(model = model, texts = texts, dictionary = dictionary, coherence = 'c_v')
        coherence = coherence_model.get_coherence()

        # Perplexity score 
        # NOTE: gensim documents log_perplexity as a per-word likelihood bound
        # (log scale), not the perplexity itself; the printed label is kept
        # so existing output stays comparable.
        perplexity = model.log_perplexity(corpus)

        print(f"{num_topics} topics: Coherence score - {coherence} and Perplexity score - {perplexity}")

        # Keep the numbers so callers can tabulate or plot them instead of
        # re-reading the printed text.
        results.append({
            'num_topics': num_topics,
            'coherence': coherence,
            'perplexity': perplexity,
        })

    return results

#### **Topic Modelling for Positive Reviews**

In [10]:
# Split every positive review into whitespace-separated tokens, then build the
# gensim dictionary and the bag-of-words corpus from those token lists.
pos_texts = [review.split() for review in df_positive['cleaned_reviews']]
pos_dictionary = corpora.Dictionary(pos_texts)
pos_corpus = list(map(pos_dictionary.doc2bow, pos_texts))

# Score candidate topic counts 3 through 7 (the range end is exclusive).
compute_topic_modelling_metrics(
    dictionary=pos_dictionary,
    corpus=pos_corpus,
    texts=pos_texts,
    start=3,
    end=8,
    step=1,
)

3 topics: Coherence score - 0.34409222247314536 and Perplexity score - -7.368722172602582
4 topics: Coherence score - 0.35144529008082703 and Perplexity score - -7.373263824580469
5 topics: Coherence score - 0.37443402627450795 and Perplexity score - -7.394977451556303
6 topics: Coherence score - 0.3757686300027718 and Perplexity score - -7.4339700275062395
7 topics: Coherence score - 0.41631480666759 and Perplexity score - -7.491479519979808


In [13]:
# Fit a 3-topic LDA model on the positive-review corpus (fixed seed, 10 passes)
# and render it with pyLDAvis.
model = LdaModel(
    corpus=pos_corpus,
    id2word=pos_dictionary,
    num_topics=3,
    passes=10,
    random_state=42,
)
lda_vis = gensimvis.prepare(model, pos_corpus, pos_dictionary)
pyLDAvis.display(lda_vis)

In [14]:
def print_unique_tokens_per_topic(lda_model, topn=10):
    """Print, for each topic, the top tokens that no other topic shares.

    A token counts as unique to a topic when it is among that topic's ``topn``
    words (as returned by ``lda_model.show_topic``) and is not among the
    ``topn`` words of any other topic.

    Tokens are printed in the order ``show_topic`` returns them rather than in
    set-iteration order, so the output is identical from run to run for the
    same model. The ``{...}`` / ``None`` formatting is unchanged.

    Args:
        lda_model: object exposing ``num_topics`` and
            ``show_topic(topic_id, topn=...)`` yielding ``(word, weight)`` pairs.
        topn: how many top words per topic to consider.

    Returns:
        None. One line per topic is written to standard output, with topics
        numbered from 1.
    """
    # Top-N words per topic, de-duplicated while keeping show_topic's order.
    top_words = [
        list(dict.fromkeys(word for word, _ in lda_model.show_topic(topic_id, topn=topn)))
        for topic_id in range(lda_model.num_topics)
    ]

    # How many topics list each word; a single pass replaces rebuilding the
    # union of all other topics once per topic.
    topic_counts = {}
    for words in top_words:
        for word in words:
            topic_counts[word] = topic_counts.get(word, 0) + 1

    for topic_id, words in enumerate(top_words):
        unique_words = [word for word in words if topic_counts[word] == 1]
        if unique_words:
            # Mimic the set literal look of the previous output, but in a stable order.
            shown = "{" + ", ".join(repr(word) for word in unique_words) + "}"
        else:
            shown = 'None'
        print(f"Topic {topic_id + 1}: Unique Tokens: {shown}")


In [15]:
# Tokens found in the top 15 of exactly one topic of the positive-review model.
print_unique_tokens_per_topic(lda_model=model, topn=15)

Topic 1: Unique Tokens: {'peopl', 'like', 'beach', 'day', 'food', 'resort', 'restaur', 'no', 'time', 'pool'}
Topic 2: Unique Tokens: {'nice', 'bed', 'breakfast', 'night', 'walk'}
Topic 3: Unique Tokens: {'excel', 'place', 'recommend', 'help', 'friendli', 'love', 'wonder', 'servic'}


#### **Topic Modelling for Negative Reviews**

In [16]:
# Split every negative review into whitespace-separated tokens, then build the
# gensim dictionary and the bag-of-words corpus from those token lists.
neg_texts = [review.split() for review in df_negative['cleaned_reviews']]
neg_dictionary = corpora.Dictionary(neg_texts)
neg_corpus = list(map(neg_dictionary.doc2bow, neg_texts))

# Score candidate topic counts 3 through 7 (the range end is exclusive).
compute_topic_modelling_metrics(
    dictionary=neg_dictionary,
    corpus=neg_corpus,
    texts=neg_texts,
    start=3,
    end=8,
    step=1,
)

3 topics: Coherence score - 0.2994341887986626 and Perplexity score - -7.415914341287699
4 topics: Coherence score - 0.3106671701083762 and Perplexity score - -7.412289163013068
5 topics: Coherence score - 0.36497168931839163 and Perplexity score - -7.416993739326708
6 topics: Coherence score - 0.3260072555414762 and Perplexity score - -7.420869275433381
7 topics: Coherence score - 0.3189298965416612 and Perplexity score - -7.431582343939095


In [19]:
# Fit a 3-topic LDA model on the negative-review corpus (fixed seed, 10 passes)
# and render it with pyLDAvis. This rebinds `model`, replacing the earlier fit.
model = LdaModel(
    corpus=neg_corpus,
    id2word=neg_dictionary,
    num_topics=3,
    passes=10,
    random_state=42,
)
lda_vis = gensimvis.prepare(model, neg_corpus, neg_dictionary)
pyLDAvis.display(lda_vis)

In [20]:
# Tokens found in the top 15 of exactly one topic of the negative-review model.
print_unique_tokens_per_topic(lda_model=model, topn=15)

Topic 1: Unique Tokens: {'beach', 'food', 'restaur', 'resort', 'pool'}
Topic 2: Unique Tokens: {'locat', 'price', 'star', 'place', 'bathroom', 'small'}
Topic 3: Unique Tokens: {'check', 'staff', 'book'}
