# Unsupervised Topic Modeling for Disneyland Reviews

# Library Imports

In [1]:
# basic imports
import pandas as pd

# preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Data Loading

In [2]:
df = pd.read_csv("DisneylandReviews.csv", encoding = 'latin-1')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [3]:
# since unsupervised, only look at the text
reviews = df['Review_Text'].tolist()

# Preprocessing

In [4]:
# Download required NLTK data once
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # for WordNet lemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisfu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/chrisfu/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/chrisfu/nltk_data...


True

In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [6]:
def preprocess(text):
    text = str(text).lower() # lowercase
    text = re.sub(r'[^a-z\s]', '', text) # remove punctuation
    tokens = text.split()

    # lemmatize word (i.e. convert to root form)
    # exclude stop words
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(tokens)

    return " ".join(tokens)

# preprocess text
reviews_cleaned = [preprocess(review) for review in reviews]


# Topic Model - LDA

In [10]:
# topic model - LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim import corpora
import pyLDAvis.lda_model
import pyLDAvis

In [11]:
# convert words to bag-of-words (i.e. raw counts)
vectorizer = CountVectorizer(max_features = 5000)
X = vectorizer.fit_transform(reviews_cleaned)

In [12]:
# apply LDA model
lda = LatentDirichletAllocation(n_components = 10,
                                random_state = 42)
lda.fit(X)

0,1,2
,n_components,10
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


## Inspect Topics

In [13]:
def print_topics(model, vectorizer, top_n=10):
    words = vectorizer.get_feature_names_out()

    # model.components_ = matrix of topic-word counts
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        print([words[i] for i in topic.argsort()[-top_n:][::-1]]) # get most representative words

print_topics(lda, vectorizer)

Topic 0:
['get', 'ride', 'day', 'time', 'pas', 'fast', 'park', 'line', 'go', 'disneyland']
Topic 1:
['hotel', 'queue', 'disney', 'food', 'park', 'get', 'euro', 'would', 'time', 'child']
Topic 2:
['park', 'disney', 'food', 'day', 'disneyland', 'ticket', 'good', 'one', 'train', 'attraction']
Topic 3:
['day', 'time', 'disneyland', 'ride', 'kid', 'year', 'great', 'place', 'visit', 'park']
Topic 4:
['ride', 'park', 'people', 'time', 'line', 'day', 'get', 'disney', 'one', 'hour']
Topic 5:
['park', 'show', 'ride', 'main', 'street', 'time', 'star', 'parade', 'disney', 'also']
Topic 6:
['ride', 'park', 'day', 'queue', 'character', 'time', 'disney', 'parade', 'good', 'great']
Topic 7:
['disney', 'park', 'disneyland', 'world', 'hong', 'kong', 'one', 'florida', 'visit', 'much']
Topic 8:
['place', 'disneyland', 'love', 'disney', 'christmas', 'time', 'always', 'like', 'magical', 'character']
Topic 9:
['ride', 'mountain', 'park', 'space', 'disney', 'disneyland', 'time', 'pirate', 'day', 'world']


## Visualization

In [17]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.lda_model.prepare(lda, X, vectorizer, mds='tsne')
panel

# Topic Model - BERTopic

In [None]:
from bertopic import BERTopic

# Topic Model - Top2Vec

In [None]:
from top2vec import Top2Vec