# NLP Sentiment Analysis of Disneyland Reviews

# Library Imports

In [8]:
import tensorflow as tf
from tensorflow import keras

# tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import random

# Data Loading

In [2]:
df = pd.read_csv("DisneylandReviews.csv", encoding = 'latin-1')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [3]:
df.shape

(42656, 6)

# Train Test Split

In [4]:
# text data
X = df['Review_Text'].tolist()

# labels
y = df['Rating'].tolist()

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [34]:
len(X_train), len(X_test), len(y_train), len(y_test)

(34124, 8532, 34124, 8532)

# Preprocessing

In [35]:
# inspect first sentence
print(X_train[0])
print(y_train[0])

It is great place to visit but it's very expensive, sky hi price of ticket and if you buy lowest price ticket which is still very high you must choose between the park and studios.
5


In [36]:
# get max num of words in a review
max_words = max(len(review.split()) for review in X_train)
print(max_words)

3963


### OOV : for words that are not originally from this dataset

In [37]:
# Tokenization
tokenizer = Tokenizer(num_words = max_words, oov_token='<OOV>')

# assign index to each token
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print(word_index) # give number index for every single word appeared in this entire dataset



In [38]:
# translate text to indices (i.e. tokenize each review into numerical inputs)
sequences = tokenizer.texts_to_sequences(X_train)
print(sequences[0]) 

[11, 12, 47, 59, 4, 68, 16, 72, 43, 140, 2499, 3058, 305, 6, 212, 3, 40, 14, 297, 1, 305, 212, 55, 12, 109, 43, 411, 14, 196, 794, 397, 2, 15, 3, 357]


### Padding : pad the sequences to have the same length

In [42]:
padded = pad_sequences(sequences,
                       maxlen = max_words,
                       padding = 'post',
                       truncating = 'post')
print(padded[0])

[11 12 47 ...  0  0  0]


In [43]:
# tokenizing test set
test_seq = tokenizer.texts_to_sequences(X_test)
test_pad = pad_sequences(test_seq,
                       maxlen = max_words,
                       padding = 'post',
                       truncating = 'post')

# Model - Neural Network

# Topic Modeling

In [20]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

In [39]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors = 15,
                  n_components = 10, 
                  min_dist = 0.0,
                  metric = 'cosine')
hdbscan_model = HDBSCAN(min_cluster_size = 5,
                        min_samples = 5,
                        cluster_selection_epsilon = 0.0)

In [40]:
vectorizer = CountVectorizer(ngram_range = (1, 2), # compare 1-gram and 2-gram
                             stop_words = 'english')

example : 
- 'Hello Mom and Dad'

- 1-gram = 'Hello', 'Mom', 'and', 'Dad'
- 2-gram = 'Hello Mom', 'Mom and', 'and Dad'

In [41]:
# create instance of model
topic_model = BERTopic(
    umap_model = umap_model,
    hdbscan_model = hdbscan_model,
    embedding_model = embedding_model,
    vectorizer_model = vectorizer,
    language = 'english',
    calculate_probabilities = True,
    verbose = True
)

In [42]:
sample_size = 3000
sample_text = random.sample(X, sample_size)

In [43]:
topics, probs = topic_model.fit_transform(sample_text)

2025-08-31 13:22:09,868 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 94/94 [00:11<00:00,  8.08it/s]
2025-08-31 13:22:21,575 - BERTopic - Embedding - Completed ✓
2025-08-31 13:22:21,575 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-31 13:22:29,081 - BERTopic - Dimensionality - Completed ✓
2025-08-31 13:22:29,083 - BERTopic - Cluster - Start clustering the reduced embeddings


AttributeError: No prediction data was generated

In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,274,-1_park_disney_day_rides,"[park, disney, day, rides, time, disneyland, r...",[We spent a full day and 3 half days in the Ma...
1,0,2455,0_park_rides_disney_disneyland,"[park, rides, disney, disneyland, time, day, r...","[Being a regular Haunt at Disneyland Paris, I ..."
2,1,271,1_disneyland_hong_kong_hong kong,"[disneyland, hong, kong, hong kong, disney, pa...",[Let me preface this review by saying that I'v...


In [38]:
topic_model.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity