In [1]:
import pandas as pd

# Load the uploaded DisneylandReviews.csv file
file_path = "/Users/lianzou/Desktop/Learning Everything/Disney-itinerary/data/DisneylandReviews.csv"
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Display the first few rows and column names to understand the structure
df.head(), df.columns.tolist()

(   Review_ID  Rating Year_Month     Reviewer_Location  \
 0  670772142       4     2019-4             Australia   
 1  670682799       4     2019-5           Philippines   
 2  670623270       4     2019-4  United Arab Emirates   
 3  670607911       4     2019-4             Australia   
 4  670607296       4     2019-4        United Kingdom   
 
                                          Review_Text               Branch  
 0  If you've ever been to Disneyland anywhere you...  Disneyland_HongKong  
 1  Its been a while since d last time we visit HK...  Disneyland_HongKong  
 2  Thanks God it wasn   t too hot or too humid wh...  Disneyland_HongKong  
 3  HK Disneyland is a great compact park. Unfortu...  Disneyland_HongKong  
 4  the location is not in the city, took around 1...  Disneyland_HongKong  ,
 ['Review_ID',
  'Rating',
  'Year_Month',
  'Reviewer_Location',
  'Review_Text',
  'Branch'])

In [2]:
df_CA = df[df['Branch'] == 'Disneyland_California'].copy()

In [3]:
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords if not already available
nltk.download("stopwords")

# Define stopwords and basic text preprocessing
stop_words = set(stopwords.words("english"))

def preprocess_review(text):
    text = str(text).lower()  # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation and numbers
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return " ".join(tokens)

# Apply preprocessing
df_CA["Cleaned_Review"] = df_CA["Review_Text"].apply(preprocess_review)

# Show a sample of cleaned reviews
df_CA[["Review_Text", "Cleaned_Review"]].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lianzou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Review_Text,Cleaned_Review
9620,This place has always been and forever will be...,place always forever special feeling get enter...
9621,A great day of simple fun and thrills. Bring c...,great day simple fun thrills bring cash nothin...
9622,All and all a great day was had. The crowds ar...,great day crowds huge ride times sometimes min...
9623,Having been to the Florida location numerous t...,florida location numerous times years didnt kn...
9624,"Had the 4 day pass, spent 3 at DL and one at C...",day pass spent dl one ca great place visit bac...


In [4]:
df_CA = df_CA[df_CA['Year_Month'] != "missing"]

In [5]:
df_CA.drop(columns=['Branch'], inplace=True)

In [6]:
df_CA

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Cleaned_Review
9620,670801367,5,2019-4,United States,This place has always been and forever will be...,place always forever special feeling get enter...
9621,670760708,5,2019-4,United States,A great day of simple fun and thrills. Bring c...,great day simple fun thrills bring cash nothin...
9622,670565072,4,2019-5,Australia,All and all a great day was had. The crowds ar...,great day crowds huge ride times sometimes min...
9623,670544335,5,2019-4,United States,Having been to the Florida location numerous t...,florida location numerous times years didnt kn...
9624,670472278,5,2019-4,Canada,"Had the 4 day pass, spent 3 at DL and one at C...",day pass spent dl one ca great place visit bac...
...,...,...,...,...,...,...
28499,92494269,1,2010-12,Canada,"Myself, along with my two chidren ages 8 and 1...",along two chidren ages visited disneyland cali...
28500,92313324,4,2010-12,United States,We love Disneyland so much that we go there of...,love disneyland much go often lately service a...
28501,91799423,5,2010-10,Australia,As this was part of our international conferen...,part international conference little spoilt ex...
28502,91657810,4,2010-12,Australia,we spent one day at disneyland withmy sister ...,spent one day disneyland withmy sister enough ...


In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Function to get sentiment score
def get_sentiment(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']  # Sentiment score

# Apply sentiment analysis to the review text
df_CA['sentiment'] = df_CA['Cleaned_Review'].apply(get_sentiment)

# Check the dataframe with sentiment scores
print(df_CA[['Cleaned_Review', 'sentiment', "Rating"]].head())


                                         Cleaned_Review  sentiment  Rating
9620  place always forever special feeling get enter...     0.7845       5
9621  great day simple fun thrills bring cash nothin...     0.9595       5
9622  great day crowds huge ride times sometimes min...     0.8402       4
9623  florida location numerous times years didnt kn...     0.9624       5
9624  day pass spent dl one ca great place visit bac...     0.4939       5


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_CA['Cleaned_Review'])

# Apply KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
df_CA['cluster'] = kmeans.fit_predict(X)

# Check the clusters
print(df_CA[['Cleaned_Review', 'cluster']].head())

                                         Cleaned_Review  cluster
9620  place always forever special feeling get enter...        0
9621  great day simple fun thrills bring cash nothin...        0
9622  great day crowds huge ride times sometimes min...        3
9623  florida location numerous times years didnt kn...        2
9624  day pass spent dl one ca great place visit bac...        3


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Assume 'user_input' contains the user's preferences as a string
user_input = "I want relaxing rides with positive reviews."

# Convert user input to TF-IDF features
user_input_vector = vectorizer.transform([user_input])
X_text = vectorizer.fit_transform(df_CA['Cleaned_Review'])

# Calculate similarity between user input and all reviews in the dataset
similarity_scores = cosine_similarity(user_input_vector, X_text)

# Add similarity scores to the dataframe
df_CA['similarity_score'] = similarity_scores.flatten()

# Sort the dataframe by similarity score (highest first)
df_sorted = df_CA.sort_values(by='similarity_score', ascending=False)

# Show the top 5 reviews that are most similar to the user's input
print(df_CA[['Cleaned_Review', 'similarity_score', 'cluster']].head())


                                         Cleaned_Review  similarity_score  \
9620  place always forever special feeling get enter...          0.000000   
9621  great day simple fun thrills bring cash nothin...          0.000000   
9622  great day crowds huge ride times sometimes min...          0.050567   
9623  florida location numerous times years didnt kn...          0.020473   
9624  day pass spent dl one ca great place visit bac...          0.080083   

      cluster  
9620        0  
9621        0  
9622        3  
9623        2  
9624        3  


In [10]:
from sklearn.preprocessing import StandardScaler
X_numeric = df_CA[['sentiment', "Rating"]]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Combine the text and numeric features
from scipy.sparse import hstack
X_combined = hstack([X_text, X_scaled])

In [11]:
import numpy as np
from collections import Counter

# Get feature names (words) from the TF-IDF vectorizer
feature_names = np.array(vectorizer.get_feature_names_out())

# Function to get the top N words for each cluster
def get_top_words_for_clusters_no_duplicates(kmeans, X, top_n=500):
    top_words = {}
    all_words = set()  # To keep track of words already used across clusters
    
    for cluster_num in range(kmeans.n_clusters):
        # Get the indices of the words with the highest TF-IDF scores for this cluster
        cluster_center = kmeans.cluster_centers_[cluster_num]
        top_word_indices = cluster_center.argsort()[-top_n:][::-1]
        
        # Get the corresponding words
        top_words_in_cluster = feature_names[top_word_indices]
        
        # Remove words that have already appeared in other clusters
        unique_words = [word for word in top_words_in_cluster if word not in all_words]
        
        # Add these words to the set of all words to avoid duplication
        all_words.update(unique_words)
        
        top_words[cluster_num] = unique_words
    
    return top_words

# Get the top 10 unique words for each cluster
top_words_no_duplicates = get_top_words_for_clusters_no_duplicates(kmeans, X_combined)

# Display the top unique words for each cluster
for cluster_num, words in top_words_no_duplicates.items():
    print(f"Cluster {cluster_num}: {', '.join(words)}")

Cluster 0: disneyland, time, great, park, fun, rides, love, kids, place, day, family, visit, year, disney, food, loved, experience, like, old, amazing, lines, went, good, years, trip, really, going, people, crowded, days, best, crowds, worth, staff, times, long, fireworks, parade, wonderful, magical, little, wait, clean, enjoy, new, expensive, christmas, lots, children, enjoyed, characters, make, ride, dont, friendly, attractions, busy, say, come, halloween, night, awesome, california, lot, way, better, kid, sure, got, th, things, magic, parks, happy, recommend, ages, visited, took, young, didnt, cast, fantastic, nice, think, week, shows, season, special, early, adventure, daughter, need, adults, prices, feel, birthday, definitely, line, money, especially, spent, memories, blast, weather, helpful, pass, bad, age, members, world, service, know, parades, bit, beautiful, favorite, bring, closed, big, holiday, mickey, spend, want, child, try, visiting, thing, water, hours, theme, life, hot

In [12]:
cluster_names = {
    0: "General Disneyland Experience",
    1: "Positive Sentiment & Happiness",
    2: "Thrill and Adventure Rides",
    3: "Practical Tips and Logistics",
    4: "Hotels and Resort Experience"
}

# Map the cluster labels to their respective names
df_CA['cluster_name'] = df_CA['cluster'].map(cluster_names)

# Display the DataFrame with the new cluster names
print(df_CA[['Cleaned_Review', 'cluster', 'cluster_name']].head())


                                         Cleaned_Review  cluster  \
9620  place always forever special feeling get enter...        0   
9621  great day simple fun thrills bring cash nothin...        0   
9622  great day crowds huge ride times sometimes min...        3   
9623  florida location numerous times years didnt kn...        2   
9624  day pass spent dl one ca great place visit bac...        3   

                       cluster_name  
9620  General Disneyland Experience  
9621  General Disneyland Experience  
9622   Practical Tips and Logistics  
9623     Thrill and Adventure Rides  
9624   Practical Tips and Logistics  


In [13]:
month_map = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'
}

df_CA[['Year', 'Month']] = df['Year_Month'].str.split('-', expand=True)
df_CA['Month_Name'] = df_CA['Month'].astype(int).map(month_map)  # Convert month number to name


In [14]:
df_CA.drop(columns=["similarity_score"], inplace=True)

In [15]:
df_CA.Month = df_CA.Month.astype(int)

In [16]:
import re

def tag_preferences_from_text(df):
    keyword_map = {
        # Family Composition
        'tag_has_children': [
            r'\b(kids?|children|little ones|toddlers?|babies|youngsters?|brought the kids|family-friendly|stroller|high chair|naps?|play area|girls|boys|kiddie rides?)\b'
        ],
        'tag_has_teenagers': [
            r'\b(teenagers?|teens?|preteens?|older kids?|high school age|my teenager|brought our teen|teen-approved|teen-friendly)\b'
        ],

        # Accessibility and Special Needs
        'tag_needs_guest_services': [
            r'\b(guest services|wheelchair|rent(ed)? (a )?(scooter|ecv)|mobility (aid|device|issue|support)|assistance|accompanied by caregiver|accessibility|disabled|accessible entrance|medical device|walker|scooter accessible|injury|broken leg)\b'
        ],
        'tag_needs_accommodations': [
            r'\b(autism|asd|adhd|sensory (issues|needs|friendly)|stimulating|anxiety|panic attack|accommodation(s)?|special needs|neurodivergent|quiet space|disability pass|DAS|guest assistance|mental health|stim toy)\b'
        ],

        # Sensory/Emotional Needs
        'tag_prefers_quiet': [
            r'\b(quiet(er)?|peaceful|calm(er)?|less crowded|not crowded|not too loud|low noise|escape the noise|break from the crowds|crowd-averse|needed a break|introvert|sensory break|less stimulation)\b'
        ],

        # Visitor Origin
        'tag_international_visitor': [
            r'\b(flew in|from (abroad|overseas|europe|canada|australia|asia|uk|england|mexico|brazil|france|japan|another country)|international visitor|visiting from overseas|foreign trip|long flight|traveled internationally)\b'
        ],

        # Timing and Seasonality
        'tag_peak_season_visitor': [
            r'\b(spring break|holiday (week|season)|summer vacation|Christmas week|Halloween night|President(’s|s)? Day|Labor Day|Fourth of July|New Year(\'s)?|peak season|very crowded|long lines|school break|packed day|busy holiday)\b'
        ],
        'tag_special_event_attendee': [
            r'\b(oogie boogie bash|mickey’s not-so-scary|halloween party|star wars nite|villains nite|sweethearts nite|grad nite|throwback nite|princess nite|disney after dark|ticketed event|night event|special event|exclusive event)\b'
        ],
        'tag_early_or_late_arrival': [
            r'\b(rope drop|got there at [0-9]+(am| a\.m\.)|arrived early|arrived late|missed (fireworks|show)|stayed until closing|early access|entered at night|after dark|morning arrival|closed the park)\b'
        ],

        # Budget Sensitivity
        'tag_budget_conscious': [
            r'\b(expensive|pricey|overpriced|budget|affordable|cost too much|worth the money|not worth (it|$[0-9]+)|splurged|too much money|cheap(er)?|budget-friendly|financially prepared|high cost|save money)\b'
        ],

        # Dietary Preferences
        'tag_dietary_restrictions': [
            r'\b(gluten[- ]?free|vegan|vegetarian|nut allergy|peanut allergy|dairy[- ]?free|lactose intolerant|kosher|halal|allergy-friendly|food allergy|celiac|dietary restriction|food sensitivity|food-safe|ingredient list)\b'
        ],
        'tag_foodie_focus': [
            r'\b(foodie|snacks?|ate at|blue bayou|lamplight lounge|tiki juice bar|dole whip|beignets?|turkey leg|corn dog|mac and cheese cone|tried everything|best meal|loved the food|restaurant|dining|reservation for food|food experience|eating was a highlight|character dining|festival of holidays|food & wine|flavors of disneyland|seasonal snacks)\b'
        ],

        # Ride Preferences
        'tag_thrill_seeker': [
            r'\b(thrill ride|roller coaster|intense|fast|scary|adrenaline|drop ride|space mountain|incredicoaster|tower of terror|guardians|radiator springs racers|big thunder|expedition|loop|screamer|splash mountain|scarier|heart-pounding)\b'
        ],
        'tag_relaxed_rider': [
            r'\b(scenic ride|relaxing|slow ride|nostalgic|small world|storybook|jungle cruise|peter pan|canoes|train ride|carousel|gentle|calm|peaceful ride|good for relaxing|good break ride|classic ride|nostalgic favorite|kids\' ride)\b'
        ],

        # Visitor History
        'tag_first_time_visitor': [
            r'\b(first time|never been|bucket list|always wanted to go|finally went|first visit|first trip|brand new experience|first-timer|first-timers|rookie|newbie)\b'
        ],
        'tag_frequent_visitor': [
            r'\b(annual passholder|AP|magic key|we go every year|we always|returning|back again|multiple times|frequent guest|local visitor|season pass|regulars|recurring visitor)\b'
        ],

        # Planning Style
        'tag_planner': [
            r'\b(genie\+|lightning lane|LL strategy|planned our (day|route)|itinerary|schedule|reserving (rides|restaurants)|well-organized|mobile order|disney app|timed things|efficiency|used an app|virtual queue|planned ahead|strategized|maximizing time)\b'
        ],
        'tag_go_with_the_flow': [
            r'\b(no plan|winged it|just wandered|improvised|walked around|no schedule|took it slow|explored freely|didn’t plan|unstructured|spontaneous|meandered|let the day unfold)\b'
        ],

        # Weather and Comfort
        'tag_weather_sensitive': [
            r'\b(too hot|heat stroke|sunburn|rained|cold|freezing|hot day|warm weather|rainy day|wore poncho|bad weather|weather ruined|umbrella|sun was brutal|misting fan|needed AC|cooling off|shade break|hydrated|sweaty|humid|soaked)\b'
        ],

        # Rest/Break Needs
        'tag_needs_rest_breaks': [
            r'\b(tired|needed rest|found a bench|took a break|sat down|rested|break time|overwhelmed|needed to recharge|long day|nursing station|baby center|took it slow|relaxed mid-day|too exhausting|nap break|quiet area|out of energy|midday break|rested feet)\b'
        ]
    }

    # Apply the tags
    for tag, patterns in keyword_map.items():
        df[tag] = df['Review_Text'].str.lower().apply(
            lambda text: int(any(re.search(p, text) for p in patterns))
        )

    return df

In [17]:
df_CA_tags = tag_preferences_from_text(df_CA)

In [18]:
df_CA.to_csv("~/Downloads/DisneylandReviews_CA.tsv", sep='\t', index=False, encoding='utf-8')
df_CA_tags.to_csv("~/Downloads/DisneylandReviews_CA_tags.tsv", sep='\t', index=False, encoding='utf-8')

In [19]:
df_CA_tags.to_csv("DisneylandReviews_CA_tags_pipe.csv", index=False, sep='|', encoding='utf-8')