Manipulation of data top 1-5 zones

In [2]:
import pandas as pd
import re

# Load the raw survey export (the path is relative to the notebook's folder)
file_path = "../data/Theme Parks Survey Responses 4 Nov.csv"
df = pd.read_csv(file_path)

# Keep only respondents who have visited USS before. The explicit .copy()
# detaches the filtered rows from the frame read from disk, so the column
# assignments further down write to an independent DataFrame instead of
# triggering pandas' SettingWithCopyWarning.
df = df[df['How long has it been since your last visit to USS?'] != 'I have not visited USS before'].copy()

# Give the long survey questions short names for easier reference.
# (Reassigning instead of inplace=True keeps the step re-runnable and chainable.)
df = df.rename(columns={
    "Are you Singaporean/PR or Foreigner?": "Nationality",
    "Which age group do you belong to?": "Age_Group",
    "Choose the top 3 amenities / attractions that are most important to you when you visit the theme park.": "Top_Amenities",
    "Did you purchase an Express Pass?": "express_pass_purchase"
})

# Clean up the Nationality column by dropping the literal "I am a " prefix
df['Nationality'] = df['Nationality'].str.replace("I am a ", "", regex=False)

# Full names of the zone ranking columns that are analysed.
# The Madagascar column is deliberately left out of this list.
ranking_columns = [
    "Rank the following theme park zones based on the order in which you visited them. [Hollywood]",
    "Rank the following theme park zones based on the order in which you visited them. [New York]",
    "Rank the following theme park zones based on the order in which you visited them. [Sci-Fi City]",
    "Rank the following theme park zones based on the order in which you visited them. [Ancient Egypt]",
    "Rank the following theme park zones based on the order in which you visited them. [The Lost World]",
    "Rank the following theme park zones based on the order in which you visited them. [Far Far Away]"
]

# Extract the visited zones (ignoring Madagascar)
def extract_visited_zones(row):
    """Return the zones a respondent visited, ordered by their visiting rank.

    Parameters
    ----------
    row : pandas.Series or iterable
        The rank answers for one respondent. For a Series the index labels are
        used as the column names (this is what ``DataFrame.apply(axis=1)``
        passes in); any other iterable is paired positionally with the
        module-level ``ranking_columns``.

    Returns
    -------
    list of str
        Zone names (the text inside the square brackets of the column name),
        earliest visit first, padded with 'Na' so the list always has one
        entry per supplied rank column.
    """
    if isinstance(row, pd.Series):
        labelled_ranks = list(row.items())
    else:
        labelled_ranks = list(zip(ranking_columns, row))

    visited = []
    for position, (column, rank) in enumerate(labelled_ranks):
        # A blank answer is not evidence of a visit, so treat it like
        # an explicit "Did not visit".
        if pd.isna(rank) or rank == "Did not visit":
            continue
        # Ranks look like "1st", "2nd", ...; sort on the leading number.
        leading_number = re.match(r"\s*(\d+)", str(rank))
        # Answers without a number are kept, but placed after the numbered ones
        # (ties fall back to the original column order via `position`).
        order = int(leading_number.group(1)) if leading_number else float("inf")
        zone_name = column.split('[')[-1].strip(']')
        visited.append((order, position, zone_name))

    visited.sort()
    zone_names = [zone_name for _, _, zone_name in visited]

    # Always pad to a fixed width so every row expands to the same number of columns
    return zone_names + ['Na'] * (len(labelled_ranks) - len(zone_names))

# Names of the six output columns, one per visiting slot
visited_zone_columns = [
    'first_visited_zone', 'second_visited_zone', 'third_visited_zone',
    'fourth_visited_zone', 'fifth_visited_zone', 'sixth_visited_zone'
]

# Run the extractor over every respondent's rank answers and expand the
# resulting lists into a frame aligned on df's index. Slots a respondent
# did not fill come back as missing values and are replaced with 'Na'.
zone_lists = df[ranking_columns].apply(extract_visited_zones, axis=1).tolist()
df[visited_zone_columns] = pd.DataFrame(zone_lists, index=df.index).fillna('Na')

# Normalise the headers: newlines become spaces, surrounding whitespace is removed
df.columns = df.columns.str.replace('\n', ' ').str.strip()

# Header of the survey question in which respondents list their activity order
activities_column_name = "Arrange the following activities in the order in which you experienced them during your visit. A. Go to the rollercoasters B. Go to the Water-based rides C. Go to the F&B outlet D. Go and shop for souvenirs E. Go for a water/restroom break G. Go to the child-friendly rides H. Go to the Haunted House I. Go to the Simulator rides J. Sightseeing K. Go to the Spinning rides"

# Letter code -> activity name (the survey has no option 'F')
activity_mapping = {
    'A': 'Go to the rollercoasters',
    'B': 'Go to the Water-based rides',
    'C': 'Go to the F&B outlet',
    'D': 'Go and shop for souvenirs',
    'E': 'Go for a water/restroom break',
    'G': 'Go to the child-friendly rides',
    'H': 'Go to the Haunted House',
    'I': 'Go to the Simulator rides',
    'J': 'Sightseeing',
    'K': 'Go to the Spinning rides'
}

def extract_activities(activity_string):
    """Translate a free-text activity answer into exactly 14 activity names.

    Every capital letter from A to K found in the answer is looked up in
    ``activity_mapping`` in order of appearance; letters without a mapping
    become 'Na'. Only the first 14 letters are used, and shorter answers are
    padded with 'Na'. Anything that is not a string yields 14 'Na' entries.
    """
    if not isinstance(activity_string, str):
        return ['Na'] * 14

    letters = re.findall(r'([A-K])', activity_string)[:14]
    names = [activity_mapping.get(letter, 'Na') for letter in letters]
    return names + ['Na'] * (14 - len(names))

# Names of the fourteen activity slots, in order of experience
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity',
    'fifth_activity', 'sixth_activity', 'seventh_activity', 'eighth_activity',
    'ninth_activity', 'tenth_activity', 'eleventh_activity', 'twelfth_activity',
    'thirteenth_activity', 'fourteenth_activity'
]

# Expand each respondent's answer into the fourteen activity columns.
# The frame is built from a list of lists (the same way the zone columns are
# built) rather than returning a pd.Series per row: it is much faster and it
# still yields a 14-column frame when there are no rows. Passing `columns=`
# makes pandas raise if a row ever has a different number of entries.
if activities_column_name in df.columns:
    activity_lists = df[activities_column_name].map(extract_activities).tolist()
    activities_df = pd.DataFrame(activity_lists, index=df.index, columns=activity_columns)
    df = pd.concat([df, activities_df], axis=1)
else:
    print(f"Column '{activities_column_name}' not found in the DataFrame.")

# Save the updated DataFrame to a new CSV file.
# NOTE(review): the plotting cells further down read
# 'Updated_Theme_Parks_Survey_Responses 5.csv', not this file - confirm which
# export they are meant to use.
output_file_path = "../data/Updated_Theme_Parks_Survey_Responses A14.csv"
df.to_csv(output_file_path, index=False)

print(f"File saved to {output_file_path}")


File saved to ../data/Updated_Theme_Parks_Survey_Responses A14.csv


Sankey Diagram <br /> Singaporean/PR or Foreigner -> Age demographics -> Express Pass or not -> Which zones

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the dataset
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Survey column holding the visiting rank of a zone; "{}" is the zone name
rank_column_template = 'Rank the following theme park zones based on the order in which you visited them. [{}]'

# Node labels, pillar by pillar. A label's position in `labels` is the node
# index used for the Sankey `source` / `target` lists. Madagascar is not a node.
nationality_labels = ["Singaporean/PR", "foreigner"]
age_labels = ["Below 18", "18-25", "26-35", "36-59", "60 and above"]
express_pass_labels = ["Express Pass", "No Express Pass"]
zone_labels = ["Hollywood", "New York", "Sci-Fi City", "Ancient Egypt", "The Lost World", "Far Far Away"]
labels = nationality_labels + age_labels + express_pass_labels + zone_labels
label_indices = {label: idx for idx, label in enumerate(labels)}

# Column of the removed zone; it may or may not still exist in the file
madagascar_column = rank_column_template.format("Madagascar")

flow_data = {}                    # (source index, target index) -> number of guests
node_counts = [0] * len(labels)   # number of drawn guests passing through each node
included_guests = 0               # guests that actually appear in the diagram

# Loop through each guest in the dataframe
for index, row in df.iterrows():
    # 1. Nationality: every value other than "Singaporean/PR" counts as foreigner
    if row['Nationality'] == "Singaporean/PR":
        nationality_index = label_indices["Singaporean/PR"]
    else:
        nationality_index = label_indices["foreigner"]

    # 2. Age group: skip guests whose age group is not one of the five nodes
    if row['Age_Group'] not in age_labels:
        continue
    age_index = label_indices[row['Age_Group']]

    # 3. Express Pass usage: anything other than "Yes" counts as not purchased
    if row['express_pass_purchase'] == "Yes":
        express_pass_index = label_indices["Express Pass"]
    else:
        express_pass_index = label_indices["No Express Pass"]

    # 4. First visited zone.
    # Guests who started in Madagascar are left out. The check reads the
    # Madagascar rank column itself; .get() returns None when that column
    # is absent from the file, so the check is simply skipped then.
    if row.get(madagascar_column) == "1st":
        print(f"#zone has closed for row {index}")
        continue

    first_zone = next(
        (zone for zone in zone_labels if row[rank_column_template.format(zone)] == "1st"),
        None
    )
    if first_zone is None:
        continue  # No remaining zone was ranked first
    zone_index = label_indices[first_zone]

    # Record the guest's path through the four pillars
    path = [nationality_index, age_index, express_pass_index, zone_index]
    included_guests += 1
    for node in path:
        node_counts[node] += 1
    for flow_key in zip(path, path[1:]):
        flow_data[flow_key] = flow_data.get(flow_key, 0) + 1

# Source, target and value lists for the Sankey links
# (plain comprehensions also work when no guest qualified)
source = [src for src, _ in flow_data]
target = [tgt for _, tgt in flow_data]
values = list(flow_data.values())

# Node labels with percentages. The percentages are taken over the guests that
# are actually drawn, so each label matches the thickness of its node and every
# pillar (including the zone pillar) adds up to 100%.
labels_with_percentages = []
for label, count in zip(labels, node_counts):
    percentage = (count / included_guests) * 100 if included_guests else 0.0
    labels_with_percentages.append(f"{label}<br>({percentage:.1f}%)")

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels_with_percentages  # Use labels with percentages
    ),
    link=dict(
        source=source,
        target=target,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Guest Segmentation and Journey Patterns", font_size=12)
fig.show()



### to debug last pillar not 100%


Sankey Diagram <br /> Age demographics -> Which zones

In [46]:
import pandas as pd
import plotly.graph_objects as go

# Load the dataset
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Survey column holding the visiting rank of a zone; "{}" is the zone name
rank_column_template = 'Rank the following theme park zones based on the order in which you visited them. [{}]'

# Nodes of the diagram: age groups on the left, zones on the right (no Madagascar).
# A label's position in `labels` is its node index.
age_groups = ["Below 18", "18-25", "26-35", "36-59", "60 and above"]
zones = ["Hollywood", "New York", "Sci-Fi City", "Ancient Egypt", "The Lost World", "Far Far Away"]
labels = age_groups + zones
label_indices = {label: idx for idx, label in enumerate(labels)}

# Total responses to calculate the age-group percentages
total_responses = len(df)

# Count the flows age group -> first visited zone
flow_data = {}
for index, row in df.iterrows():
    age_group = row['Age_Group']
    if age_group not in age_groups:
        continue  # Skip if age group is not defined

    first_zone = next(
        (zone for zone in zones if row[rank_column_template.format(zone)] == "1st"),
        None
    )
    if first_zone is None:
        continue  # Skip if none of the remaining zones was ranked first

    flow_key = (label_indices[age_group], label_indices[first_zone])
    flow_data[flow_key] = flow_data.get(flow_key, 0) + 1

# Source, target and value lists (comprehensions also cope with no flows at all)
source = [src for src, _ in flow_data]
target = [tgt for _, tgt in flow_data]
values = list(flow_data.values())

# Prepare the labels with percentages
labels_with_percentages = []

# Age groups: share of all responses
age_group_counts = df['Age_Group'].value_counts()
for age_group in age_groups:
    age_group_percentage = (age_group_counts.get(age_group, 0) / total_responses) * 100 if total_responses > 0 else 0
    labels_with_percentages.append(f"{age_group}<br>({age_group_percentage:.1f}%)")

# Zones: share of the "1st" answers given to the remaining zones, so the zone
# percentages add up to 100% even though Madagascar is excluded
zone_counts = {
    zone: df[rank_column_template.format(zone)].value_counts().get("1st", 0)
    for zone in zones
}
remaining_zone_responses = sum(zone_counts.values())
for zone in zones:
    zone_percentage = (zone_counts[zone] / remaining_zone_responses) * 100 if remaining_zone_responses > 0 else 0
    labels_with_percentages.append(f"{zone}<br>({zone_percentage:.1f}%)")

# Keep only the nodes that take part in a link and renumber them 0..k-1, so the
# manual positions below line up one-to-one with the nodes that are drawn.
used_nodes = sorted(set(source) | set(target))
compact_index = {old: new for new, old in enumerate(used_nodes)}
used_age_nodes = [node for node in used_nodes if node < len(age_groups)]
used_zone_nodes = [node for node in used_nodes if node >= len(age_groups)]


def evenly_spaced(count):
    """Return `count` positions spread evenly over the open interval (0, 1)."""
    return [(k + 0.5) / count for k in range(count)]


# Manual node positions that keep the listed order from top to bottom.
# Plotly's node.x / node.y are normalised positions (0 to 1), and a manual
# layout needs both of them; supplying only y with values 0..10 has no effect.
# 0.001 / 0.999 are used instead of exact 0 / 1 to stay inside the plot area.
x_coords = [0.001] * len(used_age_nodes) + [0.999] * len(used_zone_nodes)
y_coords = evenly_spaced(len(used_age_nodes)) + evenly_spaced(len(used_zone_nodes))

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=[labels_with_percentages[node] for node in used_nodes],  # Labels with percentages
        x=x_coords,  # Column of each node
        y=y_coords   # Vertical order within the column
    ),
    link=dict(
        source=[compact_index[src] for src in source],
        target=[compact_index[tgt] for tgt in target],
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Age Demographics to Theme Park Zones", font_size=12)
fig.show()


Sankey Diagram <br />  Foreigner Guests: Journey from Express Pass to Zones

In [17]:
import pandas as pd
import plotly.graph_objects as go

# Load the dataset
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Survey column holding the visiting rank of a zone; "{}" is the zone name
rank_column_template = 'Rank the following theme park zones based on the order in which you visited them. [{}]'

# Zones in the order they are listed as nodes (Madagascar removed)
zone_order = ["Sci-Fi City", "Hollywood", "New York", "Ancient Egypt", "Far Far Away", "The Lost World"]

# Filter only for guests whose Nationality is exactly 'foreigner'
foreigner_df = df[df['Nationality'] == 'foreigner']

# Total foreigner responses to calculate percentages
total_foreigner_responses = len(foreigner_df)
# Denominator for the percentages below. Falling back to 1 avoids a
# ZeroDivisionError when the subset is empty (every count is 0 in that case,
# so the labels simply read 0.0%).
percentage_base = max(total_foreigner_responses, 1)

# Node labels; a label's position is its node index for `source` / `target`
labels = [
    "Foreigner",  # This will be 100%
    "Express Pass", "No Express Pass"
] + zone_order
label_indices = {label: idx for idx, label in enumerate(labels)}

# Count the flows Foreigner -> express pass -> first visited zone
flow_counts = {}
for index, row in foreigner_df.iterrows():
    # Express Pass usage: anything other than "Yes" counts as not purchased
    if row['express_pass_purchase'] == "Yes":
        express_pass_index = label_indices["Express Pass"]
    else:
        express_pass_index = label_indices["No Express Pass"]

    # First visited zone; .get() tolerates a missing rank column
    first_zone = next(
        (zone for zone in zone_order if row.get(rank_column_template.format(zone)) == "1st"),
        None
    )
    if first_zone is None:
        continue  # Skip guests without a first zone among the remaining zones

    flow_keys = [
        (label_indices["Foreigner"], express_pass_index),
        (express_pass_index, label_indices[first_zone])
    ]
    for flow_key in flow_keys:
        flow_counts[flow_key] = flow_counts.get(flow_key, 0) + 1

# Source, target and value lists for the Sankey links
source = [src for src, _ in flow_counts]
target = [tgt for _, tgt in flow_counts]
values = list(flow_counts.values())

# Node labels with percentages of all foreigner responses
labels_with_percentages = ["Foreigner<br>(100%)"]

express_pass_answers = foreigner_df['express_pass_purchase'].value_counts()

express_pass_count = express_pass_answers.get("Yes", 0)
express_pass_percentage = (express_pass_count / percentage_base) * 100
labels_with_percentages.append(f"Express Pass<br>({express_pass_percentage:.1f}%)")

no_express_pass_count = express_pass_answers.get("No", 0)
no_express_pass_percentage = (no_express_pass_count / percentage_base) * 100
labels_with_percentages.append(f"No Express Pass<br>({no_express_pass_percentage:.1f}%)")

for zone in zone_order:
    zone_count = foreigner_df[rank_column_template.format(zone)].value_counts().get("1st", 0)
    zone_percentage = (zone_count / percentage_base) * 100
    labels_with_percentages.append(f"{zone}<br>({zone_percentage:.1f}%)")

# Define the node and link data for Sankey
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels_with_percentages  # Use labels with percentages
    ),
    link=dict(
        source=source,
        target=target,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Foreigner Guests: Journey from Express Pass to Zones", font_size=12)
fig.show()


Singaporean/PR Guests: Journey from Express Pass to Zones

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the dataset
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Survey column holding the visiting rank of a zone; "{}" is the zone name
rank_column_template = 'Rank the following theme park zones based on the order in which you visited them. [{}]'

# Zones in the order they are listed as nodes (Madagascar removed)
zone_order = ["Sci-Fi City", "Hollywood", "New York", "Ancient Egypt", "Far Far Away", "The Lost World"]

# Filter only for guests whose Nationality is exactly 'Singaporean/PR'
S_PR_df = df[df['Nationality'] == 'Singaporean/PR']

# Total Singaporean/PR responses to calculate percentages
total_S_PR_responses = len(S_PR_df)
# Denominator for the percentages below. Falling back to 1 avoids a
# ZeroDivisionError when the subset is empty (every count is 0 in that case,
# so the labels simply read 0.0%).
percentage_base = max(total_S_PR_responses, 1)

# Node labels; a label's position is its node index for `source` / `target`
labels = [
    "Singaporean/PR",  # This will be 100%
    "Express Pass", "No Express Pass"
] + zone_order
label_indices = {label: idx for idx, label in enumerate(labels)}

# Count the flows Singaporean/PR -> express pass -> first visited zone
flow_counts = {}
for index, row in S_PR_df.iterrows():
    # Express Pass usage: anything other than "Yes" counts as not purchased
    if row['express_pass_purchase'] == "Yes":
        express_pass_index = label_indices["Express Pass"]
    else:
        express_pass_index = label_indices["No Express Pass"]

    # First visited zone; .get() tolerates a missing rank column
    first_zone = next(
        (zone for zone in zone_order if row.get(rank_column_template.format(zone)) == "1st"),
        None
    )
    if first_zone is None:
        continue  # Skip guests without a first zone among the remaining zones

    flow_keys = [
        (label_indices["Singaporean/PR"], express_pass_index),
        (express_pass_index, label_indices[first_zone])
    ]
    for flow_key in flow_keys:
        flow_counts[flow_key] = flow_counts.get(flow_key, 0) + 1

# Source, target and value lists for the Sankey links
source = [src for src, _ in flow_counts]
target = [tgt for _, tgt in flow_counts]
values = list(flow_counts.values())

# Node labels with percentages of all Singaporean/PR responses
labels_with_percentages = ["Singaporean/PR<br>(100%)"]

express_pass_answers = S_PR_df['express_pass_purchase'].value_counts()

express_pass_count = express_pass_answers.get("Yes", 0)
express_pass_percentage = (express_pass_count / percentage_base) * 100
labels_with_percentages.append(f"Express Pass<br>({express_pass_percentage:.1f}%)")

no_express_pass_count = express_pass_answers.get("No", 0)
no_express_pass_percentage = (no_express_pass_count / percentage_base) * 100
labels_with_percentages.append(f"No Express Pass<br>({no_express_pass_percentage:.1f}%)")

for zone in zone_order:
    zone_count = S_PR_df[rank_column_template.format(zone)].value_counts().get("1st", 0)
    zone_percentage = (zone_count / percentage_base) * 100
    labels_with_percentages.append(f"{zone}<br>({zone_percentage:.1f}%)")

# Define the node and link data for Sankey
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels_with_percentages  # Use labels with percentages
    ),
    link=dict(
        source=source,
        target=target,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Singaporean/PR Guests: Journey from Express Pass to Zones", font_size=12)
fig.show()

Age group to Express Pass purchase

In [55]:
import pandas as pd
import plotly.graph_objects as go

# Read the prepared survey responses
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Sankey nodes: the five age groups followed by the two express-pass outcomes.
# A label's position in this list is its node index.
labels = [
    "Below 18", "18-25", "26-35", "36-59", "60 and above",
    "Express Pass", "No Express Pass"
]
label_indices = dict(zip(labels, range(len(labels))))

# Number of responses per age group (used for the node labels further down)
age_group_counts = df['Age_Group'].value_counts()

# Tally one link per guest: age group -> express pass outcome.
# Guests whose age group is not one of the labels are ignored, and any
# purchase answer other than "Yes" is counted as "No Express Pass".
flow_data = {}
for age_group, purchase in zip(df['Age_Group'], df['express_pass_purchase']):
    if age_group in labels:
        outcome = "Express Pass" if purchase == "Yes" else "No Express Pass"
        flow_key = (label_indices[age_group], label_indices[outcome])
        flow_data[flow_key] = flow_data.get(flow_key, 0) + 1

# Unzip the tallies into the parallel sequences plotly expects
source, target, values = zip(*((src, tgt, val) for (src, tgt), val in flow_data.items()))

# Build the node labels; percentages are shares of all responses
total_responses = len(df)
labels_with_percentages = []

for age_group in labels[:5]:
    total_age_group_responses = age_group_counts.get(age_group, 0)
    # Age groups without any response keep their plain name
    labels_with_percentages.append(
        f"{age_group}<br>({(total_age_group_responses / total_responses) * 100:.1f}%)"
        if total_age_group_responses > 0
        else age_group
    )

purchase_answers = df['express_pass_purchase']

express_pass_count = int((purchase_answers == "Yes").sum())
express_pass_percentage = (express_pass_count / total_responses) * 100
labels_with_percentages.append(f"Express Pass<br>({express_pass_percentage:.1f}%)")

no_express_pass_count = int((purchase_answers == "No").sum())
no_express_pass_percentage = (no_express_pass_count / total_responses) * 100
labels_with_percentages.append(f"No Express Pass<br>({no_express_pass_percentage:.1f}%)")

# Assemble the figure from the node and link descriptions
node_settings = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=labels_with_percentages
)
link_settings = dict(source=source, target=target, value=values)
fig = go.Figure(go.Sankey(node=node_settings, link=link_settings))

# Title the figure and render it
fig.update_layout(title_text="Age Group to Express Pass Purchase", font_size=12)
fig.show()


Family segmentation

In [19]:
import pandas as pd
import plotly.graph_objects as go

# Load the dataset
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Standardize the 'Nationality' column by stripping spaces and converting to lowercase
df['Nationality'] = df['Nationality'].str.strip().str.lower()

# Survey columns used below
companion_column = 'Did you visit the theme park with other people?'
rank_column_template = 'Rank the following theme park zones based on the order in which you visited them. [{}]'

# Define categories and labels
zone_order = ["Sci-Fi City", "Hollywood", "New York", "Ancient Egypt", "Far Far Away", "The Lost World"]
nationality_labels = ["singaporean/pr", "foreigner"]
# Display names for the lowercased nationality values
# (str.capitalize() would render "Singaporean/pr")
nationality_display_names = {"singaporean/pr": "Singaporean/PR", "foreigner": "Foreigner"}
companion_types = ["I am visiting with family", "I am visiting with friend(s)", "I am visiting with my partner",
                   "I do visit with my family, friends and partner.", "Work colleagues"]
# Bucket for every companion answer that is not listed above. It has to be a
# node of its own, otherwise looking up its index raises KeyError.
# NOTE(review): all unlisted answers land here - confirm they all mean "alone".
other_companion_label = "Visiting alone"
express_pass_labels = ["No", "Yes"]

# Full list of labels; a label's position is its node index
labels = nationality_labels + companion_types + [other_companion_label] + express_pass_labels + zone_order
label_indices = {label: idx for idx, label in enumerate(labels)}

# Total number of responses
total_responses = len(df)


def calculate_percentage(column, category):
    """Return the share (0-100) of all responses whose `column` equals `category`.

    Reads the module-level `df` and `total_responses`; returns 0.0 when there
    are no responses at all.
    """
    if total_responses == 0:
        return 0.0
    count = df[column].value_counts().get(category, 0)
    return (count / total_responses) * 100


# Node labels with percentages, in exactly the same order as `labels`
labels_with_percentages = []

# Nationality nodes
for label in nationality_labels:
    percentage = calculate_percentage('Nationality', label)
    labels_with_percentages.append(f"{nationality_display_names[label]}<br>({percentage:.1f}%)")

# Companion type nodes
for label in companion_types:
    percentage = calculate_percentage(companion_column, label)
    labels_with_percentages.append(f"{label}<br>({percentage:.1f}%)")

# Fallback companion node: answered, but with none of the listed options
companion_answers = df[companion_column].dropna()
other_companion_count = int((~companion_answers.isin(companion_types)).sum())
other_companion_percentage = (other_companion_count / total_responses) * 100 if total_responses else 0.0
labels_with_percentages.append(f"{other_companion_label}<br>({other_companion_percentage:.1f}%)")

# Express pass nodes
for label in express_pass_labels:
    percentage = calculate_percentage('express_pass_purchase', label)
    labels_with_percentages.append(f"Express Pass<br>({label})<br>({percentage:.1f}%)")

# Zone nodes: share of responses that ranked the zone first
for label in zone_order:
    percentage = calculate_percentage(rank_column_template.format(label), "1st")
    labels_with_percentages.append(f"{label}<br>({percentage:.1f}%)")

# Prepare flow data from dataset
flow_counts = {}
for _, row in df.iterrows():
    # Nationality: anything other than 'singaporean/pr' counts as foreigner
    nationality = row['Nationality']
    if pd.isnull(nationality):
        continue
    nationality_index = label_indices["singaporean/pr"] if nationality == 'singaporean/pr' else label_indices["foreigner"]

    # Companion type: unlisted answers go to the fallback node
    companion_type = row[companion_column]
    if pd.isnull(companion_type):
        continue
    companion_label = companion_type if companion_type in companion_types else other_companion_label
    companion_index = label_indices[companion_label]

    # Express pass status: anything other than "Yes" counts as "No"
    express_pass = row['express_pass_purchase']
    if pd.isnull(express_pass):
        continue
    express_pass_index = label_indices["Yes" if express_pass == "Yes" else "No"]

    # First visited zone; guests without one among the listed zones are skipped
    first_zone = next(
        (zone for zone in zone_order if row.get(rank_column_template.format(zone)) == "1st"),
        None
    )
    if first_zone is None:
        continue
    zone_index = label_indices[first_zone]

    # Collect flow data along the guest's path
    flow_keys = [
        (nationality_index, companion_index),
        (companion_index, express_pass_index),
        (express_pass_index, zone_index)
    ]
    for key in flow_keys:
        flow_counts[key] = flow_counts.get(key, 0) + 1

# Prepare source, target, values, and percentages lists
source = []
target = []
values = []
percentages = []

for (src, tgt), val in flow_counts.items():
    source.append(src)
    target.append(tgt)
    values.append(val)
    # total_responses is > 0 here: a flow can only exist if there is a row
    percentage = (val / total_responses) * 100
    percentages.append(f"{percentage:.1f}%")

# Define the Sankey figure with updated labels and hovertemplate to show percentages
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels_with_percentages
    ),
    link=dict(
        source=source,
        target=target,
        value=values,
        customdata=percentages,
        hovertemplate="Flow: %{value} (<b>%{customdata}</b>)<extra></extra>"  # Display flow value and percentage
    )
))

# Update layout and display figure
fig.update_layout(title_text="Guest Journey: Nationality to Companion Type to Express Pass to Zone", font_size=12)
fig.show()


Express Pass Purchases: Singaporean/PR vs Foreigner

In [5]:
import pandas as pd
import plotly.express as px

# Load the dataset
file_path = r'../data/Updated_Theme_Parks_Survey_Responses 5.csv'
df = pd.read_csv(file_path)

# Flag whether an Express Pass was purchased: exactly "Yes" -> 'Yes',
# every other value (including missing answers) -> 'No'
df['Express Pass Purchased'] = df['express_pass_purchase'].eq('Yes').map({True: 'Yes', False: 'No'})

# Count guests per nationality and purchase flag. reindex() guarantees that both
# the 'Yes' and the 'No' column exist even if one outcome never occurs in the
# data, so the arithmetic below cannot fail with a KeyError.
express_pass_summary = (
    df.groupby(['Nationality', 'Express Pass Purchased'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['Yes', 'No'], fill_value=0)
)
express_pass_summary['Total'] = express_pass_summary['Yes'] + express_pass_summary['No']
express_pass_summary['Yes %'] = (express_pass_summary['Yes'] / express_pass_summary['Total']) * 100
express_pass_summary['No %'] = (express_pass_summary['No'] / express_pass_summary['Total']) * 100

# Reshape to long form: one row per (nationality, Yes % / No %) pair for plotting
express_pass_summary = express_pass_summary.reset_index().melt(
    id_vars='Nationality', value_vars=['Yes %', 'No %'],
    var_name='Express Pass', value_name='Percentage'
)

# Plot the data using Plotly Express
fig = px.bar(express_pass_summary, x='Nationality', y='Percentage', color='Express Pass',
             title='Express Pass Purchases: Singaporean/PR vs Foreigner',
             labels={'Percentage': 'Percentage of Guests'},
             barmode='group',
             text='Percentage')  # Add percentage text on top of each bar

# Format the text on the bars to show percentages with 1 decimal place
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')

# Set the axis titles and limit the y-axis range to 0-95%.
# NOTE(review): a bar above 95% (or its outside label) would be cut off by
# this range - widen it if a group can reach that share.
fig.update_layout(
    yaxis=dict(title='Percentage', range=[0, 95]),
    xaxis=dict(title='Nationality')
)

fig.show()
