Sankey Diagram <br /> Singaporean/PR or Foreigner -> Age demographics -> Express Pass or not -> First zone visited

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the cleaned survey responses.
# NOTE(review): this is a machine-specific absolute path; the cell only runs
# where this exact file exists.
file_path = r'C:\Users\knnth\Dropbox\PC\Desktop\DSA3101\Project\Data\Cleaned_Survey_Responses.xlsx'
df = pd.read_excel(file_path)

# Node labels for the Sankey diagram. A label's position in this list is the
# node index used by the lookup tables below, so the two must stay in sync.
labels = [
    "Singaporean/PR", "Foreigner",
    "Below 18", "Ages 18-25", "Ages 26-35", "Ages 36-59", "Ages 60 and above",
    "Express Pass", "No Express Pass",
    "Hollywood", "New York", "Sci-Fi City", "Ancient Egypt", "The Lost World", "Far Far Away", "Madagascar"
]

# Lookup tables (survey answer -> node index), built once instead of per row.
age_mapping = {
    "Below 18": 2, "18-25": 3, "26-35": 4, "36-59": 5, "60 and above": 6
}
zone_mapping = {
    "Hollywood": 9,
    "New York": 10,
    "Sci-Fi City": 11,
    "Ancient Egypt": 12,
    "The Lost World": 13,
    "Far Far Away": 14,
    "Madagascar": 15
}
# One ranking column exists per zone; map each column name to its zone node.
zone_rank_columns = {
    f'Rank the following theme park zones based on the order in which you visited them. [{zone}]': idx
    for zone, idx in zone_mapping.items()
}

# (source node, target node) -> number of guests following that link
flow_data = {}

for _, row in df.iterrows():
    # 1. Nationality. Compared case-insensitively so that differently cased
    #    spellings of the same answer land on the same node. Anything that is
    #    not Singaporean/PR (including a missing value) counts as Foreigner.
    nationality = row['Nationality']
    is_local = isinstance(nationality, str) and nationality.strip().lower() == "singaporean/pr"
    nationality_index = 0 if is_local else 1

    # 2. Age group; guests with an unknown age group are skipped.
    age_index = age_mapping.get(row['Age_Group'])
    if age_index is None:
        continue

    # 3. Express Pass usage; any answer other than "Yes" counts as no pass.
    express_pass_index = 7 if row['express_pass_purchase'] == "Yes" else 8

    # 4. First visited zone: the first zone (in zone_mapping order) ranked "1st".
    zone_index = next(
        (idx for column, idx in zone_rank_columns.items() if row[column] == "1st"),
        None,
    )
    if zone_index is None:
        continue  # No zone was ranked first for this guest

    # Count this guest once on each of the three links of the journey.
    for link in (
        (nationality_index, age_index),
        (age_index, express_pass_index),
        (express_pass_index, zone_index),
    ):
        flow_data[link] = flow_data.get(link, 0) + 1

# Split the counted links into the parallel lists plotly expects.
# Built this way (rather than zip(*...)) so an empty result does not raise.
source = [src for src, _ in flow_data]
target = [tgt for _, tgt in flow_data]
values = list(flow_data.values())

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels
    ),
    link=dict(
        source=source,
        target=target,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Guest Segmentation and Journey Patterns", font_size=12)
fig.show()

Sankey Diagram <br /> Age demographics -> Which zones

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the cleaned survey responses.
# NOTE(review): this is a machine-specific absolute path; the cell only runs
# where this exact file exists.
file_path = r'C:\Users\knnth\Dropbox\PC\Desktop\DSA3101\Project\Data\Cleaned_Survey_Responses.xlsx'
df = pd.read_excel(file_path)

# Node labels for the Sankey diagram. A label's position in this list is the
# node index used by the lookup tables below, so the two must stay in sync.
labels = [
    "Below 18", "Ages 18-25", "Ages 26-35", "Ages 36-59", "Ages 60 and above",
    "Hollywood", "New York", "Sci-Fi City", "Ancient Egypt", "The Lost World", "Far Far Away", "Madagascar"
]

# Lookup tables (survey answer -> node index), built once instead of per row.
age_mapping = {
    "Below 18": 0, "18-25": 1, "26-35": 2, "36-59": 3, "60 and above": 4
}
zone_mapping = {
    "Hollywood": 5,
    "New York": 6,
    "Sci-Fi City": 7,
    "Ancient Egypt": 8,
    "The Lost World": 9,
    "Far Far Away": 10,
    "Madagascar": 11
}
# One ranking column exists per zone; map each column name to its zone node.
zone_rank_columns = {
    f'Rank the following theme park zones based on the order in which you visited them. [{zone}]': idx
    for zone, idx in zone_mapping.items()
}

# (age node, zone node) -> number of guests, keyed by index into `labels`
flow_data = {}

for _, row in df.iterrows():
    # 1. Age group; guests with an unknown age group are skipped.
    age_index = age_mapping.get(row['Age_Group'])
    if age_index is None:
        continue

    # 2. First visited zone: the first zone (in zone_mapping order) ranked "1st".
    zone_index = next(
        (idx for column, idx in zone_rank_columns.items() if row[column] == "1st"),
        None,
    )
    if zone_index is None:
        continue  # No zone was ranked first for this guest

    link = (age_index, zone_index)
    flow_data[link] = flow_data.get(link, 0) + 1

# Only nodes that take part in at least one link are handed to plotly, and the
# links are re-indexed against that compact list. This keeps the position
# arrays below aligned one-to-one with the nodes that are actually drawn.
active_nodes = sorted({node for link in flow_data for node in link})
compact_index = {node: pos for pos, node in enumerate(active_nodes)}
node_labels = [labels[node] for node in active_nodes]

# Parallel lists for the links. Built this way (rather than zip(*...)) so an
# empty result does not raise.
source = [compact_index[src] for src, _ in flow_data]
target = [compact_index[tgt] for _, tgt in flow_data]
values = list(flow_data.values())

# Node positions that keep each column in label order.
# plotly expects *normalised* coordinates and uses them only when both x and y
# are given. Values are kept strictly above 0 because exact zeros are reported
# to be ignored by the renderer.
age_node_ids = set(age_mapping.values())
active_age_nodes = [node for node in active_nodes if node in age_node_ids]
active_zone_nodes = [node for node in active_nodes if node not in age_node_ids]

x_coords = []
y_coords = []
# Age indices are all smaller than zone indices, so appending the age column
# first and the zone column second matches the order of `active_nodes`.
for column_x, column_nodes in ((0.001, active_age_nodes), (0.999, active_zone_nodes)):
    for rank in range(len(column_nodes)):
        x_coords.append(column_x)
        # Evenly spaced vertical centres, top to bottom, inside (0, 1).
        y_coords.append((rank + 0.5) / len(column_nodes))

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    arrangement="snap",  # plotly's default; lets it resolve overlaps between positioned nodes
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        x=x_coords,
        y=y_coords  # Normalised positions to maintain the label order
    ),
    link=dict(
        source=source,
        target=target,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Guest Segmentation and Journey Patterns", font_size=12)
fig.show()

Sankey Diagram <br />  Foreigner Guests: Journey from Express Pass to Zones

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the cleaned survey responses.
# NOTE(review): this is a machine-specific absolute path; the cell only runs
# where this exact file exists.
file_path = r'C:\Users\knnth\Dropbox\PC\Desktop\DSA3101\Project\Data\Cleaned_Survey_Responses.xlsx'
df = pd.read_excel(file_path)

# Order in which the theme park zones are listed as nodes (and checked below).
zone_order = ["Sci-Fi City", "Hollywood", "New York", "Ancient Egypt", "Far Far Away", "Madagascar", "The Lost World"]

# Keep only foreigner guests. The comparison is case-insensitive and ignores
# surrounding whitespace, so "Foreigner" and "foreigner" are both selected.
nationality_normalized = df['Nationality'].astype(str).str.strip().str.lower()
foreigner_df = df[nationality_normalized == 'foreigner']

# Node labels for the Sankey diagram: nationality, express pass status, zones.
labels = [
    "Foreigner",
    "Express Pass", "No Express Pass"
] + zone_order

# Label -> node index, so links can be expressed by name instead of magic numbers.
label_indices = {label: idx for idx, label in enumerate(labels)}

# One ranking column exists per zone; map each column name to its zone node.
zone_rank_columns = {
    f'Rank the following theme park zones based on the order in which you visited them. [{zone}]': label_indices[zone]
    for zone in zone_order
}

# (source node, target node) -> number of guests following that link
flow_counts = {}

for _, row in foreigner_df.iterrows():
    # Express Pass usage; any answer other than "Yes" counts as no pass.
    if row['express_pass_purchase'] == "Yes":
        express_pass_index = label_indices["Express Pass"]
    else:
        express_pass_index = label_indices["No Express Pass"]

    # First visited zone: the first zone (in zone_order) ranked "1st".
    # row.get returns None for a missing column, which simply never matches.
    zone_index = next(
        (idx for column, idx in zone_rank_columns.items() if row.get(column) == "1st"),
        None,
    )
    if zone_index is None:
        continue  # No zone was ranked first for this guest

    # Count the guest on both links: nationality -> pass status -> first zone.
    for link in (
        (label_indices["Foreigner"], express_pass_index),
        (express_pass_index, zone_index),
    ):
        flow_counts[link] = flow_counts.get(link, 0) + 1

# Split the counted links into the parallel lists plotly expects.
source = [src for src, _ in flow_counts]
target = [tgt for _, tgt in flow_counts]
values = list(flow_counts.values())

# Define the node and link data for Sankey
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels
    ),
    link=dict(
        source=source,
        target=target,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Foreigner Guests: Journey from Express Pass to Zones", font_size=12)
fig.show()