Theme Park Activity Flow from Hollywood

In [8]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Load the survey responses
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Survey columns holding each respondent's activities, in visit order
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity', 'fifth_activity',
    'sixth_activity', 'seventh_activity', 'eighth_activity', 'ninth_activity', 'tenth_activity',
    'eleventh_activity', 'twelfth_activity', 'thirteenth_activity', 'fourteenth_activity'
]

# Activities counted for this zone, spelled the way they look after title-casing
valid_activities = [
    "Go To The F&B Outlet", "Go And Shop For Souvenirs", "Go For A Water/Restroom Break",
    "Go To The Haunted House", "Sightseeing"
]


def _clean_activity(value):
    """Return the stripped, title-cased activity, or 'Na' if it is missing or not valid."""
    # Blank cells arrive as NaN (a float), so anything that is not a string is "no activity".
    if not isinstance(value, str):
        return 'Na'
    cleaned = value.strip().title()
    return cleaned if cleaned in valid_activities else 'Na'


# Replace any activity not in valid_activities with 'Na'.
# Series.map is used rather than the .str accessor: a column with no answers at
# all is read as float64, and .str raises AttributeError on non-string columns.
for col in activity_columns:
    df[col] = df[col].map(_clean_activity)

# Short display names for the valid activities
activity_mapping = {
    "Go To The F&B Outlet": "F&B outlet",
    "Go And Shop For Souvenirs": "Shop for souvenirs",
    "Go For A Water/Restroom Break": "Water/restroom break",
    "Go To The Haunted House": "Haunted House",
    "Sightseeing": "Sightseeing",
}

# One output slot per survey activity column
n_steps = len(activity_columns)

# Build one compacted activity sequence per respondent: valid activities first,
# in visit order, with invalid/missing entries dropped.
valid_activity_df = []
for visit in df[activity_columns].itertuples(index=False, name=None):
    sequence = [activity_mapping[name] for name in visit if name in activity_mapping]

    # Pad to a fixed width with the respondent's first activity ("Na" if none).
    # NOTE(review): this repeats the first activity in every unused slot, so
    # respondents with fewer valid activities contribute repeated steps to the
    # later stages and to the frequency counts - confirm this is intended.
    filler = sequence[0] if sequence else "Na"
    sequence.extend([filler] * (n_steps - len(sequence)))

    valid_activity_df.append(sequence)

# Create a new dataframe with the valid activities (one column per step)
valid_activity_df = pd.DataFrame(valid_activity_df, columns=[f"activity_{i+1}" for i in range(n_steps)])

# Remove rows where the first activity is 'Na' (respondents with no valid activity)
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"]

# Count how often each activity occurs anywhere in the table ("Na" excluded)
activities = valid_activity_df.to_numpy().ravel().tolist()
activity_counts = Counter(name for name in activities if name != "Na")

# The five most frequent activities
top_5_activities = [name for name, _count in activity_counts.most_common(5)]


def _keep_top_activity(name):
    """Return the activity unchanged if it is in the top five, otherwise 'Na'."""
    return name if name in top_5_activities else 'Na'


# Blank out every cell that is not one of the top five activities
filtered_df = valid_activity_df.apply(lambda column: column.map(_keep_top_activity))

# For the first five steps, append each activity's share of that column to its label
for stage in range(1, 6):
    col = f"activity_{stage}"
    activity_percentage = filtered_df[col].value_counts(normalize=True) * 100
    labelled = []
    for name in filtered_df[col]:
        if name == 'Na':
            labelled.append('Na')
        else:
            labelled.append(f"{name}<br>({activity_percentage.get(name, 0):.2f}%)")
    filtered_df[col] = labelled

# Every label present in the table, in order of appearance, without "Na"
activities = [label for label in filtered_df.values.flatten().tolist() if label != "Na"]

# The diagram starts at the zone itself
start_node = "Hollywood"
nodes = [start_node] + list(dict.fromkeys(activities))

# The start node is looked up by its plain name but displayed with "(100%)"
activity_to_index = {start_node: 0}
nodes[0] = f"{start_node}<br>(100%)"

# Remaining labels are indexed by their position in the node list
activity_to_index.update(
    (label, position) for position, label in enumerate(nodes) if position > 0
)

# Number of respondents per (source, target) transition
link_counts = Counter()

for _, visit in filtered_df.iterrows():
    first = visit['activity_1']

    # Steps 2-5 get a stage prefix so each stage has its own set of nodes
    later_steps = []
    for stage in (2, 3, 4, 5):
        label = visit[f"activity_{stage}"]
        later_steps.append(f"{stage}: {label}" if label != "Na" else "Na")

    # Register stage-prefixed labels the first time they are seen
    for label in later_steps:
        if label != "Na" and label not in activity_to_index:
            activity_to_index[label] = len(activity_to_index)
            nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps, skipping any pair that involves "Na" or repeats itself
    path = [first] + later_steps
    for source, target in zip(path, path[1:]):
        if source != "Na" and target != "Na" and source != target:
            link_counts[(source, target)] += 1

# Keep only transitions whose two ends are real nodes
valid_links = [
    (source, target, count)
    for (source, target), count in link_counts.items()
    if source != "Na" and target != "Na"
]

# Plotly expects three parallel lists: source index, target index, flow size
sources = [activity_to_index[source] for source, _, _ in valid_links]
targets = [activity_to_index[target] for _, target, _ in valid_links]
values = [count for _, _, count in valid_links]

# Assemble the Sankey trace
sankey_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": nodes,
    },
    link={
        "source": sources,
        "target": targets,
        "value": values,
    },
)
fig = go.Figure(sankey_trace)

# Title and font size
fig.update_layout(title="Theme Park Activity Flow from Hollywood", font_size=12)

# Render the diagram
fig.show()


In [33]:
# This cell relies on names left behind by the Hollywood cell: df,
# activity_columns, activity_mapping, nodes, start_node, link_counts and
# activity_to_index.

# Total number of counted transitions (not used below; kept in case other cells read it)
total_count = sum(link_counts.values())

# Pool every recorded visit across all activity columns and count it once per
# activity. (The previous version added per-column percentages twice - the
# second pass was labelled "normalize" but repeated the accumulation - so the
# values were doubled and did not sum to 100%.)
visit_counts = Counter()
for col in activity_columns:
    for activity, occurrences in df[col].value_counts().items():
        if activity != "Na":  # Skip Na values
            visit_counts[activity_mapping.get(activity, activity)] += occurrences
total_visits = sum(visit_counts.values())


def _base_activity(label):
    """Return the plain activity name behind a node label.

    Drops a trailing '<br>(..%)' decoration and a leading 'N: ' stage prefix,
    both of which earlier statements may already have added to the label.
    """
    name = label.split("<br>")[0]
    stage, separator, remainder = name.partition(": ")
    return remainder if separator and stage.isdigit() else name


# Share of all recorded visits for each node; the start node is fixed at 100%.
# Shares are looked up by base name, so a node whose activity never occurs
# gets 0 instead of raising KeyError.
node_percentages = {}
for node in nodes:
    base = _base_activity(node)
    if base == start_node:
        node_percentages[node] = 100
    elif total_visits:
        node_percentages[node] = visit_counts[base] / total_visits * 100
    else:
        node_percentages[node] = 0
node_percentages[start_node] = 100  # plain-name key kept alongside the decorated label

# Rebuild each label from its undecorated text so a percentage appears only once
nodes_with_percentages = []
for node in nodes:
    plain_label = node.split("<br>")[0]
    nodes_with_percentages.append(f"{plain_label}<br>({node_percentages[node]:.1f}%)")

# Prepare lists for sources, targets, and values based on the link counts
sources = []
targets = []
values = []

for (source, target), count in link_counts.items():
    if source != "Na" and target != "Na":  # Only add links that are valid
        sources.append(activity_to_index[source])
        targets.append(activity_to_index[target])
        values.append(count)

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes_with_percentages  # Use labels with percentages
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values
    )
))

# Add title and show figure
fig.update_layout(title_text="Theme Park Activity Flow", font_size=12)
fig.show()


Theme Park Activity Flow from Sci-Fi City (5 activities)

In [58]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Load the survey responses
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Activities counted for this zone, spelled as in the dataset
# (excluding "Go to the Water-based rides" and "Go to the child-friendly rides")
all_possible_activities = [
    "Go to the rollercoasters", "Go to the F&B outlet",
    "Go and shop for souvenirs", "Go for a water/restroom break",
    "Go to the Haunted House", "Go to the Simulator rides",
    "Sightseeing", "Go to the Spinning rides"
]

# Survey columns holding each respondent's activities, in visit order
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity', 'fifth_activity',
    'sixth_activity', 'seventh_activity', 'eighth_activity', 'ninth_activity', 'tenth_activity',
    'eleventh_activity', 'twelfth_activity', 'thirteenth_activity', 'fourteenth_activity'
]


def _strip_or_na(value):
    """Return the stripped activity string, or 'Na' when the cell is not a string."""
    return value.strip() if isinstance(value, str) else 'Na'


# Trim whitespace and mark blanks as 'Na' without changing activity names.
# Series.map is used rather than the .str accessor: a column with no answers at
# all is read as float64, and .str raises AttributeError on non-string columns.
for col in activity_columns:
    df[col] = df[col].map(_strip_or_na)

# One row per respondent; activities keep their column position and anything
# that is not a valid activity becomes 'Na'.
valid_activity_df = [
    [name if name in all_possible_activities else 'Na' for name in visit]
    for visit in df[activity_columns].itertuples(index=False, name=None)
]

# Create a new dataframe with the valid activities
valid_activity_df = pd.DataFrame(
    valid_activity_df,
    columns=[f"activity_{i+1}" for i in range(len(activity_columns))],
)

# Remove rows where the first activity is 'Na' (this filters out empty or invalid rows)
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"]

# Count how often each activity occurs anywhere in the table ("Na" excluded)
activities = valid_activity_df.to_numpy().ravel().tolist()
activity_counts = Counter(name for name in activities if name != "Na")

# Every possible activity is kept, so the filter below uses the full list
top_activities = all_possible_activities


def _keep_listed_activity(name):
    """Return the activity unchanged if it is in top_activities, otherwise 'Na'."""
    return name if name in top_activities else 'Na'


# Blank out every cell that is not a listed activity
filtered_df = valid_activity_df.apply(lambda column: column.map(_keep_listed_activity))

# Every activity present in the table, without "Na"
activities = [name for name in filtered_df.values.flatten().tolist() if name != "Na"]

# The diagram starts at the zone itself, followed by each distinct activity
start_node = "Sci-Fi City"
nodes = [start_node] + list(dict.fromkeys(activities))

# Node label -> position in the node list
activity_to_index = dict(zip(nodes, range(len(nodes))))

# Number of respondents per (source, target) transition
link_counts = Counter()

for _, visit in filtered_df.iterrows():
    first = visit['activity_1']

    # Steps 2-5 get a "2 - ", "3 - ", ... prefix so each stage has its own nodes
    later_steps = []
    for stage in (2, 3, 4, 5):
        name = visit[f"activity_{stage}"]
        later_steps.append(f"{stage} - {name}" if name != "Na" else "Na")

    # Register prefixed labels the first time they are seen
    for label in later_steps:
        if label != "Na" and label not in activity_to_index:
            activity_to_index[label] = len(activity_to_index)
            nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps, skipping any pair that involves "Na" or repeats itself
    path = [first] + later_steps
    for source, target in zip(path, path[1:]):
        if source != "Na" and target != "Na" and source != target:
            link_counts[(source, target)] += 1

# Each activity's share of all counted activities, in percent
total_activities = sum(activity_counts.values())
activity_percentage = {
    name: (occurrences / total_activities) * 100
    for name, occurrences in activity_counts.items()
}


def _label_with_percentage(node):
    """Build the display label for one node: name, line break, percentage."""
    # The start node always shows 100%
    if node == start_node:
        return f"{node}<br>(100%)"
    # Stage-prefixed nodes ("2 - name") are shown as "2: name" with the activity's share
    if " - " in node:
        pieces = node.split(" - ")
        stage, base_activity = pieces[0], pieces[1]
        share = activity_percentage.get(base_activity, 0)
        return f"{stage}: {base_activity}<br>({share:.2f}%)"
    # First-step nodes carry the plain activity name
    share = activity_percentage.get(node, 0)
    return f"{node}<br>({share:.2f}%)"


nodes_with_percentage = [_label_with_percentage(node) for node in nodes]

# Keep only transitions whose two ends are real nodes
valid_links = [
    (source, target, count)
    for (source, target), count in link_counts.items()
    if source != "Na" and target != "Na"
]

# Plotly expects three parallel lists: source index, target index, flow size
sources = [activity_to_index[source] for source, _, _ in valid_links]
targets = [activity_to_index[target] for _, target, _ in valid_links]
values = [count for _, _, count in valid_links]

# Assemble the Sankey trace with the percentage labels
sankey_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": nodes_with_percentage,
    },
    link={
        "source": sources,
        "target": targets,
        "value": values,
    },
)
fig = go.Figure(sankey_trace)

# Title and font size
fig.update_layout(title="Theme Park Activity Flow from Sci-Fi City", font_size=12)

# Render the diagram
fig.show()


Theme Park Activity Flow from Sci-Fi City (3 activities)

In [37]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Load the survey responses
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Activities counted for this zone, spelled as in the dataset
# (excluding "Go to the Water-based rides" and "Go to the child-friendly rides")
all_possible_activities = [
    "Go to the rollercoasters", "Go to the F&B outlet",
    "Go and shop for souvenirs", "Go for a water/restroom break",
    "Go to the Haunted House", "Go to the Simulator rides",
    "Sightseeing", "Go to the Spinning rides"
]

# Activity columns to check (only first 3 activities)
activity_columns = [
    'first_activity', 'second_activity', 'third_activity'
]


def _strip_or_na(value):
    """Return the stripped activity string, or 'Na' when the cell is not a string."""
    return value.strip() if isinstance(value, str) else 'Na'


# Trim whitespace and mark blanks as 'Na' without changing activity names.
# Series.map is used rather than the .str accessor: a column with no answers at
# all is read as float64, and .str raises AttributeError on non-string columns.
for col in activity_columns:
    df[col] = df[col].map(_strip_or_na)

# One row per respondent; activities keep their column position and anything
# that is not a valid activity becomes 'Na'.
valid_activity_df = [
    [name if name in all_possible_activities else 'Na' for name in visit]
    for visit in df[activity_columns].itertuples(index=False, name=None)
]

# Create a new dataframe with the valid activities (first 3 activities)
valid_activity_df = pd.DataFrame(
    valid_activity_df,
    columns=[f"activity_{i+1}" for i in range(len(activity_columns))],
)

# Remove rows where the first activity is 'Na' (this filters out empty or invalid rows)
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"]

# Count how often each activity occurs anywhere in the table ("Na" excluded)
activities = valid_activity_df.to_numpy().ravel().tolist()
activity_counts = Counter(name for name in activities if name != "Na")

# Every possible activity is kept, so the filter below uses the full list
top_activities = all_possible_activities


def _keep_listed_activity(name):
    """Return the activity unchanged if it is in top_activities, otherwise 'Na'."""
    return name if name in top_activities else 'Na'


# Blank out every cell that is not a listed activity
filtered_df = valid_activity_df.apply(lambda column: column.map(_keep_listed_activity))

# Every activity present in the table, without "Na"
activities = [name for name in filtered_df.values.flatten().tolist() if name != "Na"]

# The diagram starts at the zone itself, followed by each distinct activity
start_node = "Sci-Fi City"
nodes = [start_node] + list(dict.fromkeys(activities))

# Node label -> position in the node list
activity_to_index = dict(zip(nodes, range(len(nodes))))

# Number of respondents per (source, target) transition
link_counts = Counter()

for _, visit in filtered_df.iterrows():
    first = visit['activity_1']

    # Steps 2 and 3 get a "2 - " / "3 - " prefix so each stage has its own nodes
    later_steps = []
    for stage in (2, 3):
        name = visit[f"activity_{stage}"]
        later_steps.append(f"{stage} - {name}" if name != "Na" else "Na")

    # Register prefixed labels the first time they are seen
    for label in later_steps:
        if label != "Na" and label not in activity_to_index:
            activity_to_index[label] = len(activity_to_index)
            nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps, skipping any pair that involves "Na" or repeats itself
    path = [first] + later_steps
    for source, target in zip(path, path[1:]):
        if source != "Na" and target != "Na" and source != target:
            link_counts[(source, target)] += 1

# Each activity's share of all counted activities, in percent
total_activities = sum(activity_counts.values())
activity_percentage = {
    name: (occurrences / total_activities) * 100
    for name, occurrences in activity_counts.items()
}


def _label_with_percentage(node):
    """Build the display label for one node: name, line break, percentage."""
    # The start node always shows 100%
    if node == start_node:
        return f"{node}<br>(100%)"
    # Stage-prefixed nodes ("2 - name") are shown as "2: name" with the activity's share
    if " - " in node:
        pieces = node.split(" - ")
        stage, base_activity = pieces[0], pieces[1]
        share = activity_percentage.get(base_activity, 0)
        return f"{stage}: {base_activity}<br>({share:.2f}%)"
    # First-step nodes carry the plain activity name
    share = activity_percentage.get(node, 0)
    return f"{node}<br>({share:.2f}%)"


nodes_with_percentage = [_label_with_percentage(node) for node in nodes]

# Keep only transitions whose two ends are real nodes
valid_links = [
    (source, target, count)
    for (source, target), count in link_counts.items()
    if source != "Na" and target != "Na"
]

# Plotly expects three parallel lists: source index, target index, flow size
sources = [activity_to_index[source] for source, _, _ in valid_links]
targets = [activity_to_index[target] for _, target, _ in valid_links]
values = [count for _, _, count in valid_links]

# Assemble the Sankey trace with the percentage labels
sankey_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": nodes_with_percentage,
    },
    link={
        "source": sources,
        "target": targets,
        "value": values,
    },
)
fig = go.Figure(sankey_trace)

# Title and font size
fig.update_layout(title="Theme Park Activity Flow from Sci-Fi City", font_size=12)

# Render the diagram
fig.show()


Theme Park Activity Flow from Ancient Egypt

In [54]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Load the survey responses
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Define the list of valid Ancient Egypt activities (simplified names)
ancient_egypt_activities = [
    "Rollercoasters", "F&B outlet", "Shop for souvenirs",
    "Water/restroom break", "Haunted House", "Sightseeing"
]

# Mapping from dataset activity names to simplified activity names
activity_mapping = {
    "Go to the rollercoasters": "Rollercoasters",
    "Go to the F&B outlet": "F&B outlet",
    "Go and shop for souvenirs": "Shop for souvenirs",
    "Go for a water/restroom break": "Water/restroom break",
    "Go to the Haunted House": "Haunted House",
    "Sightseeing": "Sightseeing"
}

# Survey columns holding each respondent's activities, in visit order
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity', 'fifth_activity',
    'sixth_activity', 'seventh_activity', 'eighth_activity', 'ninth_activity', 'tenth_activity',
    'eleventh_activity', 'twelfth_activity', 'thirteenth_activity', 'fourteenth_activity'
]


def _strip_or_na(value):
    """Return the stripped activity string, or 'Na' when the cell is not a string."""
    return value.strip() if isinstance(value, str) else 'Na'


# Clean the activity columns (strip and handle Na values).
# Series.map is used rather than the .str accessor: a column with no answers at
# all is read as float64, and .str raises AttributeError on non-string columns.
for col in activity_columns:
    df[col] = df[col].map(_strip_or_na)

# Number of steps shown in the diagram
max_steps = 5

# For each respondent keep the first `max_steps` valid activities, in visit
# order, skipping entries that are not in activity_mapping.
# The previous look-ahead appended the next valid activity when it met an
# invalid entry and then appended it again once the outer loop reached that
# column, so [A, invalid, B, C, D, E] became [A, B, B, C, D]. Scanning the row
# once yields [A, B, C, D, E].
valid_activity_df = []
for visit in df[activity_columns].itertuples(index=False, name=None):
    steps = [activity_mapping[name] for name in visit if name in activity_mapping][:max_steps]

    # Pad short sequences with 'Na' so every row has exactly `max_steps` entries
    steps.extend(['Na'] * (max_steps - len(steps)))

    valid_activity_df.append(steps)

# Create a new dataframe with the valid activities (up to 5 activities)
valid_activity_df = pd.DataFrame(valid_activity_df, columns=[f"activity_{i+1}" for i in range(max_steps)])

# Remove rows where the first activity is 'Na' (this filters out empty or invalid rows).
# .copy() makes the result an independent frame, so assigning to its columns
# later does not write into a slice of the unfiltered frame.
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"].copy()

# For each of the five steps, append every activity's share of that column to its label
for stage in range(1, 6):
    col = f"activity_{stage}"
    activity_percentage = valid_activity_df[col].value_counts(normalize=True) * 100
    labelled = []
    for name in valid_activity_df[col]:
        if name == 'Na':
            labelled.append('Na')
        else:
            labelled.append(f"{name}<br>({activity_percentage.get(name, 0):.2f}%)")
    valid_activity_df[col] = labelled

# Every label present in the table, without "Na"
activities = [label for label in valid_activity_df.values.flatten().tolist() if label != "Na"]

# The diagram starts at the zone itself, followed by each distinct label
start_node = "Ancient Egypt"
nodes = [start_node] + list(dict.fromkeys(activities))

# Node label -> position; built before relabelling so the start node keeps its plain key
activity_to_index = dict(zip(nodes, range(len(nodes))))

# The start node is displayed with "(100%)"
nodes[0] = f"{start_node}<br>(100%)"

# Number of respondents per (source, target) transition
link_counts = Counter()

for _, visit in valid_activity_df.iterrows():
    path = [visit[f"activity_{stage}"] for stage in range(1, 6)]
    first = path[0]

    # Register any later-step label that is not indexed yet
    for label in path[1:]:
        if label != "Na" and label not in activity_to_index:
            activity_to_index[label] = len(activity_to_index)
            nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps, skipping any pair that involves "Na" or repeats itself
    for source, target in zip(path, path[1:]):
        if source != "Na" and target != "Na" and source != target:
            link_counts[(source, target)] += 1

# Keep only transitions whose two ends are real nodes
valid_links = [
    (source, target, count)
    for (source, target), count in link_counts.items()
    if source != "Na" and target != "Na"
]

# Plotly expects three parallel lists: source index, target index, flow size
sources = [activity_to_index[source] for source, _, _ in valid_links]
targets = [activity_to_index[target] for _, target, _ in valid_links]
values = [count for _, _, count in valid_links]

# Assemble the Sankey trace
sankey_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": nodes,
    },
    link={
        "source": sources,
        "target": targets,
        "value": values,
    },
)
fig = go.Figure(sankey_trace)

# Title and font size
fig.update_layout(title="Ancient Egypt Zone Activity Flow", font_size=12)

# Render the diagram
fig.show()


Theme Park Activity Flow from The Lost World

In [72]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Load the survey responses
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Define the list of valid Lost World activities (simplified names)
lost_world_activities = [
    "Rollercoasters", "Water-based rides", "F&B outlet", "Shop for souvenirs",
    "Water/restroom break", "Child-friendly rides", "Haunted House", "Sightseeing"
]

# Mapping from dataset activity names to simplified activity names for Lost World zone
activity_mapping = {
    "Go to the rollercoasters": "Rollercoasters",
    "Go to the Water-based rides": "Water-based rides",
    "Go to the F&B outlet": "F&B outlet",
    "Go and shop for souvenirs": "Shop for souvenirs",
    "Go for a water/restroom break": "Water/restroom break",
    "Go to the child-friendly rides": "Child-friendly rides",
    "Go to the Haunted House": "Haunted House",
    "Sightseeing": "Sightseeing"
}

# Survey columns holding each respondent's activities, in visit order
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity', 'fifth_activity',
    'sixth_activity', 'seventh_activity', 'eighth_activity', 'ninth_activity', 'tenth_activity',
    'eleventh_activity', 'twelfth_activity', 'thirteenth_activity', 'fourteenth_activity'
]


def _strip_or_na(value):
    """Return the stripped activity string, or 'Na' when the cell is not a string."""
    return value.strip() if isinstance(value, str) else 'Na'


# Clean the activity columns (strip and handle Na values).
# Series.map is used rather than the .str accessor: a column with no answers at
# all is read as float64, and .str raises AttributeError on non-string columns.
for col in activity_columns:
    df[col] = df[col].map(_strip_or_na)

# Number of steps shown in the diagram
max_steps = 5

# For each respondent keep the first `max_steps` valid activities, in visit
# order, skipping entries that are not in activity_mapping.
# The previous look-ahead appended the next valid activity when it met an
# invalid entry and then appended it again once the outer loop reached that
# column, so [A, invalid, B, C, D, E] became [A, B, B, C, D]. Scanning the row
# once yields [A, B, C, D, E].
valid_activity_df = []
for visit in df[activity_columns].itertuples(index=False, name=None):
    steps = [activity_mapping[name] for name in visit if name in activity_mapping][:max_steps]

    # Pad short sequences with 'Na' so every row has exactly `max_steps` entries
    steps.extend(['Na'] * (max_steps - len(steps)))

    valid_activity_df.append(steps)

# Create a new dataframe with the valid activities (up to 5 activities)
valid_activity_df = pd.DataFrame(valid_activity_df, columns=[f"activity_{i+1}" for i in range(max_steps)])

# Remove rows where the first activity is 'Na' (this filters out empty or invalid rows)
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"]

# Count how often each activity occurs anywhere in the table ("Na" excluded)
activities = valid_activity_df.to_numpy().ravel().tolist()
activity_counts = Counter(name for name in activities if name != "Na")

# All Lost World activities are kept, so the filter below uses the full list
top_activities = lost_world_activities


def _keep_listed_activity(name):
    """Return the activity unchanged if it is in top_activities, otherwise 'Na'."""
    return name if name in top_activities else 'Na'


# Blank out every cell that is not a listed activity
filtered_df = valid_activity_df.apply(lambda column: column.map(_keep_listed_activity))

# Every activity present in the table, without "Na"
activities = [name for name in filtered_df.values.flatten().tolist() if name != "Na"]

# The diagram starts at the zone itself, followed by each distinct activity
start_node = "The Lost World"
nodes = [start_node] + list(dict.fromkeys(activities))

# Node label -> position in the node list
activity_to_index = dict(zip(nodes, range(len(nodes))))

# Number of respondents per (source, target) transition
link_counts = Counter()

for _, visit in filtered_df.iterrows():
    first = visit['activity_1']

    # Steps 2-5 get a "2: ", "3: ", ... prefix so each stage has its own nodes
    later_steps = []
    for stage in (2, 3, 4, 5):
        name = visit[f"activity_{stage}"]
        later_steps.append(f"{stage}: {name}" if name != "Na" else "Na")

    # Register prefixed labels the first time they are seen
    for label in later_steps:
        if label != "Na" and label not in activity_to_index:
            activity_to_index[label] = len(activity_to_index)
            nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps, skipping any pair that involves "Na" or repeats itself
    path = [first] + later_steps
    for source, target in zip(path, path[1:]):
        if source != "Na" and target != "Na" and source != target:
            link_counts[(source, target)] += 1

# Total number of counted transitions, used as the base for link percentages
total_links = sum(link_counts.values())

# Keep only transitions whose two ends are real nodes
valid_links = [
    (source, target, count)
    for (source, target), count in link_counts.items()
    if source != "Na" and target != "Na"
]

# Plotly expects three parallel lists: source index, target index, flow size
sources = [activity_to_index[source] for source, _, _ in valid_links]
targets = [activity_to_index[target] for _, target, _ in valid_links]
values = [count for _, _, count in valid_links]

# Each link's share of all counted transitions, in percent
percentages = [count / total_links * 100 for count in values]

# Text shown on each link
link_labels = [f"{share:.1f}%" for share in percentages]

# Assemble the Sankey trace
sankey_trace = go.Sankey(
    node={
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": nodes,
    },
    link={
        "source": sources,
        "target": targets,
        "value": values,
        "label": link_labels,
    },
)
fig = go.Figure(sankey_trace)

# Title and font size
fig.update_layout(title="The Lost World Zone Activity Flow", font_size=12)

# Render the diagram
fig.show()


Theme Park Activity Flow from Far Far Away

In [None]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Read the survey responses into a dataframe
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Survey answer text -> short label used in the Far Far Away diagram
activity_mapping = {
    "Go to the rollercoasters": "Rollercoasters",
    "Go and shop for souvenirs": "Shop for souvenirs",
    "Go for a water/restroom break": "Water/restroom break",
    "Go to the child-friendly rides": "Child-friendly rides",
    "Go to the Haunted House": "Haunted House",
    "Go to the Simulator rides": "Simulator rides",
    "Sightseeing": "Sightseeing",
}

# Short labels accepted for the Far Far Away zone
far_far_away_activities = [
    "Rollercoasters",
    "Shop for souvenirs",
    "Water/restroom break",
    "Child-friendly rides",
    "Haunted House",
    "Simulator rides",
    "Sightseeing",
]

# Survey columns holding the 1st..14th reported activity, in order
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity',
    'fifth_activity', 'sixth_activity', 'seventh_activity', 'eighth_activity',
    'ninth_activity', 'tenth_activity', 'eleventh_activity', 'twelfth_activity',
    'thirteenth_activity', 'fourteenth_activity',
]

# Trim surrounding whitespace, then replace missing answers with the 'Na' placeholder
for column_name in activity_columns:
    stripped = df[column_name].str.strip()
    df[column_name] = stripped.fillna('Na')

# Collect, for every respondent, the first 5 Far Far Away activities they reported.
#
# All activity columns are scanned once, in order. Answers that are not keys of
# activity_mapping (including the 'Na' placeholder) are skipped, and each column
# is consumed at most once, so a single reported activity can never be recorded
# twice (which would otherwise show up as a spurious "A -> 2 - A" link in the
# Sankey diagram and push later activities out of the 5 available slots).
valid_activity_df = []

for index, row in df.iterrows():
    # Mapped labels of the valid answers, in reporting order, capped at 5
    activity_indices = [
        activity_mapping[row[col]]
        for col in activity_columns
        if row[col] in activity_mapping
    ][:5]

    # Pad with 'Na' so every row has exactly 5 entries
    activity_indices += ['Na'] * (5 - len(activity_indices))

    valid_activity_df.append(activity_indices)

# One row per respondent, columns activity_1 .. activity_5
valid_activity_df = pd.DataFrame(valid_activity_df, columns=[f"activity_{i+1}" for i in range(5)])

# Drop respondents with no valid activity at all (their first slot is the 'Na' padding)
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"]

# Frequency of every real (non-"Na") activity across the five slots
activities = valid_activity_df.values.flatten().tolist()
activity_counts = Counter(activity for activity in activities if activity != "Na")

# Labels allowed in the Far Far Away diagram
top_activities = far_far_away_activities

# Keep allowed labels as they are; anything else becomes the 'Na' placeholder
filtered_df = valid_activity_df.where(valid_activity_df.isin(top_activities), 'Na')

# Every non-"Na" label that survived the filter, row by row
activities = [
    activity
    for activity in filtered_df.values.flatten().tolist()
    if activity != "Na"
]

# Node list: the zone itself first, then each distinct label in order of first appearance
start_node = "Far Far Away"
nodes = [start_node, *dict.fromkeys(activities)]

# Label -> position in `nodes`
activity_to_index = dict(zip(nodes, range(len(nodes))))

# (source label, target label) -> number of respondents making that transition
link_counts = Counter()

for _, visit in filtered_df.iterrows():
    first = visit['activity_1']

    # Steps 2..5 get a "<step> - " prefix so each step has its own set of nodes;
    # the "Na" placeholder is left untouched
    later_steps = []
    for step in range(2, 6):
        label = visit[f"activity_{step}"]
        later_steps.append("Na" if label == "Na" else f"{step} - {label}")

    # Register prefixed labels the first time they are seen
    for label in later_steps:
        if label == "Na" or label in activity_to_index:
            continue
        activity_to_index[label] = len(activity_to_index)
        nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps: 1->2, 2->3, 3->4, 4->5, skipping pairs that touch "Na".
    # NOTE(review): the src != dst test looks like it can never be False, because
    # each later step carries its own "<step> - " prefix; confirm whether
    # repeated activities were meant to be skipped.
    path = [first, *later_steps]
    for src, dst in zip(path, path[1:]):
        if src != "Na" and dst != "Na" and src != dst:
            link_counts[(src, dst)] += 1

# Translate the label pairs into parallel lists of node indices and counts
sources, targets, values = [], [], []
for (src, dst), n_transitions in link_counts.items():
    if "Na" in (src, dst):  # placeholder links are not drawn
        continue
    sources.append(activity_to_index[src])
    targets.append(activity_to_index[dst])
    values.append(n_transitions)

# Node appearance; labels are taken from `nodes`, so sources/targets index into it
node_spec = dict(
    label=nodes,
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
)

# Link i runs from node sources[i] to node targets[i] with weight values[i]
link_spec = dict(source=sources, target=targets, value=values)

# Assemble the Sankey trace into a figure
sankey_trace = go.Sankey(node=node_spec, link=link_spec)
fig = go.Figure(sankey_trace)

# Title and font size for the whole figure
fig.update_layout(title="Far Far Away Zone Activity Flow", font_size=12)

# Render the diagram
fig.show()


New York

In [71]:
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

# Read the survey responses into a dataframe
file_path = '../data/Updated_Theme_Parks_Survey_Responses A14.csv'
df = pd.read_csv(file_path)

# Survey answer text -> short label used in the New York diagram
activity_mapping = {
    "Go to the F&B outlet": "F&B outlet",
    "Go and shop for souvenirs": "Shop for souvenirs",
    "Go for a water/restroom break": "Water/restroom break",
    "Go to the child-friendly rides": "Child-friendly rides",
    "Go to the Haunted House": "Haunted House",
    "Sightseeing": "Sightseeing",
    "Go to the Simulator rides": "Simulator rides",
}

# Short labels accepted for the New York zone
new_york_activities = [
    "F&B outlet",
    "Shop for souvenirs",
    "Water/restroom break",
    "Child-friendly rides",
    "Haunted House",
    "Sightseeing",
    "Simulator rides",
]

# Survey columns holding the 1st..14th reported activity, in order
activity_columns = [
    'first_activity', 'second_activity', 'third_activity', 'fourth_activity',
    'fifth_activity', 'sixth_activity', 'seventh_activity', 'eighth_activity',
    'ninth_activity', 'tenth_activity', 'eleventh_activity', 'twelfth_activity',
    'thirteenth_activity', 'fourteenth_activity',
]

# Trim surrounding whitespace, then replace missing answers with the 'Na' placeholder
for column_name in activity_columns:
    stripped = df[column_name].str.strip()
    df[column_name] = stripped.fillna('Na')

# Collect, for every respondent, the first 5 New York activities they reported.
#
# All activity columns are scanned once, in order. Answers that are not keys of
# activity_mapping (including the 'Na' placeholder) are skipped, and each column
# is consumed at most once, so a single reported activity can never be recorded
# twice (which would otherwise show up as a spurious "A -> 2 - A" link in the
# Sankey diagram and push later activities out of the 5 available slots).
valid_activity_df = []

for index, row in df.iterrows():
    # Mapped labels of the valid answers, in reporting order, capped at 5
    activity_indices = [
        activity_mapping[row[col]]
        for col in activity_columns
        if row[col] in activity_mapping
    ][:5]

    # Pad with 'Na' so every row has exactly 5 entries
    activity_indices += ['Na'] * (5 - len(activity_indices))

    valid_activity_df.append(activity_indices)

# One row per respondent, columns activity_1 .. activity_5
valid_activity_df = pd.DataFrame(valid_activity_df, columns=[f"activity_{i+1}" for i in range(5)])

# Drop respondents with no valid activity at all (their first slot is the 'Na' padding)
valid_activity_df = valid_activity_df[valid_activity_df['activity_1'] != "Na"]

# Frequency of every real (non-"Na") activity across the five slots
activities = valid_activity_df.values.flatten().tolist()
activity_counts = Counter(activity for activity in activities if activity != "Na")

# Labels allowed in the New York diagram
top_activities = new_york_activities

# Keep allowed labels as they are; anything else becomes the 'Na' placeholder
filtered_df = valid_activity_df.where(valid_activity_df.isin(top_activities), 'Na')

# Every non-"Na" label that survived the filter, row by row
activities = [
    activity
    for activity in filtered_df.values.flatten().tolist()
    if activity != "Na"
]

# Node list: the zone itself first, then each distinct label in order of first appearance
start_node = "New York"
nodes = [start_node, *dict.fromkeys(activities)]

# Label -> position in `nodes`
activity_to_index = dict(zip(nodes, range(len(nodes))))

# (source label, target label) -> number of respondents making that transition
link_counts = Counter()

for _, visit in filtered_df.iterrows():
    first = visit['activity_1']

    # Steps 2..5 get a "<step> - " prefix so each step has its own set of nodes;
    # the "Na" placeholder is left untouched
    later_steps = []
    for step in range(2, 6):
        label = visit[f"activity_{step}"]
        later_steps.append("Na" if label == "Na" else f"{step} - {label}")

    # Register prefixed labels the first time they are seen
    for label in later_steps:
        if label == "Na" or label in activity_to_index:
            continue
        activity_to_index[label] = len(activity_to_index)
        nodes.append(label)

    # Zone -> first activity
    if first != "Na":
        link_counts[(start_node, first)] += 1

    # Consecutive steps: 1->2, 2->3, 3->4, 4->5, skipping pairs that touch "Na".
    # NOTE(review): the src != dst test looks like it can never be False, because
    # each later step carries its own "<step> - " prefix; confirm whether
    # repeated activities were meant to be skipped.
    path = [first, *later_steps]
    for src, dst in zip(path, path[1:]):
        if src != "Na" and dst != "Na" and src != dst:
            link_counts[(src, dst)] += 1

# Translate the label pairs into parallel lists of node indices and counts
sources, targets, values = [], [], []
for (src, dst), n_transitions in link_counts.items():
    if "Na" in (src, dst):  # placeholder links are not drawn
        continue
    sources.append(activity_to_index[src])
    targets.append(activity_to_index[dst])
    values.append(n_transitions)

# Node appearance; labels are taken from `nodes`, so sources/targets index into it
node_spec = dict(
    label=nodes,
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
)

# Link i runs from node sources[i] to node targets[i] with weight values[i]
link_spec = dict(source=sources, target=targets, value=values)

# Assemble the Sankey trace into a figure
sankey_trace = go.Sankey(node=node_spec, link=link_spec)
fig = go.Figure(sankey_trace)

# Title and font size for the whole figure
fig.update_layout(title="New York Zone Activity Flow", font_size=12)

# Render the diagram
fig.show()
