**Geographic Factors Analysis**
- Hypothesis 1: The host country converts athlete participation into medals more efficiently.
- Hypothesis 2: Athletes from the host country win more medals.

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
# Load the three cleaned CSV files in one pass: the per-athlete event rows,
# the podium table (1996-2016 per its file name) and the NOC/region lookup.
olympic_data, olympic_podium, noc_regions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/clean/dataset_olympics_clean/athlete_events.csv',
        '../data/clean/dataset_olympics_clean/podium_1996-2016.csv',
        '../data/clean/dataset_olympics_clean/noc_regions.csv',
    )
)

In [None]:
# The number of participating countries and athletes is inconsistent before
# 1996, so the analysis is restricted to Summer Games from 1996 onwards.
condition_year_1996 = olympic_data["Year"] >= 1996
condition_summer = olympic_data["Season"] == "Summer"

# .copy() detaches the filtered rows from the raw frame. Later cells assign
# to columns of olympic_data ('City', 'Medal'); without the copy pandas treats
# those assignments as writes to a slice and emits SettingWithCopyWarning.
olympic_data = olympic_data[condition_year_1996 & condition_summer].copy()

In [None]:
# Host-city spellings to normalise: local-language name -> English name.
# Any city that is not a key of this mapping is left exactly as it is.
corrections = dict([
    ("Athina", "Athens"),
    ("Roma", "Rome"),
    ("Moskva", "Moscow"),
    ("Sankt Moritz", "St. Moritz"),
])

olympic_data['City'] = olympic_data['City'].map(
    lambda city: corrections.get(city, city)
)

In [None]:
# A missing Medal value means the athlete did not win a medal in that event.
# Label those rows "No Medal" so they can be counted like any other category.
# fillna does this in one vectorised call instead of a Python lambda per row.
olympic_data["Medal"] = olympic_data["Medal"].fillna("No Medal")

In [None]:
# Keep one copy of every fully identical row, then print how many duplicated
# rows are still present as a sanity check.
olympic_data = olympic_data.drop_duplicates()
remaining_duplicates = olympic_data.duplicated().sum()
print(remaining_duplicates)

In [None]:
# Attach the region name of each NOC to the athlete rows. The left join keeps
# every athlete row even when its NOC has no entry in noc_regions (Region is
# NaN for those rows). The lookup column 'region' is exposed as 'Region'.
olympic_data = (
    olympic_data
    .merge(noc_regions.loc[:, ['NOC', 'region']], how='left', on='NOC')
    .rename(columns={'region': 'Region'})
)

In [None]:
# Hypothesis: Efficiency in turning athlete participation into medals will
# increase in the host country.
#
# medal_data holds one row per (Year, NOC, City, Region) with athlete counts.
# Medal performance is measured in ATHLETES, not medals: in team events several
# athletes contribute to one medal, so we count the athletes who won.
#
# Every count below is a number of DISTINCT athletes (unique 'ID'):
#   Athlete_Count        - athletes who competed
#   Athlete_Gold_Count   - athletes with at least one gold (same for silver/bronze)
#   Athlete_N_Count      - athletes who won no medal at all
#   Athlete_Medal_Count  - athletes with at least one medal of any colour
# Counting rows instead (one row per athlete per event) would count an athlete
# with several medals several times while Athlete_Count counts them once,
# inflating the conversion rate. Because an athlete can hold several colours,
# Athlete_Medal_Count may be smaller than Gold + Silver + Bronze.
# NOTE(review): groupby drops rows whose Region is NaN (NOC missing from
# noc_regions) by default - confirm that this is intended.

group_keys = ['Year', 'NOC', 'City', 'Region']
flag_columns = ['won_gold', 'won_silver', 'won_bronze']

# Step 1: collapse to one row per athlete per group, flagging each medal colour
# the athlete won in at least one event.
per_athlete = (
    olympic_data
    .assign(
        won_gold=olympic_data['Medal'].eq('Gold'),
        won_silver=olympic_data['Medal'].eq('Silver'),
        won_bronze=olympic_data['Medal'].eq('Bronze'),
    )
    .groupby(group_keys + ['ID'])[flag_columns]
    .any()
)
per_athlete['won_any'] = per_athlete[flag_columns].any(axis=1)
per_athlete['won_none'] = ~per_athlete['won_any']

# Step 2: count athletes per group (summing a boolean flag counts the Trues).
medal_data = (
    per_athlete
    .groupby(level=group_keys)
    .agg(
        Athlete_Count=('won_any', 'size'),
        Athlete_Gold_Count=('won_gold', 'sum'),
        Athlete_Silver_Count=('won_silver', 'sum'),
        Athlete_Bronze_Count=('won_bronze', 'sum'),
        Athlete_N_Count=('won_none', 'sum'),
        Athlete_Medal_Count=('won_any', 'sum'),
    )
    .reset_index()
)

# Conversion rates: share of competing athletes who won any medal / a gold.
medal_data['Medal_Total_Performance'] = (
    medal_data['Athlete_Medal_Count'] / medal_data['Athlete_Count']
).round(2)
medal_data['Medal_Gold_Performance'] = (
    medal_data['Athlete_Gold_Count'] / medal_data['Athlete_Count']
).round(2)


medal_data

In [None]:
# Inspect the aggregated rows whose Region is 'Brazil'.
brazil = medal_data.loc[medal_data['Region'].eq('Brazil')]
brazil

In [None]:
# Inspect the aggregated rows whose Region is 'UK'.
uk = medal_data.loc[medal_data['Region'].eq('UK')]
uk

In [None]:
# Inspect the aggregated rows whose Region is 'Greece'.
greece = medal_data.loc[medal_data['Region'].eq('Greece')]
greece

In [None]:
# Inspect the aggregated rows whose Region is 'Australia'.
aus = medal_data.loc[medal_data['Region'].eq('Australia')]
aus

In [None]:
# Medal conversion rate of each host nation: host Games vs. the Games before.
# Hosts: USA 1996, AUS 2000, GRE 2004, CHN 2008, GBR 2012, BRA 2016
# Each entry: (display name, previous Games year, host year).
host_region_years = [
    ('Brazil', 2012, 2016),
    ('UK', 2008, 2012),
    ('China', 2004, 2008),
    ('Greece', 2000, 2004),
    ('Australia', 1996, 2000)
]

# Rows are selected by NOC code, not by Region. Region is a label joined from
# noc_regions and more than one NOC may carry the same label (presumably e.g.
# Hong Kong under 'China' - verify); matching on Region would then add another
# team's rate to the host's rate.
region_to_noc = {
    'Brazil': 'BRA',
    'UK': 'GBR',
    'China': 'CHN',
    'Greece': 'GRE',
    'Australia': 'AUS',
}


def _medal_conversion_pct(noc, year):
    """Return medal-winning athletes / competing athletes in percent.

    Computed from the raw counts instead of the pre-rounded
    Medal_Total_Performance column, so the second decimal is meaningful.
    Returns 0.0 when medal_data has no athletes for this NOC and year.
    """
    rows = medal_data[(medal_data['NOC'] == noc) & (medal_data['Year'] == year)]
    athletes = rows['Athlete_Count'].sum()
    if athletes == 0:
        return 0.0
    return round(float(rows['Athlete_Medal_Count'].sum() / athletes * 100), 2)


# One table row per host nation.
host_data_list = []

for region, non_host_year, host_year in host_region_years:
    noc = region_to_noc[region]

    non_host_percentage = _medal_conversion_pct(noc, non_host_year)
    host_percentage = _medal_conversion_pct(noc, host_year)
    # Difference of two percentages, i.e. percentage points.
    change_pp = round(host_percentage - non_host_percentage, 2)

    host_data_list.append({'Country (Host Year)': f'{region} ({host_year})',
                           'Non-Host': f'{non_host_percentage}%',
                           'Host': f'{host_percentage}%',
                           'Change (pp)': f'{change_pp} pp'})

# Convert list to DataFrame
host_data_medal_rate = pd.DataFrame(host_data_list)

host_data_medal_rate

In [None]:
# Presentation-only styling for the conversion-rate table: blue header with
# white bold text, light grey on even body rows, bordered and padded cells.
header_style = {
    'selector': 'thead th',
    'props': [('background-color', '#0081C8'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')],
}
even_row_style = {
    'selector': 'tbody tr:nth-child(even)',
    'props': [('background-color', '#f9f9f9')],
}
cell_style = {
    'selector': 'tbody td',
    'props': [('border', '1px solid black'), ('padding', '5px')],
}

styled_table = (
    host_data_medal_rate.style
    .set_properties(**{'text-align': 'center'})
    .set_table_styles([header_style, even_row_style, cell_style])
    .set_caption("**Medal Conversion Rate Comparison**")
)

styled_table

In [None]:
# One subplot per host nation: athletes who participated vs. athletes who won
# a medal at each Games, with the nation's host year highlighted.

# NOC codes, their display names (parallel list) and the Games to plot.
noc_of_interest = ['BRA', 'GBR', 'CHN', 'GRE', 'AUS']
region = ['Brazil', 'UK', 'China', 'Greece', 'Australia']
olympic_years = [2016, 2012, 2008, 2004, 2000]

# Year each NOC hosted, as a string because 'Year' is converted to str below.
host_year_by_noc = {
    'BRA': '2016',
    'GBR': '2012',
    'CHN': '2008',
    'GRE': '2004',
    'AUS': '2000',
}

# .copy() so that the 'Year' assignment below writes to an independent frame
# rather than to a slice of medal_data (avoids SettingWithCopyWarning).
filtered_data = medal_data[
    medal_data['NOC'].isin(noc_of_interest) & medal_data['Year'].isin(olympic_years)
].copy()

# String years make matplotlib treat the x-axis as evenly spaced categories.
filtered_data['Year'] = filtered_data['Year'].astype(str)

# Define Olympic colors
colors = {
    'BRA': '#EE334E',
    'GBR': '#0081C8',
    'CHN': '#FCB131',
    'GRE': '#00A651',
    'AUS': '#000000'
}

# Create subplots
fig, axes = plt.subplots(nrows=len(noc_of_interest), ncols=1, figsize=(8, 12), sharex=True, sharey=False)

for ax, noc, region_name in zip(axes, noc_of_interest, region):
    noc_data = filtered_data[filtered_data['NOC'] == noc]

    # Athletes who participated, in the nation's Olympic colour
    ax.plot(noc_data['Year'], noc_data['Athlete_Count'], color=colors[noc], linewidth=2, label='Athletes Participated')

    # Athletes who won a medal, as a dashed grey line
    ax.plot(noc_data['Year'], noc_data['Athlete_Medal_Count'], color='grey', linestyle='dashed', linewidth=2, label='Athletes who won Medal')

    # Mark and label the Games this nation hosted (if present in the data)
    host_year = host_year_by_noc.get(noc)
    host_rows = noc_data[noc_data['Year'] == host_year]
    if not host_rows.empty:
        host_athletes = host_rows['Athlete_Count'].values[0]
        ax.scatter(host_year, host_athletes, color=colors[noc], zorder=5)
        ax.text(host_year, host_athletes, 'Host Year',
                color=colors[noc], fontsize=10, ha='right', va='bottom')

    ax.set_title(f'{region_name} Medal Performance Efficiency', fontsize=10)
    ax.set_ylabel('Count', fontsize=8)
    ax.legend(fontsize=8)
    ax.tick_params(axis='both', labelsize=8)
    ax.yaxis.set_visible(True)  # Ensure Y-axis is visible for each chart

    # Y ticks every 100 athletes, from 0 up to just past the current top limit
    _, y_max = ax.get_ylim()
    ax.set_yticks(np.arange(0, y_max + 100, 100))

    # Optional export:
    # fig.savefig(f'{noc}_{region_name}_Medal_Performance.png', dpi=300)

# The x-axis is shared, so only the bottom subplot carries the label
axes[-1].set_xlabel('Year', fontsize=10)

# Adjust layout for clarity
plt.tight_layout()
plt.show()


In [None]:
# Decline in medal performance when a country hosts the Olympics
# More athletes from the host nation compete, but this does not necessarily mean more medals are won at the same rate.
# -> Lower Average Medal Conversion Rate

In [None]:
# Gold conversion rate of each host nation: host Games vs. the Games before.
# Hosts: USA 1996, AUS 2000, GRE 2004, CHN 2008, GBR 2012, BRA 2016
# Each entry: (display name, previous Games year, host year).
host_region_years = [
    ('Brazil', 2012, 2016),
    ('UK', 2008, 2012),
    ('China', 2004, 2008),
    ('Greece', 2000, 2004),
    ('Australia', 1996, 2000)
]

# Rows are selected by NOC code, not by Region. Region is a label joined from
# noc_regions and more than one NOC may carry the same label (presumably e.g.
# Hong Kong under 'China' - verify); matching on Region would then add another
# team's rate to the host's rate.
gold_region_to_noc = {
    'Brazil': 'BRA',
    'UK': 'GBR',
    'China': 'CHN',
    'Greece': 'GRE',
    'Australia': 'AUS',
}


def _gold_conversion_pct(noc, year):
    """Return gold-winning athletes / competing athletes in percent.

    Computed from the raw counts instead of the pre-rounded
    Medal_Gold_Performance column, so the second decimal is meaningful.
    Returns 0.0 when medal_data has no athletes for this NOC and year.
    """
    rows = medal_data[(medal_data['NOC'] == noc) & (medal_data['Year'] == year)]
    athletes = rows['Athlete_Count'].sum()
    if athletes == 0:
        return 0.0
    return round(float(rows['Athlete_Gold_Count'].sum() / athletes * 100), 2)


# One table row per host nation.
host_data_list = []

for region, non_host_year, host_year in host_region_years:
    noc = gold_region_to_noc[region]

    non_host_percentage = _gold_conversion_pct(noc, non_host_year)
    host_percentage = _gold_conversion_pct(noc, host_year)
    # Difference of two percentages, i.e. percentage points.
    change_pp = round(host_percentage - non_host_percentage, 2)

    host_data_list.append({'Country (Host Year)': f'{region} ({host_year})',
                           'Non-Host': f'{non_host_percentage}%',
                           'Host': f'{host_percentage}%',
                           'Change (pp)': f'{change_pp} pp'})

# Convert list to DataFrame
host_data_gold_rate = pd.DataFrame(host_data_list)

host_data_gold_rate

In [None]:
# Hypothesis: Athletes from the host country win more medals.
# Keep only the podium rows of the six nations that hosted in 1996-2016.
filtered_df = olympic_podium[olympic_podium['NOC'].isin(['USA', 'AUS', 'GRE', 'CHN', 'GBR', 'BRA'])]

# Count podium rows per (Year, NOC, region, City) and medal colour, then pivot
# the colours into columns (0 where a colour does not occur). The whole table
# is built in one chain: Total and the string Year are added with assign()
# rather than by assigning onto a column-subset selection, which pandas flags
# as a chained assignment.
olympic_podium_agg = (
    filtered_df
    .groupby(['Year', 'NOC', 'region', 'City', 'Medal'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={'region': 'Region'})
    .loc[:, ['Year', 'NOC', 'Region', 'City', 'Gold', 'Silver', 'Bronze']]
    .assign(
        # Total medals = Gold + Silver + Bronze
        Total=lambda frame: frame[['Gold', 'Silver', 'Bronze']].sum(axis=1),
        # Year as string so plots treat it as a category
        Year=lambda frame: frame['Year'].astype(str),
    )
)

olympic_podium_agg.head(10)

In [None]:
# Line chart of total medals per Games for every nation in olympic_podium_agg,
# with a coloured, labelled marker on the Games each nation hosted.

# Host nation (NOC code) of each Summer Games in the data.
hosting_info = {
    1996: 'USA',
    2000: 'AUS',
    2004: 'GRE',
    2008: 'CHN',
    2012: 'GBR',
    2016: 'BRA'
}

colors = {
    'BRA': '#EE334E',
    'GBR': '#0081C8',
    'CHN': '#FCB131',
    'GRE': '#00A651',
    'AUS': '#000000',
    'USA': '#818589'
}

# Text drawn next to each host marker.
host_labels = {
    'BRA': 'Brazil',
    'GBR': 'UK',
    'CHN': 'China',
    'GRE': 'Greece',
    'AUS': 'Australia',
    'USA': 'USA'
}

# olympic_podium_agg stores Year as strings, so look hosts up as NOC -> 'YYYY'.
host_year_by_noc = {noc: str(year) for year, noc in hosting_info.items()}

plt.figure(figsize=(10, 6))

for noc in olympic_podium_agg['NOC'].unique():
    noc_data = olympic_podium_agg[olympic_podium_agg['NOC'] == noc]

    # Every nation gets the same grey line; only the host year is coloured
    plt.plot(noc_data['Year'], noc_data['Total'], color='grey', alpha=0.6, linewidth=2)

    # Skip nations without a host year in the data
    host_year = host_year_by_noc.get(noc)
    host_rows = noc_data[noc_data['Year'] == host_year]
    if host_rows.empty:
        continue

    host_total = host_rows['Total'].values[0]
    plt.scatter(host_year, host_total, color=colors[noc], zorder=5)
    plt.text(host_year, host_total, host_labels[noc], color=colors[noc], fontsize=10, ha='right', va='bottom')

# Adding labels and title
plt.xlabel('Year')
plt.ylabel('Total Medal Count')
plt.title('Number of Olympic Medals for Host Country')

plt.tight_layout()
plt.show()

In [None]:
# Inspect the podium totals whose Region is 'Australia'.
aus_m = olympic_podium_agg.loc[olympic_podium_agg['Region'].eq('Australia')]
aus_m

In [None]:
# Inspect the podium totals whose Region is 'Greece'.
greece_m = olympic_podium_agg.loc[olympic_podium_agg['Region'].eq('Greece')]
greece_m

In [None]:
# Inspect the podium totals of NOC 'CHN'.
china_m = olympic_podium_agg.loc[olympic_podium_agg['NOC'].eq('CHN')]
china_m

In [None]:
# Inspect the podium totals of NOC 'BRA'.
brazil_m = olympic_podium_agg.loc[olympic_podium_agg['NOC'].eq('BRA')]
brazil_m

In [None]:
# Inspect the podium totals of NOC 'GBR'.
uk_m = olympic_podium_agg.loc[olympic_podium_agg['NOC'].eq('GBR')]
uk_m

In [None]:
# Compare each host nation's medal total at its home Games with its total at
# the Games immediately before, and express the difference in percent.

# NOC codes of the host nations to compare
noc_of_interest = ['BRA', 'GBR', 'CHN', 'GRE', 'AUS']

# (region, previous Games year, host year); years are strings because
# olympic_podium_agg stores 'Year' as str.
host_region_years = [
    ('Brazil', "2012", "2016"),
    ('UK', "2008", "2012"),
    ('China', "2004", "2008"),
    ('Greece', "2000", "2004"),
    ('Australia', "1996", "2000")
]

olympic_podium_filtered = olympic_podium_agg.loc[olympic_podium_agg['NOC'].isin(noc_of_interest)]


def _podium_total(region_name, year):
    """Sum 'Total' over the filtered podium rows of one region and one year."""
    wanted = (
        (olympic_podium_filtered['Year'] == year)
        & (olympic_podium_filtered['Region'] == region_name)
    )
    return olympic_podium_filtered.loc[wanted, 'Total'].sum()


host_change_data = []

for region, non_host_year, host_year in host_region_years:
    non_host_total = _podium_total(region, non_host_year)
    host_total = _podium_total(region, host_year)

    # Relative change vs. the previous Games; 0 when there is no baseline
    if non_host_total == 0:
        change_percent = 0
    else:
        change_percent = round((host_total - non_host_total) / non_host_total * 100, 0)

    host_change_data.append({
        'Country (Host Year)': f'{region} ({host_year})',
        'Non-Host': non_host_total,
        'Host': host_total,
        'Change (%)': f'{change_percent}%'
    })

olympic_podium_host_change = pd.DataFrame(host_change_data)

# Show the new DataFrame
olympic_podium_host_change


In [None]:
# Dumbbell chart: for every host nation, one point for the medal total at the
# previous Games and one for the home Games, joined by an arrow.
df = pd.DataFrame(olympic_podium_host_change)

# Set up the figure and axes
plt.figure(figsize=(10, 6))

# The two ends of each dumbbell
sns.scatterplot(x='Non-Host', y='Country (Host Year)', data=df, color='#0081C8', s=250, label='Non-Host', edgecolor='w', linewidth=2)
sns.scatterplot(x='Host', y='Country (Host Year)', data=df, color='#FCB131', s=250, label='Host', edgecolor='w', linewidth=2)

for index, row in df.iterrows():
    country = row['Country (Host Year)']

    # Connecting line between the Non-Host and Host points
    plt.plot([row['Non-Host'], row['Host']], [country, country], color='gray', lw=2)

    # Arrow pointing from Non-Host to Host. The y position is the row index,
    # which matches the categorical y position because df has a default
    # 0..n-1 index in the same order as the plotted categories.
    plt.annotate('', xy=(row['Host'], index), xytext=(row['Non-Host'], index),
                 arrowprops=dict(arrowstyle='->', color='gray', lw=2, shrinkA=0))

    # Label "Host (+Change %)". Only prepend '+' when the change is not
    # negative; a fixed '+' would render decreases as '+-5.0%'.
    change_text = str(row['Change (%)'])
    signed_change = change_text if change_text.startswith('-') else f'+{change_text}'
    label = f"{row['Host']} ({signed_change})"
    plt.text(row['Host'] + 1, country, label, ha='left', va='center', fontsize=12)

# Customize plot
plt.title('Change in number of medals won by host country, compared with prior Olympic')
plt.xlabel('Total Medal Count')
plt.ylabel('Country (Host Year)')
plt.grid(True)
plt.tight_layout()
# plt.savefig("Host_change_medals_won.png", format='png', dpi=300)

# Show the plot
plt.show()

In [None]:
# Inputs for the ridgeline chart (olympic_podium_agg must already exist).
df2 = pd.DataFrame(olympic_podium_agg)

# NOCs to plot, one subplot each, in this top-to-bottom order
noc_of_interest = ['BRA', 'GBR', 'CHN', 'GRE', 'AUS']

# Fill colours of the stacked layers, in Gold / Silver / Bronze order
gold_color, silver_color, bronze_color = '#f0c05a', 'lightgray', '#a97142'
colors = [gold_color, silver_color, bronze_color]

# Host year (as a string, like df2['Year']) -> NOC that hosted that Games
hosting_info = dict(zip(
    ("2000", "2004", "2008", "2012", "2016"),
    ('AUS', 'GRE', 'CHN', 'GBR', 'BRA'),
))

def plot_ridgeline_chart(df2, noc_of_interest, colors, hosting_info):
    """Plot one stacked Gold/Silver/Bronze area chart per NOC, stacked vertically.

    Args:
        df2: DataFrame with the columns 'NOC', 'Year', 'Region', 'Gold',
            'Silver' and 'Bronze'.
        noc_of_interest: NOC codes to plot; one subplot each, top to bottom.
        colors: Fill colours for the Gold, Silver and Bronze layers, in that
            order.
        hosting_info: Mapping of ``str(year)`` to the NOC that hosted that
            year; the matching year is marked on that NOC's subplot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if len(noc_of_interest) == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    # squeeze=False always yields a 2-D array of Axes, so a single NOC works
    # the same way as several.
    fig, axes = plt.subplots(len(noc_of_interest), 1, figsize=(12, 10),
                             sharex=True, sharey=False, squeeze=False)
    axes = axes[:, 0]
    fig.suptitle('Olympic Medal Trends by Host Country', fontsize=16, fontweight='bold')

    # Adjust the spacing around subplots (if needed)
    fig.subplots_adjust(hspace=0.3)

    for ax, noc in zip(axes, noc_of_interest):
        noc_data = df2[df2['NOC'] == noc]

        # Fall back to the NOC code as the label when df2 has no rows for it.
        region_label = noc_data['Region'].values[0] if not noc_data.empty else noc

        if not noc_data.empty:
            years = noc_data['Year'].values
            # Rows: Gold, Silver, Bronze; columns: one per Games (no smoothing)
            data = np.stack(
                [noc_data['Gold'].values, noc_data['Silver'].values, noc_data['Bronze'].values],
                axis=0,
            )

            ax.stackplot(years, data, labels=['Gold', 'Silver', 'Bronze'], colors=colors, alpha=0.8)

            # Mark the Games this NOC hosted and print the medal counts inside
            # the stacked layers.
            for year_index, year in enumerate(years):
                if hosting_info.get(str(year)) != noc:
                    continue

                ax.axvline(x=year, color='grey', linestyle='--', label=f'Hosting Year ({year})')

                gold, silver, bronze = data[:, year_index]
                # Each label sits at the vertical middle of its own layer.
                ax.text(year, gold / 2, f'Gold: {int(gold)}', ha='center', fontsize=10, color='black', fontweight='bold')
                ax.text(year, gold + silver / 2, f'Silver: {int(silver)}', ha='center', fontsize=10, color='black')
                ax.text(year, gold + silver + bronze / 2, f'Bronze: {int(bronze)}', ha='center', fontsize=10, color='black')

        # Remove the frame and y-axis
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.set_yticks([])

        # Region name to the left of the chart, vertically centred
        ax.annotate(
            f'{region_label}',
            xy=(0.0, 0.5),
            xycoords='axes fraction',
            ha='right',
            va='center',
            fontsize=12,
            color='black',
            fontweight='bold'
        )

    # The x-axis is shared; label the bottom subplot explicitly instead of
    # relying on the loop variable left over from the last iteration.
    axes[-1].set_xlabel('Year', fontsize=12)
    # Adjust layout to ensure titles and labels fit
    plt.tight_layout()
    plt.show()

# Render the ridgeline chart for the selected host nations.
plot_ridgeline_chart(
    df2=df2,
    noc_of_interest=noc_of_interest,
    colors=colors,
    hosting_info=hosting_info,
)