In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
# Load the cleaned results table. Per the file name this should be the
# 1996-2016 Summer Olympics with team medals de-duplicated (not verified here).
olympic_data2 = pd.read_csv(
    '../data/clean/dataset_olympics_clean/teams_not_duplicated_summer_olympics_1996-2016_deduplicate_team_medals.csv'
)

In [None]:
# Hypothesis: Medalists under 22 years old are more likely to win again at the next Olympics (while they are still within their optimal age).

# Split the athlete rows into four cohorts: age (under 22 vs. 22 and older)
# crossed with medal status (any medal vs. 'No Medal').
is_under_22 = olympic_data2['Age'] < 22
is_22_or_older = olympic_data2['Age'] >= 22
has_medal = olympic_data2['Medal'] != 'No Medal'
has_no_medal = olympic_data2['Medal'] == 'No Medal'

group_under_22_medals = olympic_data2[is_under_22 & has_medal]
group_over_22_medals = olympic_data2[is_22_or_older & has_medal]
group_under_22_no_medals = olympic_data2[is_under_22 & has_no_medal]
group_over_22_no_medals = olympic_data2[is_22_or_older & has_no_medal]

# Function to calculate repeat medalists for a group
def calculate_repeat_medalists(group, olympic_data2, years_between=4):
    """Count how many entries of ``group`` medal again at the following games.

    Each row of ``group`` is treated as one entry. An entry is a "repeat
    medalist" when the same athlete (matched on ``Name`` and ``region``) has
    at least one row in ``olympic_data2`` whose ``Year`` equals the entry's
    ``Year`` plus ``years_between`` and whose ``Medal`` is not ``'No Medal'``.

    Every entry is counted at most once, so the repeat count can never exceed
    the group size. (A plain many-to-many merge on ``Name``/``region`` would
    count one hit per pair of current-event and next-event rows, which
    inflates the figure for athletes entered in several events.)

    Note that the denominator is the number of rows in ``group``; an athlete
    who appears in several events of the same games contributes one row per
    event.

    Parameters
    ----------
    group : pandas.DataFrame
        Subset of athlete rows to evaluate. Must contain the columns
        ``Name``, ``region`` and ``Year``.
    olympic_data2 : pandas.DataFrame
        Full results table used to look up the following games. Must contain
        ``Name``, ``region``, ``Year`` and ``Medal``.
    years_between : int, default 4
        Gap in years between one games and the next.

    Returns
    -------
    tuple
        ``(total_athletes, repeat_medalists, percentage_repeat_medalists)``;
        the percentage is ``0`` when ``group`` is empty.
    """
    total_athletes = len(group)
    if total_athletes == 0:
        return 0, 0, 0

    key_columns = ['Name', 'region', 'Year']

    # One row per (athlete, games) at which the athlete won at least one medal.
    medal_appearances = olympic_data2.loc[
        olympic_data2['Medal'] != 'No Medal', key_columns
    ].drop_duplicates()

    # Shift each medal-winning appearance back by one games interval so its
    # Year lines up with the *preceding* games, i.e. the Year stored in `group`.
    medal_appearances = medal_appearances.assign(
        Year=medal_appearances['Year'] - years_between
    )

    # The right-hand keys are unique, so the left merge keeps exactly one row
    # per group entry; `_merge == 'both'` marks entries with a medal next time.
    flagged = group[key_columns].merge(
        medal_appearances, on=key_columns, how='left', indicator=True
    )
    repeat_medalists = int((flagged['_merge'] == 'both').sum())

    percentage_repeat_medalists = (repeat_medalists / total_athletes) * 100
    return total_athletes, repeat_medalists, percentage_repeat_medalists

# Label -> cohort mapping. The "Over 22" cohorts are the ones filtered with
# Age >= 22, so they include athletes who are exactly 22.
groups = {
    "Under 22 with Medals": group_under_22_medals,
    "Over 22 with Medals": group_over_22_medals,
    "Under 22 without Medals": group_under_22_no_medals,
    "Over 22 without Medals": group_over_22_no_medals,
}

# Run the repeat-medalist calculation once per cohort and store the three
# returned values under descriptive keys.
results = {}
for label, cohort in groups.items():
    cohort_stats = calculate_repeat_medalists(cohort, olympic_data2)
    results[label] = dict(
        zip(
            ("Total Athletes", "Repeat Medalists", "Percentage Repeat Medalists"),
            cohort_stats,
        )
    )

# Print a short text report, one paragraph per cohort.
for label, summary in results.items():
    report_lines = [
        f"{label}:",
        f"  Total Athletes: {summary['Total Athletes']}",
        f"  Repeat Medalists: {summary['Repeat Medalists']}",
        f"  Percentage Repeat Medalists: {summary['Percentage Repeat Medalists']:.2f}%\n",
    ]
    print("\n".join(report_lines))

In [None]:


# Pull the per-cohort counts out of the `results` dictionary computed above.
group_names = list(results.keys())
total_athletes = [results[name]["Total Athletes"] for name in group_names]
repeat_medalists = [results[name]["Repeat Medalists"] for name in group_names]

# Slice labels and colours are the same for every chart.
labels = ['Repeat Medalists', 'Non-Repeat Medalists']
colors = ['#FCB131', '#0081C8']  # Yellow for repeat medalists, blue for the rest

# One pie chart per cohort: share of entries that medal again vs. those that do not.
for group, n_total, n_repeat in zip(group_names, total_athletes, repeat_medalists):
    sizes = [n_repeat, n_total - n_repeat]
    plt.figure(figsize=(6, 6))
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
    plt.title(f'Repeat Medalists Proportion: {group}')
    plt.show()

In [None]:
# Show the raw `results` dictionary as this cell's output.
results