In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned Olympics CSV into a DataFrame.
# NOTE(review): the file name suggests de-duplicated team medals for the
# 1996-2016 Summer Games -- confirm against the data-cleaning step.
olympic_data2 = pd.read_csv(
    filepath_or_buffer="../data/clean/dataset_olympics_clean/teams_not_duplicated_summer_olympics_1996-2016_deduplicate_team_medals.csv"
)

In [None]:
# Palette used by the charts in this notebook: color name -> hex code.
olympic_colors = dict([
    ("blue", "#0081C8"),
    ("yellow", "#FCB131"),
    ("black", "#000000"),
    ("green", "#00A651"),
    ("red", "#EE334E"),
])

In [None]:
# Hypothesis: younger athletes are more likely to win medals.

# Keep only rows that carry an actual medal: drop missing values as well as
# the explicit 'No Medal' placeholder.
medalists = olympic_data2.loc[
    lambda rows: rows['Medal'].notna() & rows['Medal'].ne('No Medal')
]

# Median medalist age; drawn as a reference line on the histogram below.
median_age = medalists['Age'].median()

# Histogram (with KDE overlay) of medalist ages.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=medalists,
    x='Age',
    bins=20,
    kde=True,
    color=olympic_colors["blue"],
    ax=ax,
)
ax.axvline(
    median_age,
    color=olympic_colors["red"],
    linestyle='dashed',
    linewidth=2,
    label=f'Median Age: {median_age}',
)
ax.set_xlabel("Age of Medalists", color=olympic_colors["black"])
ax.set_ylabel("Frequency", color=olympic_colors["black"])
ax.set_title("Distribution of Medalist Ages", color=olympic_colors["black"])
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Number of medalist rows per age: one row per distinct age, with the count
# stored in the 'Medal Count' column.
# NOTE(review): these are raw counts, not medal rates -- they are not
# normalised by how many athletes competed at each age. Confirm this is the
# intended measure for the "younger athletes win more" hypothesis.
medal_counts = (
    medalists
    .groupby('Age')
    .size()
    .reset_index(name='Medal Count')
)

# Scatter of medal count against age with a fitted linear regression line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=medal_counts,
    x='Age',
    y='Medal Count',
    scatter_kws={'s': 50, 'color': olympic_colors["yellow"]},
    line_kws={"color": olympic_colors["red"]},  # regression line
    ax=ax,
)
ax.set_xlabel("Age of Athletes", color=olympic_colors["black"])
ax.set_ylabel("Number of Medals Won", color=olympic_colors["black"])
ax.set_title("Relationship Between Age and Medals Won", color=olympic_colors["black"])
ax.grid(axis='both', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Correlation analysis between athlete age and number of medals won.
# Pairwise Pearson correlation over the columns of medal_counts; the bare
# expression is the cell's displayed output.
medal_counts.corr(method='pearson')

In [None]:
# Understanding outliers from the median peak age.

# Mean and median medalist age for each sport (one row per sport, with
# 'Sport', 'mean' and 'median' columns).
sport_age_stats = (
    medalists
    .groupby('Sport')['Age']
    .agg(['mean', 'median'])
    .reset_index()
)

# Bare last expression instead of print(): the notebook renders the frame as
# a rich table, matching how the correlation matrix is displayed. Keep this
# as the final statement of the cell so it is shown.
sport_age_stats