In [None]:
import pandas as pd 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Read the two cleaned tables from the clean-data/ directory.
bios_clean = pd.read_csv('clean-data/bios.csv')
results_clean = pd.read_csv('clean-data/results.csv')

# Print a heading followed by the first rows of each table as a quick check
# that both files were read.
for heading, frame in (
    ("Bios Clean Data Head:", bios_clean),
    ("\nResults Clean Data Head:", results_clean),
):
    print(heading)
    print(frame.head())


In [None]:
# One entry per histogram:
#   (bios_clean column, plot title, x-axis label, bar colour, output file).
# A colour of None leaves seaborn's default colour in place.
distribution_specs = [
    ('height_cm', 'Distribution of Athlete Height (cm)', 'Height (cm)',
     None, 'height_distribution.png'),
    ('weight_kg', 'Distribution of Athlete Weight (kg)', 'Weight (kg)',
     'orange', 'weight_distribution.png'),
]

for column, title, x_label, bar_color, out_file in distribution_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Missing values are dropped before plotting; kde=True overlays a
    # kernel density estimate on the histogram.
    sns.histplot(bios_clean[column].dropna(), kde=True, color=bar_color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    fig.savefig(out_file)  # Save plot to the working directory before showing it
    plt.show()


In [None]:
# Medal counts by NOC.
# count() tallies the non-null 'medal' entries within each 'noc' group.
# NOTE(review): this presumes rows without a medal hold a missing value in
# 'medal' -- confirm against the cleaning step.
medals_per_noc = results_clean.groupby('noc')['medal'].count()

# Keep the ten largest counts, ordered from highest to lowest.
medal_counts = medals_per_noc.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x=medal_counts.index, y=medal_counts.values, ax=ax)
ax.set_title('Top 10 NOCs by Medal Count')
ax.set_xlabel('National Olympic Committee (NOC)')
ax.set_ylabel('Medal Count')
fig.savefig('top_n_nocs_medals.png')  # Save plot to the working directory
plt.show()

# Medals over time (example for Gold medals).
# Select the rows whose 'medal' value is exactly 'Gold', then count the rows
# in each 'year' group.
is_gold = results_clean['medal'].eq('Gold')
gold_medals_over_time = results_clean.loc[is_gold].groupby('year').size()

fig, ax = plt.subplots(figsize=(12, 7))
# Draw one point per year and join the points with a line.
gold_medals_over_time.plot.line(marker='o', ax=ax)
ax.set_title('Gold Medals Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Gold Medals')
ax.grid(True)
fig.savefig('gold_medals_over_time.png')  # Save plot to the working directory
plt.show()
