In [73]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [74]:
data = pd.read_csv("olympic_results.csv")
data_clean = data.drop(columns = ['participant_type', 'athletes', 'rank_equal', 'country_code', 'athlete_url', 'athlete_full_name', 'value_unit', 'value_type'])
# split 'slug_game' into 'city' and 'year'
data_clean[['city', 'year']] = data['slug_game'].str.rsplit('-', n=1, expand=True)
# drop 'slug_game' column
data_clean = data_clean.drop(columns=['slug_game'])
# convert year to int
data_clean['year'] = data_clean['year'].astype(int)
# define known summer and winter olympics years
summer_years = set(range(1896, 2025, 4)) - {1916, 1940, 1944}  # excluded cancelled years
winter_years = set(range(1924, 2025, 4)) | {1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022} - {1940, 1944}
# assign season
data_clean['season'] = data_clean['year'].apply(
    lambda y: 'summer' if y in summer_years else 'winter' if y in winter_years else 'unknown'
)
# add column for gender (men, women, mixed)
data_clean['gender'] = data_clean['event_title'].str.extract(r'(?i)(men|women|mixed)')[0].str.lower()
# add column for team/doubles/individual event
conditions = [
    data_clean['event_title'].str.contains(r'doubles', case=False, na=False),
    data_clean['event_title'].str.contains(r'team', case=False, na=False)
]
choices = ['doubles', 'team']
data_clean['event_type'] = np.select(conditions, choices, default='individual')

# medal information dataframe
# filter only rows where a medal was awarded
medals_only = data_clean[data_clean['medal_type'].notna()].copy()
# group and aggregate counts
medals_df = medals_only.groupby('country_name').agg(
    total_medals=('medal_type', 'count'),
    total_gold_medals=('medal_type', lambda x: (x.str.lower() == 'gold').sum()),
    total_silver_medals=('medal_type', lambda x: (x.str.lower() == 'silver').sum()),
    total_bronze_medals=('medal_type', lambda x: (x.str.lower() == 'bronze').sum()),
    total_summer_medals=('season', lambda x: (x == 'summer').sum()),
    total_winter_medals=('season', lambda x: (x == 'winter').sum()),
    total_team_medals=('event_type', lambda x: (x == 'team').sum()),
    total_doubles_medals=('event_type', lambda x: (x == 'doubles').sum()),
    total_individual_medals=('event_type', lambda x: (x == 'individual').sum()),
    total_mens_medals=('gender', lambda x: (x == 'Men').sum()),
    total_womens_medals=('gender', lambda x: (x == 'Women').sum()),
    total_mixed_medals =('gender', lambda x: (x == 'Mixed').sum())
).reset_index()

# additional country features dataframe
url = "https://www.worldometers.info/gdp/gdp-by-country/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# find the table with GDP data
table = soup.find('table')  
# convert the table to a DataFrame
gdp_df = pd.read_html(str(table))[0]
# clean the dataframe by dropping unwanted columns
gdp_df = gdp_df.iloc[:, [1,2,5]]
# rename the columns
gdp_df.columns = ['country_name', 'gdp', 'population']
# drop the $ in the gdp column and convert to numeric
gdp_df = gdp_df.replace({'\$': '', ',': ''}, regex=True)
gdp_df['gdp'] = pd.to_numeric(gdp_df['gdp'], errors='coerce')
print(gdp_df.head())

# features data frame (merge medals and gdp dataframes)
features_df = pd.merge(medals_df, gdp_df, on='country_name', how='left')
print(features_df.head())

  gdp_df = gdp_df.replace({'\$': '', ',': ''}, regex=True)


    country_name             gdp  population
0  United States  27720700000000   343477335
1          China  17794800000000  1422584933
2        Germany   4525700000000    84548231
3          Japan   4204490000000   124370947
4          India   3567550000000  1438069596
  country_name  total_medals  total_gold_medals  total_silver_medals  \
0  Afghanistan             2                  0                    0   
1      Algeria            17                  5                    4   
2    Argentina            77                 21                   26   
3      Armenia            18                  2                    8   
4  Australasia            12                  3                    4   

   total_bronze_medals  total_summer_medals  total_winter_medals  \
0                    2                    2                    0   
1                    8                   17                    0   
2                   30                   77                    0   
3                    8   

  gdp_df = pd.read_html(str(table))[0]


In [100]:
# print head of cleaned data
print(data_clean.head())
# print head of features data
print(features_df.head())

  discipline_title    event_title medal_type rank_position   country_name  \
0          Curling  Mixed Doubles       GOLD             1          Italy   
1          Curling  Mixed Doubles     SILVER             2         Norway   
2          Curling  Mixed Doubles     BRONZE             3         Sweden   
3          Curling  Mixed Doubles        NaN             4  Great Britain   
4          Curling  Mixed Doubles        NaN             5         Canada   

  country_3_letter_code     city  year  season gender event_type  
0                   ITA  beijing  2022  winter  mixed    doubles  
1                   NOR  beijing  2022  winter  mixed    doubles  
2                   SWE  beijing  2022  winter  mixed    doubles  
3                   GBR  beijing  2022  winter  mixed    doubles  
4                   CAN  beijing  2022  winter  mixed    doubles  
  country_name  total_medals  total_gold_medals  total_silver_medals  \
0  Afghanistan             2                  0                

In [105]:
# fill in missing values (using median -> not perfect but will do for this)
imputer = SimpleImputer(strategy='median')
features_df[['gdp', 'population']] = imputer.fit_transform(features_df[['gdp', 'population']])
# add per values
features_df['gdp_per_capita'] = features_df['gdp'] * 1e6 / features_df['population']  # USD per capita
features_df['medals_per_million'] = features_df['total_medals'] / features_df['population']
# prepare features and target variable
X = features_df[['gdp', 'population', 'gdp_per_capita']]
y_summer = features_df['total_summer_medals']
y_winter = features_df['total_winter_medals']
# processing & summer/winter pipelines
numeric_features = ['gdp', 'population', 'gdp_per_capita']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ])
# summer
summer_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
summer_lr.fit(X, y_summer)
# winter
winter_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
winter_lr.fit(X, y_winter)
# logistic for binary effects (medal probability)
# binary target (1 if country won > median medals)
median_summer = features_df['total_summer_medals'].median()
median_winter = features_df['total_winter_medals'].median()
y_summer_class = (features_df['total_summer_medals'] > median_summer).astype(int)
y_winter_class = (features_df['total_winter_medals'] > median_winter).astype(int)
# summer classifier
summer_logreg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
summer_logreg.fit(X, y_summer_class)
# winter classifier
winter_logreg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
winter_logreg.fit(X, y_winter_class)
# evaluate
X_train, X_test, y_train, y_test = train_test_split(X, y_summer, test_size=0.2, random_state=42)
# linear model
summer_pred = summer_lr.predict(X_test)
print(f"Summer MAE: {mean_absolute_error(y_test, summer_pred):.2f} medals")
# logistic model
y_pred_class = summer_logreg.predict(X_test)
print(f"Summer Classification Accuracy: {accuracy_score(y_test > median_summer, y_pred_class):.2%}")

# prediction function
def predict_medals(country_name):
    # check if the country exists in the 'country_name' column
    if country_name not in features_df['country_name'].values:
        raise ValueError(f"Country '{country_name}' not found in features_df.")

    # filter to get information for selected country
    country_data = features_df[features_df['country_name'] == country_name].iloc[0]
    gdp = country_data['gdp']
    population = country_data['population']
    gdp_per_capita = gdp * 1e6 / population

    # put information into input
    input_data = pd.DataFrame([{
        'gdp': gdp,
        'population': population,
        'gdp_per_capita': gdp_per_capita
    }])

    # predict summer and winter medals
    summer_medals = round(summer_lr.predict(input_data)[0])
    winter_medals = round(winter_lr.predict(input_data)[0])

    # predict probabilities for being in the top 50% for summer and winter
    summer_prob = summer_logreg.predict_proba(input_data)[0][1]
    winter_prob = winter_logreg.predict_proba(input_data)[0][1]

    # return the predictions and probabilities
    return {
        'country': country_name,
        'summer_medals_pred': summer_medals,
        'winter_medals_pred': winter_medals,
        'summer_top50_prob': f"{summer_prob:.1%}",
        'winter_top50_prob': f"{winter_prob:.1%}"
    }

Summer MAE: 172.22 medals
Summer Classification Accuracy: 68.75%
