In [307]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [254]:
athletes = pd.read_csv('data/summerOly_athletes.csv')
hosts = pd.read_csv('data/summerOly_hosts.csv')
medal_counts = pd.read_csv('data/summerOly_medal_counts.csv', encoding='utf-8')
medal_counts = medal_counts.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
programs = pd.read_csv('data/summerOly_programs.csv', encoding='utf-8')
full_medal = pd.read_csv('data/full_medal_data.csv')

In [5]:
year_to_country_code = {
    1896: "GRE",  # Athens, Greece
    1900: "FRA",  # Paris, France
    1904: "USA",  # St. Louis, USA
    1908: "GBR",  # London, United Kingdom
    1912: "SWE",  # Stockholm, Sweden
    1916: None,   # Cancelled
    1920: "BEL",  # Antwerp, Belgium
    1924: "FRA",  # Paris, France
    1928: "NED",  # Amsterdam, Netherlands
    1932: "USA",  # Los Angeles, USA
    1936: "GER",  # Berlin, Germany
    1940: None,   # Cancelled
    1944: None,   # Cancelled
    1948: "GBR",  # London, United Kingdom
    1952: "FIN",  # Helsinki, Finland
    1956: "AUS",  # Melbourne, Australia
    1960: "ITA",  # Rome, Italy
    1964: "JPN",  # Tokyo, Japan
    1968: "MEX",  # Mexico City, Mexico
    1972: "GER",  # Munich, Germany
    1976: "CAN",  # Montreal, Canada
    1980: "URS",  # Moscow, Soviet Union
    1984: "USA",  # Los Angeles, USA
    1988: "KOR",  # Seoul, South Korea
    1992: "ESP",  # Barcelona, Spain
    1996: "USA",  # Atlanta, USA
    2000: "AUS",  # Sydney, Australia
    2004: "GRE",  # Athens, Greece
    2008: "CHN",  # Beijing, China
    2012: "GBR",  # London, United Kingdom
    2016: "BRA",  # Rio de Janeiro, Brazil
    2020: "JPN",  # Tokyo, Japan
    2024: "FRA",  # Paris, France
    2028: "USA",  # Los Angeles, USA
    2032: "AUS",  # Brisbane, Australia
}

In [6]:
hosts['NOC'] = hosts['Year'].map(year_to_country_code)

In [257]:
NOC = medal_counts['NOC'].str.replace('\xa0','').unique()

In [258]:
medal_counts_LR = medal_counts

medal_counts_LR = medal_counts_LR.sort_values(by=['NOC', 'Year']).reset_index(drop=True)

medal_counts_LR['Predicted_Medals'] = np.nan

for index, row in medal_counts_LR.iterrows():
    country = row['NOC']
    year = row['Year']
    #print(f"Processing: Country={country}, Year={year}")
    
    # Filter previous years' data for the same country
    historical_data = medal_counts_LR[(medal_counts_LR['NOC'] == country) & (medal_counts_LR['Year'] < year)]
    #print(historical_data)
    
    # Check if there is enough data to train a model
    if len(historical_data) >= 2:  # At least 2 data points needed for Linear Regression
        # Prepare training data
        X_train = historical_data[['Year']].values
        y_train = historical_data['Total'].values
        
        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        # Predict the medal count for the current year
        X_test = np.array([[year]])  # Current year as input
        predicted_medals = model.predict(X_test)[0]
        
        # Store the prediction in the DataFrame
        medal_counts_LR.at[index, 'Predicted_Medals'] = predicted_medals

In [255]:
# Display the updated DataFrame
medal_counts_LR.iloc[1:10]

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total,Year,Predicted_Medals
1,79,Afghanistan,0,0,1,1,2012,
2,80,Albania,0,0,2,2,2024,
3,42,Algeria,0,0,2,2,1984,
4,34,Algeria,1,0,1,2,1992,
5,34,Algeria,2,0,1,3,1996,2.0
6,42,Algeria,1,1,3,5,2000,3.0
7,68,Algeria,0,1,1,2,2008,5.571429
8,50,Algeria,1,0,0,1,2012,3.4
9,63,Algeria,0,2,0,2,2016,2.11


In [263]:
full_medal.iloc[1:10]

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total,Year,Country Code,Is_Host,Ex-Host,Host Continent,Team Size,Participated Events,Total Events
1,2,Greece,10,18,19,47,1896,GRE,1,1,1,148.0,39,43.0
2,3,Germany,6,5,2,13,1896,GER,0,0,1,94.0,27,43.0
3,4,France,5,4,2,11,1896,FRA,0,0,1,26.0,18,43.0
4,5,Great Britain,2,3,2,7,1896,GBR,0,0,0,25.0,19,43.0
5,6,Hungary,2,1,3,6,1896,HUN,0,0,1,18.0,14,43.0
6,7,Austria,2,1,2,5,1896,AUT,0,0,1,8.0,8,43.0
7,8,Australia,2,0,0,2,1896,AUS,0,0,0,5.0,5,43.0
8,9,Denmark,1,2,3,6,1896,DEN,0,0,1,15.0,12,43.0
9,10,Switzerland,1,2,0,3,1896,SUI,0,0,0,8.0,5,43.0


In [337]:
medal_counts_LR = full_medal.sort_values(by=['NOC', 'Year']).reset_index(drop=True)

medal_counts_LR['Predicted Medals'] = np.nan
medal_counts_LR['Predicted Proportion of Medals'] = np.nan

for index, row in medal_counts_LR.iterrows():
    country = row['NOC']
    year = row['Year']
    
    # Filter previous years' data for the same country
    historical_data = medal_counts_LR[(medal_counts_LR['NOC'] == country) & (medal_counts_LR['Year'] < year)]
    #print(historical_data)
    
    if len(historical_data) >= 2:
        X_train = historical_data[['Year']].values
        y_train = historical_data['Total'].values
        
        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        # Predict the medal count for the current year
        X_test = np.array([[year]])  # Current year as input
        predicted_medals = model.predict(X_test)[0]
        
        # Store the prediction in the DataFrame
        medal_counts_LR.at[index, 'Predicted Medals'] = predicted_medals
        medal_counts_LR.at[index, 'Predicted Proportion of Medals'] = predicted_medals/(3* row['Total Events'])

In [338]:
# Write a thing that adds a column for relative prediction correctness
medal_counts_LR['Prediction Delta'] = np.nan
delta = abs(medal_counts_LR['Predicted Medals'] - medal_counts_LR['Total'])/medal_counts_LR['Total']
medal_counts_LR['Prediction Delta'] = delta

In [340]:
nonnan_medal_counts_LR = medal_counts_LR[medal_counts_LR['Predicted Medals'].notnull()]

r2 = r2_score(nonnan_medal_counts_LR['Total'].values, nonnan_medal_counts_LR['Predicted Medals'])
print(r2)

0.549999903944929


In [341]:
medal_counts_LR[medal_counts_LR['Year'] == 2020].sort_values(by = ['Prediction Delta'], ascending = False).iloc[:10]

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total,Year,Country Code,Is_Host,Ex-Host,Host Continent,Team Size,Participated Events,Total Events,Predicted Medals,Predicted Proportion of Medals,Prediction Delta
35,0,Afghanistan,0,0,0,0,2020,AFG,0,0,1,5.0,5,339.0,0.4434,0.000436,inf
60,0,Algeria,0,0,0,0,2020,ALG,0,0,0,45.0,43,339.0,2.587222,0.002544,inf
527,0,Cameroon,0,0,0,0,2020,CMR,0,0,0,12.0,12,339.0,0.725275,0.000713,inf
281,0,Barbados,0,0,0,0,2020,BAR,0,0,0,9.0,9,339.0,0.151911,0.000149,inf
3067,0,Uruguay,0,0,0,0,2020,URU,0,0,0,11.0,9,339.0,-0.150518,-0.000148,inf
3185,0,Zambia,0,0,0,0,2020,ZAM,0,0,0,31.0,10,339.0,0.135678,0.000133,inf
458,0,Burundi,0,0,0,0,2020,BDI,0,0,0,7.0,7,339.0,0.333333,0.000328,inf
816,0,Djibouti,0,0,0,0,2020,DJI,0,0,0,5.0,5,339.0,-0.125265,-0.000123,inf
753,0,Cyprus,0,0,0,0,2020,CYP,0,0,1,23.0,22,339.0,0.333333,0.000328,inf
709,0,Costa Rica,0,0,0,0,2020,CRC,0,0,0,18.0,17,339.0,0.469484,0.000462,inf


In [356]:
mc_MLR = full_medal.sort_values(by=['NOC', 'Year']).reset_index(drop=True)

mc_MLR['Predicted Medals'] = np.nan
mc_MLR['Predicted Proportion of Medals'] = np.nan

for index, row in mc_MLR.iterrows():
    country = row['NOC']
    year = row['Year']
    teamsize = row['Team Size']
    
    # Filter previous years' data for the same country
    historical_data = mc_MLR[(medal_counts_LR['NOC'] == country) & (medal_counts_LR['Year'] < year)]
    #print(historical_data)
    
    if len(historical_data) >= 2:
        X_train = historical_data[['Year', 'Team Size']]
        y_train = historical_data['Total'].values
        
        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        # Predict the medal count for the current year
        X_test = np.array([[year, teamsize]])  # Current year as input
        predicted_medals = model.predict(X_test)[0]
        
        # Store the prediction in the DataFrame
        mc_MLR.at[index, 'Predicted Medals'] = predicted_medals
        mc_MLR.at[index, 'Predicted Proportion of Medals'] = predicted_medals/(3* row['Total Events'])



KeyboardInterrupt: 

In [358]:
mc_MLR[mc_MLR['Year'] == 2020]#.sort_values(by = ['Prediction Delta'], ascending = False).iloc[:10]

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total,Year,Country Code,Is_Host,Ex-Host,Host Continent,Team Size,Participated Events,Total Events,Predicted Medals,Predicted Proportion of Medals
9,0,ARU,0,0,0,0,2020,ARU,0,0,0,4.0,4,339.0,,
19,0,ASA,0,0,0,0,2020,ASA,0,0,0,6.0,5,339.0,,
35,0,Afghanistan,0,0,0,0,2020,AFG,0,0,1,5.0,5,339.0,,
45,0,Albania,0,0,0,0,2020,ALB,0,0,0,10.0,10,339.0,,
60,0,Algeria,0,0,0,0,2020,ALG,0,0,0,45.0,43,339.0,,
73,0,Andorra,0,0,0,0,2020,AND,0,0,0,3.0,3,339.0,,
84,0,Angola,0,0,0,0,2020,ANG,0,0,0,20.0,6,339.0,,
96,0,Antigua and Barbuda,0,0,0,0,2020,ANT,0,0,0,6.0,6,339.0,,
122,72,Argentina,0,1,2,3,2020,ARG,0,0,0,208.0,61,339.0,,
130,69,Armenia,0,2,2,4,2020,ARM,0,0,0,19.0,19,339.0,,
