In [2]:
# dependencies
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [11]:
# demographic regresssion

# load data
demographic_df = pd.read_csv('8-8-2021 demographic reg data.csv', index_col=0)
demographic_df.info()
demographic_df.head(15)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 718 entries, 0 to 717
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Gold                       718 non-null    int64  
 1   Silver                     718 non-null    int64  
 2   Bronze                     718 non-null    int64  
 3   Total                      718 non-null    int64  
 4   GDP                        718 non-null    float64
 5   Latitude Hemisphere        718 non-null    object 
 6   Population                 718 non-null    int64  
 7   Life Expectancy            718 non-null    float64
 8   Average Distance to Games  718 non-null    float64
 9   Total Competitors          718 non-null    int64  
 10  Total Medal Count          718 non-null    int64  
 11  Total Win Ratio            718 non-null    float64
 12  Total Male Competitors     718 non-null    int64  
 13  Male Wins                  718 non-null    int64  

Unnamed: 0,Gold,Silver,Bronze,Total,GDP,Latitude Hemisphere,Population,Life Expectancy,Average Distance to Games,Total Competitors,Total Medal Count,Total Win Ratio,Total Male Competitors,Male Wins,Male Medal Ratio,Total Female Competitors,Female Wins,Female Medal Ratio,Open Wins,Open Win Ratio
0,0,0,1,1,3.8568,South,15960445,47.47,6411.84284,3,1,0.333333,2,0,0.0,1,1,1.0,0,0.0
1,0,0,1,1,0.74278,North,4768225,36.203,5043.162906,4,1,0.25,4,1,0.25,0,0,0.0,0,0.0
2,3,0,0,1,5.656474,South,17711925,48.946,6411.84284,4,1,0.25,2,0,0.0,2,1,0.5,0,0.0
3,3,0,0,1,147.824,North,4068577,75.174,5084.586667,4,1,0.25,4,1,0.25,0,0,0.0,0,0.0
4,0,0,1,1,1.109054,North,2719809,57.932,5064.849686,4,1,0.25,3,1,0.333333,1,0,0.0,0,0.0
5,0,0,1,1,10.109226,North,27722281,59.93,5247.865726,4,1,0.25,3,1,0.333333,1,0,0.0,0,0.0
6,0,0,1,1,3.323677,North,6083417,56.178,5243.072001,4,1,0.25,3,1,0.333333,1,0,0.0,0,0.0
7,0,2,0,1,14.380004,South,2039551,63.511,6338.289724,4,1,0.25,3,1,0.333333,1,0,0.0,0,0.0
8,0,2,0,1,1.046191,North,6201410,45.509,5286.381803,5,1,0.2,5,1,0.2,0,0,0.0,0,0.0
9,0,2,0,1,0.222101,South,96267,69.41,11540.09517,5,1,0.2,4,1,0.25,1,0,0.0,0,0.0


In [12]:
# encode latitude
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

encode_df = pd.DataFrame(enc.fit_transform(demographic_df['Latitude Hemisphere'].values.reshape(-1,1)))

encode_df.columns = enc.get_feature_names(['Latitude_Hemisphere'])
encode_df.head(10)

Unnamed: 0,Latitude_Hemisphere_North,Latitude_Hemisphere_South
0,0.0,1.0
1,1.0,0.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
5,1.0,0.0
6,1.0,0.0
7,0.0,1.0
8,1.0,0.0
9,0.0,1.0


In [13]:
# drop one of the encoded columns becuase binary data can cause colinearity issues
encode_df.drop(columns=['Latitude_Hemisphere_North'], inplace=True)

In [14]:
# copy df to process
demo_to_process = demographic_df.copy()

In [15]:
# replace hemisphere data with int 1 = south 0 = north
demo_to_process['Latitude Hemisphere'] = encode_df['Latitude_Hemisphere_South']
demo_to_process.head()

Unnamed: 0,Gold,Silver,Bronze,Total,GDP,Latitude Hemisphere,Population,Life Expectancy,Average Distance to Games,Total Competitors,Total Medal Count,Total Win Ratio,Total Male Competitors,Male Wins,Male Medal Ratio,Total Female Competitors,Female Wins,Female Medal Ratio,Open Wins,Open Win Ratio
0,0,0,1,1,3.8568,1.0,15960445,47.47,6411.84284,3,1,0.333333,2,0,0.0,1,1,1.0,0,0.0
1,0,0,1,1,0.74278,0.0,4768225,36.203,5043.162906,4,1,0.25,4,1,0.25,0,0,0.0,0,0.0
2,3,0,0,1,5.656474,1.0,17711925,48.946,6411.84284,4,1,0.25,2,0,0.0,2,1,0.5,0,0.0
3,3,0,0,1,147.824,0.0,4068577,75.174,5084.586667,4,1,0.25,4,1,0.25,0,0,0.0,0,0.0
4,0,0,1,1,1.109054,0.0,2719809,57.932,5064.849686,4,1,0.25,3,1,0.333333,1,0,0.0,0,0.0


In [16]:
# asjust population scale
demo_to_process['Population'] = demo_to_process['Population'] / 100000

In [19]:
# split into training and testing sets
X = demo_to_process.drop(columns=['Total']).values
y = demo_to_process['Total'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [20]:
# fit to multiple linear regression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# predict test set results
y_pred = regressor.predict(X_test)

In [21]:
# send predictions to df
demo_predictions_df = pd.DataFrame({'Actual':y_test, 'Predicited':y_pred})

In [22]:
# check the results
print('Mean Absolute Error:',metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print(regressor.intercept_)
print(regressor.coef_)

Mean Absolute Error: 1.668063835206605e-13
Mean Squared Error: 1.2016697865650215e-25
Root Mean Squared Error: 3.4665109066105957e-13
-5.737632591262809e-13
[ 3.33333333e-01  5.00000000e-01  1.00000000e+00  1.93044944e-16
 -1.03100810e-13 -5.89339756e-17 -9.06749206e-16  1.30030766e-16
  7.14355868e-16 -3.17047907e-16 -5.85625723e-15 -9.50972162e-16
  5.55189958e-16  1.51296589e-14 -1.16271254e-15 -4.92293413e-16
  2.93912264e-15 -6.25491223e-16  3.09182050e-14]


In [23]:
# check data with statsmodel.api
Y = demo_to_process['Total']
X = demo_to_process.drop(columns=['Total'])
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
results.params

const                        2.498002e-15
Gold                         3.333333e-01
Silver                       5.000000e-01
Bronze                       1.000000e+00
GDP                          1.634604e-17
Latitude Hemisphere         -3.622658e-13
Population                   5.413642e-17
Life Expectancy              1.372166e-15
Average Distance to Games   -4.514347e-17
Total Competitors           -5.858649e-15
Total Medal Count           -1.977585e-16
Total Win Ratio              6.328271e-14
Total Male Competitors       6.269453e-15
Male Wins                   -5.273559e-16
Male Medal Ratio            -9.137135e-14
Total Female Competitors     6.420971e-15
Female Wins                 -9.298118e-16
Female Medal Ratio          -1.432188e-14
Open Wins                   -1.087672e-15
Open Win Ratio              -1.697531e-13
dtype: float64

In [24]:
results.summary()

0,1,2,3
Dep. Variable:,Total,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,2.7110000000000002e+29
Date:,"Sun, 08 Aug 2021",Prob (F-statistic):,0.0
Time:,23:35:28,Log-Likelihood:,19819.0
No. Observations:,718,AIC:,-39600.0
Df Residuals:,699,BIC:,-39510.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.498e-15,1.07e-13,0.023,0.981,-2.07e-13,2.12e-13
Gold,0.3333,1.2e-15,2.78e+14,0.000,0.333,0.333
Silver,0.5000,2.33e-15,2.15e+14,0.000,0.500,0.500
Bronze,1.0000,4.24e-15,2.36e+14,0.000,1.000,1.000
GDP,1.635e-17,9.09e-18,1.797,0.073,-1.51e-18,3.42e-17
Latitude Hemisphere,-3.623e-13,3.78e-14,-9.572,0.000,-4.37e-13,-2.88e-13
Population,5.414e-17,6.17e-18,8.771,0.000,4.2e-17,6.63e-17
Life Expectancy,1.372e-15,1.36e-15,1.011,0.312,-1.29e-15,4.04e-15
Average Distance to Games,-4.514e-17,1.06e-17,-4.271,0.000,-6.59e-17,-2.44e-17

0,1,2,3
Omnibus:,82.857,Durbin-Watson:,1.152
Prob(Omnibus):,0.0,Jarque-Bera (JB):,362.491
Skew:,0.428,Prob(JB):,1.9300000000000002e-79
Kurtosis:,6.374,Cond. No.,3.41e+17


In [25]:
# load results without demographic data
competition_data_df = pd.read_csv('8-8-2021 comp only regression.csv')
competition_data_df.info()
competition_data_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876 entries, 0 to 875
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Year                      876 non-null    int64  
 1   Country Code              876 non-null    object 
 2   Total Competitors         876 non-null    int64  
 3   Gold                      876 non-null    int64  
 4   Silver                    876 non-null    int64  
 5   Bronze                    876 non-null    int64  
 6   Total Medal Count         876 non-null    int64  
 7   Total Win Ratio           876 non-null    float64
 8   Total Male Competitors    876 non-null    int64  
 9   Male Wins                 876 non-null    int64  
 10  Male Medal Ratio          876 non-null    float64
 11  Total Female Competitors  876 non-null    int64  
 12  Female Wins               876 non-null    int64  
 13  Female Medal Ratio        876 non-null    float64
 14  Open Wins 

Unnamed: 0,Year,Country Code,Total Competitors,Gold,Silver,Bronze,Total Medal Count,Total Win Ratio,Total Male Competitors,Male Wins,Male Medal Ratio,Total Female Competitors,Female Wins,Female Medal Ratio,Open Wins,Open Win Ratio
0,1964,ARG,102,0,2,0,1,0.009804,96,0,0.0,6,0,0.0,1,0.009804
1,1964,BHS,11,3,0,0,1,0.090909,11,0,0.0,0,0,0.0,0,0.0
2,1964,AUS,243,18,4,10,18,0.074074,203,10,0.049261,40,7,0.175,1,0.004115
3,1964,BEL,61,6,0,1,3,0.04918,60,3,0.05,1,0,0.0,0,0.0
4,1964,BGR,63,9,10,2,10,0.15873,56,10,0.178571,7,0,0.0,0,0.0
5,1964,BRA,61,0,0,1,1,0.016393,60,1,0.016667,1,0,0.0,0,0.0
6,1964,CAN,115,3,4,1,4,0.034783,95,4,0.042105,20,0,0.0,0,0.0
7,1964,CHE,66,3,4,1,4,0.060606,65,2,0.030769,1,0,0.0,0,0.0
8,1964,CUB,27,0,2,0,1,0.037037,25,1,0.04,2,0,0.0,0,0.0
9,1964,CZE,104,15,12,3,14,0.134615,95,10,0.105263,9,4,0.444444,0,0.0


In [26]:
# drop object / time data
comp_data_to_process = competition_data_df.drop(columns=['Year', 'Country Code'])

In [28]:
# split into training and testing sets
X_comp = comp_data_to_process.drop(columns=['Total Medal Count']).values
y_comp = comp_data_to_process['Total Medal Count'].values

X_train, X_test, y_train, y_test = train_test_split(X_comp, y_comp, test_size=0.2, random_state=42)

In [29]:
# fit to multiple linear regression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# predict test set results
y_pred = regressor.predict(X_test)

In [30]:
# send predictions to df
comp_predictions_df = pd.DataFrame({'Actual':y_test, 'Predicited':y_pred})

In [31]:
# check the results
print('Mean Absolute Error:',metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print(regressor.intercept_)
print(regressor.coef_)

Mean Absolute Error: 0.17503111203135557
Mean Squared Error: 0.19460227658747942
Root Mean Squared Error: 0.44113748037032563
0.031915316204180755
[-8.19408441e-02  1.69451518e-03  3.63748869e-03  1.12775924e-02
  1.25423208e+00  8.42907277e-02  9.86606363e-01 -8.57175091e-01
  7.91100384e-02  9.98224068e-01 -4.92654881e-01  9.92382287e-01
 -6.25075924e+00]


In [32]:
# check data with statsmodel.api
Y = comp_data_to_process['Total Medal Count']
X = comp_data_to_process.drop(columns=['Total Medal Count'])
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
results.params

const                       0.031568
Total Competitors          -0.084806
Gold                        0.002719
Silver                      0.004815
Bronze                      0.012549
Total Win Ratio             1.071018
Total Male Competitors      0.087275
Male Wins                   0.983552
Male Medal Ratio           -0.667236
Total Female Competitors    0.082015
Female Wins                 0.996495
Female Medal Ratio         -0.510410
Open Wins                   0.987309
Open Win Ratio             -6.089398
dtype: float64

In [33]:
results.summary()

0,1,2,3
Dep. Variable:,Total Medal Count,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,232800.0
Date:,"Sun, 08 Aug 2021",Prob (F-statistic):,0.0
Time:,23:42:04,Log-Likelihood:,-367.04
No. Observations:,876,AIC:,762.1
Df Residuals:,862,BIC:,828.9
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0316,0.024,1.309,0.191,-0.016,0.079
Total Competitors,-0.0848,0.124,-0.684,0.494,-0.328,0.159
Gold,0.0027,0.002,1.651,0.099,-0.001,0.006
Silver,0.0048,0.003,1.509,0.132,-0.001,0.011
Bronze,0.0125,0.006,2.154,0.032,0.001,0.024
Total Win Ratio,1.0710,0.303,3.533,0.000,0.476,1.666
Total Male Competitors,0.0873,0.124,0.704,0.482,-0.156,0.331
Male Wins,0.9836,0.005,215.238,0.000,0.975,0.993
Male Medal Ratio,-0.6672,0.245,-2.725,0.007,-1.148,-0.187

0,1,2,3
Omnibus:,893.433,Durbin-Watson:,2.196
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33281.04
Skew:,4.95,Prob(JB):,0.0
Kurtosis:,31.527,Cond. No.,42100.0
