In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Importing the dataset and deriving the independant variables

In [2]:
file = '../ETL/model_dataset_cleaned.csv'
olympic_df = pd.read_csv(file)
olympic_df

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita,dob_converted,age,total_medals,placed
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,4.641827e+07,25831.582305,2069-10-17,46,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,5.061704e+07,27221.524051,1986-09-23,29,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,3.585177e+07,43248.529909,1992-05-27,24,1,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,3.554150e+06,1848.061804,1991-01-02,25,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,4.595700e+06,37807.967276,1990-11-26,25,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10104,801825063,Zurabi Iakobishvili,GEO,male,2/4/92,1.71,68.0,wrestling,0,0,0,3.679000e+06,3795.973308,1992-02-04,24,0,0
10105,214461847,Zuzana Hejnova,CZE,female,12/19/86,1.73,63.0,athletics,0,0,0,1.055122e+07,17548.338213,1986-12-19,29,0,0
10106,88361042,di Xiao,CHN,male,5/14/91,1.85,100.0,wrestling,0,0,0,1.371220e+09,8027.683810,1991-05-14,25,0,0
10107,900065925,le Quoc Toan Tran,VIE,male,4/5/89,1.60,56.0,weightlifting,0,0,0,9.170380e+07,2111.138024,1989-04-05,27,0,0


In [3]:
olympic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10109 entries, 0 to 10108
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10109 non-null  int64  
 1   name            10109 non-null  object 
 2   nationality     10109 non-null  object 
 3   sex             10109 non-null  object 
 4   dob             10109 non-null  object 
 5   height          10109 non-null  float64
 6   weight          10109 non-null  float64
 7   sport           10109 non-null  object 
 8   gold            10109 non-null  int64  
 9   silver          10109 non-null  int64  
 10  bronze          10109 non-null  int64  
 11  population      10109 non-null  float64
 12  gdp_per_capita  10109 non-null  float64
 13  dob_converted   10109 non-null  object 
 14  age             10109 non-null  int64  
 15  total_medals    10109 non-null  int64  
 16  placed          10109 non-null  int64  
dtypes: float64(4), int64(7), object

In [4]:
#dropping descriptive columns as well as the dependant variables (gold, silver, bronze, total medals, placed)
X = olympic_df.drop(columns=["id", "name", "gold","dob", "dob_converted", "bronze","silver","population","total_medals","nationality","placed"])

#create dummy columns for sport and sex 
obj_columns = ["sport", "sex"]
X = pd.get_dummies(X, obj_columns)

X.head()

Unnamed: 0,height,weight,gdp_per_capita,age,sport_female,sport_male,sex_aquatics,sex_archery,sex_athletics,sex_badminton,...,sex_rugby sevens,sex_sailing,sex_shooting,sex_table tennis,sex_taekwondo,sex_tennis,sex_triathlon,sex_volleyball,sex_weightlifting,sex_wrestling
0,1.72,64.0,25831.582305,46,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,56.0,27221.524051,29,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.98,79.0,43248.529909,24,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,80.0,1848.061804,25,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,71.0,37807.967276,25,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Logistic Regression 
Here we are using the binary outcome "placed" as the depedant variable

In [5]:
y=olympic_df["placed"]

In [6]:
y.value_counts()

0    8458
1    1651
Name: placed, dtype: int64

In [7]:
y

0        0
1        0
2        1
3        0
4        0
        ..
10104    0
10105    0
10106    0
10107    0
10108    0
Name: placed, Length: 10109, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 6356, 1: 6356})

In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X, y)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[2102,    0],
       [ 426,    0]], dtype=int64)

In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      1.00      0.00      0.91      0.00      0.00      2102
          1       0.00      0.00      1.00      0.00      0.00      0.00       426

avg / total       0.69      0.83      0.17      0.75      0.00      0.00      2528



  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest 
Again using "placed" as the dependant variable

In [14]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
clf = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [15]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7401613933521842

In [16]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1508,  594],
       [ 101,  325]], dtype=int64)

In [17]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.72      0.76      0.81      0.74      0.54      2102
          1       0.35      0.76      0.72      0.48      0.74      0.55       426

avg / total       0.84      0.73      0.76      0.76      0.74      0.55      2528



In [18]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X_test.columns), reverse=True)

[(0.25926809178516674, 'gdp_per_capita'),
 (0.17624501360183786, 'weight'),
 (0.17112080311573294, 'height'),
 (0.15256599861811723, 'age'),
 (0.026321624764222954, 'sex_athletics'),
 (0.014459933293034181, 'sex_aquatics'),
 (0.012548046470278993, 'sport_male'),
 (0.01191703664483682, 'sport_female'),
 (0.01045621105555073, 'sex_football'),
 (0.01043333015123872, 'sex_hockey'),
 (0.009502030437826894, 'sex_canoe'),
 (0.009158614859930641, 'sex_rowing'),
 (0.008747911909690084, 'sex_wrestling'),
 (0.00866699186260709, 'sex_rugby sevens'),
 (0.008436786254511452, 'sex_fencing'),
 (0.008099555224921026, 'sex_cycling'),
 (0.008075434974857113, 'sex_handball'),
 (0.008040314574141661, 'sex_judo'),
 (0.007910904813335416, 'sex_sailing'),
 (0.007649703369666625, 'sex_basketball'),
 (0.007450527304621821, 'sex_gymnastics'),
 (0.00690839130926029, 'sex_volleyball'),
 (0.006797528378527948, 'sex_shooting'),
 (0.006444533170040916, 'sex_badminton'),
 (0.006406822634946589, 'sex_taekwondo'),
 (0.0

In [19]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
clf = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [20]:
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6581715156144606

In [21]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6581715156144606

In [22]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1262,  840],
       [ 121,  305]], dtype=int64)

In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.60      0.72      0.72      0.66      0.42      2102
          1       0.27      0.72      0.60      0.39      0.66      0.43       426

avg / total       0.80      0.62      0.70      0.67      0.66      0.43      2528

