In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Importing the dataset and deriving the dependant and independant variables

In [2]:
file = '../ETL/model_dataset_cleaned.csv'
olympic_df = pd.read_csv(file)
olympic_df

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita,dob_converted,age,total_medals,placed
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,4.641827e+07,25831.582305,2069-10-17,46,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,5.061704e+07,27221.524051,1986-09-23,29,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,3.585177e+07,43248.529909,1992-05-27,24,1,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,3.554150e+06,1848.061804,1991-01-02,25,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,4.595700e+06,37807.967276,1990-11-26,25,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10104,801825063,Zurabi Iakobishvili,GEO,male,2/4/92,1.71,68.0,wrestling,0,0,0,3.679000e+06,3795.973308,1992-02-04,24,0,0
10105,214461847,Zuzana Hejnova,CZE,female,12/19/86,1.73,63.0,athletics,0,0,0,1.055122e+07,17548.338213,1986-12-19,29,0,0
10106,88361042,di Xiao,CHN,male,5/14/91,1.85,100.0,wrestling,0,0,0,1.371220e+09,8027.683810,1991-05-14,25,0,0
10107,900065925,le Quoc Toan Tran,VIE,male,4/5/89,1.60,56.0,weightlifting,0,0,0,9.170380e+07,2111.138024,1989-04-05,27,0,0


In [3]:
olympic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10109 entries, 0 to 10108
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10109 non-null  int64  
 1   name            10109 non-null  object 
 2   nationality     10109 non-null  object 
 3   sex             10109 non-null  object 
 4   dob             10109 non-null  object 
 5   height          10109 non-null  float64
 6   weight          10109 non-null  float64
 7   sport           10109 non-null  object 
 8   gold            10109 non-null  int64  
 9   silver          10109 non-null  int64  
 10  bronze          10109 non-null  int64  
 11  population      10109 non-null  float64
 12  gdp_per_capita  10109 non-null  float64
 13  dob_converted   10109 non-null  object 
 14  age             10109 non-null  int64  
 15  total_medals    10109 non-null  int64  
 16  placed          10109 non-null  int64  
dtypes: float64(4), int64(7), object

In [4]:
#dropping descriptive columns as well as the dependant variables (gold, silver, bronze, total medals, placed)
X = olympic_df.drop(columns=["id", "name", "gold","dob", "dob_converted", "bronze","silver","population","total_medals","placed"])

#create dummy columns for sport and sex 
obj_columns = ["sport", "sex", "nationality"]
X = pd.get_dummies(X, obj_columns)

X.head()

Unnamed: 0,height,weight,gdp_per_capita,age,sport_AFG,sport_ALB,sport_ALG,sport_ANG,sport_ANT,sport_ARG,...,nationality_rugby sevens,nationality_sailing,nationality_shooting,nationality_table tennis,nationality_taekwondo,nationality_tennis,nationality_triathlon,nationality_volleyball,nationality_weightlifting,nationality_wrestling
0,1.72,64.0,25831.582305,46,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,56.0,27221.524051,29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.98,79.0,43248.529909,24,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,80.0,1848.061804,25,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,71.0,37807.967276,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#Here we are using the binary outcome "placed" as the depedant variable
y=olympic_df["placed"]

In [6]:
y.value_counts()

0    8458
1    1651
Name: placed, dtype: int64

In [7]:
y

0        0
1        0
2        1
3        0
4        0
        ..
10104    0
10105    0
10106    0
10107    0
10108    0
Name: placed, Length: 10109, dtype: int64

### Splitting the dataset using train_test_split 

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Logistic Regression using Naive Random Oversampling

In [9]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 6371, 1: 6371})

In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X, y)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.49976042165788215

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[2086,    1],
       [ 441,    0]], dtype=int64)

In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      1.00      0.00      0.90      0.00      0.00      2087
          1       0.00      0.00      1.00      0.00      0.00      0.00       441

avg / total       0.68      0.83      0.17      0.75      0.00      0.00      2528



0.0


## SMOTE Oversampling

In [14]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)
Counter(y_resampled)

Counter({0: 6358, 1: 6358})

In [15]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [16]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5950178015131287

In [17]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1243,  857],
       [ 172,  256]], dtype=int64)

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.59      0.60      0.71      0.60      0.35      2100
          1       0.23      0.60      0.59      0.33      0.60      0.35       428

avg / total       0.77      0.59      0.60      0.64      0.60      0.35      2528



## Undersampling

In [19]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
ccus = ClusterCentroids(random_state=1)
X_resampled, y_resampled = ccus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 1223, 1: 1223})

In [20]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=1)

In [21]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5746395193591456

In [22]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 549, 1551],
       [  48,  380]], dtype=int64)

In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.26      0.89      0.41      0.48      0.22      2100
          1       0.20      0.89      0.26      0.32      0.48      0.25       428

avg / total       0.80      0.37      0.78      0.39      0.48      0.22      2528



## Combination (Over and Under) Sampling (SMOTEENN)

In [24]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 4653, 1: 6709})

In [25]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [26]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.638235425011126

In [27]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1189,  911],
       [ 124,  304]], dtype=int64)

In [28]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.57      0.71      0.70      0.63      0.40      2100
          1       0.25      0.71      0.57      0.37      0.63      0.41       428

avg / total       0.79      0.59      0.69      0.64      0.63      0.40      2528



# Random Forest 
Again using "placed" as the dependant variable

In [29]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
clf = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [30]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7910057854917668

In [31]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1654,  446],
       [  88,  340]], dtype=int64)

In [32]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.79      0.79      0.86      0.79      0.63      2100
          1       0.43      0.79      0.79      0.56      0.79      0.63       428

avg / total       0.86      0.79      0.79      0.81      0.79      0.63      2528



In [33]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X_test.columns), reverse=True)

[(0.1333135673076521, 'weight'),
 (0.12798642652324266, 'height'),
 (0.11522727293195008, 'age'),
 (0.10118383104284133, 'gdp_per_capita'),
 (0.026931416396363855, 'nationality_athletics'),
 (0.02004035078257427, 'sport_USA'),
 (0.0167855724016186, 'nationality_aquatics'),
 (0.016652477809768598, 'sex_male'),
 (0.015604812677893164, 'sex_female'),
 (0.014616620146561137, 'sport_RUS'),
 (0.013806141352996465, 'nationality_hockey'),
 (0.013539344552131495, 'sport_GER'),
 (0.012614322451619382, 'sport_GBR'),
 (0.012118809923120895, 'nationality_football'),
 (0.011158979213657806, 'nationality_rugby sevens'),
 (0.010604847314605141, 'nationality_cycling'),
 (0.009984676215640437, 'nationality_rowing'),
 (0.009646056942808515, 'sport_BRA'),
 (0.008967865912515843, 'nationality_volleyball'),
 (0.008954184713661559, 'nationality_canoe'),
 (0.008846759705192773, 'sport_CAN'),
 (0.00872409483431086, 'nationality_basketball'),
 (0.008710707550997433, 'nationality_sailing'),
 (0.00865056724339157

## Random Forest using Easy Ensemble Classifier 

In [34]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
clf = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [35]:
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6723653760569649

In [36]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6723653760569649

In [37]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1298,  802],
       [ 117,  311]], dtype=int64)

In [38]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.62      0.73      0.74      0.67      0.44      2100
          1       0.28      0.73      0.62      0.40      0.67      0.45       428

avg / total       0.81      0.64      0.71      0.68      0.67      0.45      2528

