In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Importing the dataset and deriving the dependant and independant variables

In [2]:
file = '../ETL/model_dataset_cleaned.csv'
olympic_df = pd.read_csv(file)
olympic_df

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita,dob_converted,age,total_medals,placed
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,4.641827e+07,25831.582305,2069-10-17,46,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,5.061704e+07,27221.524051,1986-09-23,29,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,3.585177e+07,43248.529909,1992-05-27,24,1,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,3.554150e+06,1848.061804,1991-01-02,25,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,4.595700e+06,37807.967276,1990-11-26,25,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10104,801825063,Zurabi Iakobishvili,GEO,male,2/4/92,1.71,68.0,wrestling,0,0,0,3.679000e+06,3795.973308,1992-02-04,24,0,0
10105,214461847,Zuzana Hejnova,CZE,female,12/19/86,1.73,63.0,athletics,0,0,0,1.055122e+07,17548.338213,1986-12-19,29,0,0
10106,88361042,di Xiao,CHN,male,5/14/91,1.85,100.0,wrestling,0,0,0,1.371220e+09,8027.683810,1991-05-14,25,0,0
10107,900065925,le Quoc Toan Tran,VIE,male,4/5/89,1.60,56.0,weightlifting,0,0,0,9.170380e+07,2111.138024,1989-04-05,27,0,0


In [3]:
olympic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10109 entries, 0 to 10108
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10109 non-null  int64  
 1   name            10109 non-null  object 
 2   nationality     10109 non-null  object 
 3   sex             10109 non-null  object 
 4   dob             10109 non-null  object 
 5   height          10109 non-null  float64
 6   weight          10109 non-null  float64
 7   sport           10109 non-null  object 
 8   gold            10109 non-null  int64  
 9   silver          10109 non-null  int64  
 10  bronze          10109 non-null  int64  
 11  population      10109 non-null  float64
 12  gdp_per_capita  10109 non-null  float64
 13  dob_converted   10109 non-null  object 
 14  age             10109 non-null  int64  
 15  total_medals    10109 non-null  int64  
 16  placed          10109 non-null  int64  
dtypes: float64(4), int64(7), object

In [4]:
#dropping descriptive columns as well as the dependant variables (gold, silver, bronze, total medals, placed)
X = olympic_df.drop(columns=["id", "name", "gold","dob", "dob_converted", "bronze","silver","population","total_medals","nationality","placed"])

#create dummy columns for sport and sex 
obj_columns = ["sport", "sex"]
X = pd.get_dummies(X, obj_columns)

X.head()

Unnamed: 0,height,weight,gdp_per_capita,age,sport_female,sport_male,sex_aquatics,sex_archery,sex_athletics,sex_badminton,...,sex_rugby sevens,sex_sailing,sex_shooting,sex_table tennis,sex_taekwondo,sex_tennis,sex_triathlon,sex_volleyball,sex_weightlifting,sex_wrestling
0,1.72,64.0,25831.582305,46,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,56.0,27221.524051,29,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.98,79.0,43248.529909,24,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,80.0,1848.061804,25,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,71.0,37807.967276,25,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#Here we are using the binary outcome "placed" as the depedant variable
y=olympic_df["placed"]

In [6]:
y.value_counts()

0    8458
1    1651
Name: placed, dtype: int64

### Splitting the dataset using train_test_split 

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Logistic Regression using Naive Random Oversampling

In [8]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 6330, 1: 6330})

In [9]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X, y)

LogisticRegression(random_state=1)

In [10]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5

In [11]:
#initialize a list to capture all of the Balanced Accuracy Scores for summary table
BAS_list = []
BAS_list.append(balanced_accuracy_score(y_test, y_pred))

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[2128,    0],
       [ 400,    0]], dtype=int64)

In [13]:
#Initialize a list of confusion matrices for summary table
CM_list=[]
CM_list.append(confusion_matrix(y_test, y_pred))

In [14]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      1.00      0.00      0.91      0.00      0.00      2128
          1       0.00      0.00      1.00      0.00      0.00      0.00       400

avg / total       0.71      0.84      0.16      0.77      0.00      0.00      2528



  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
summary_df = pd.DataFrame(columns=['pre', 'rec', 'spe', 'f1', 'geo', 'iba', 'sup'])
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup


In [16]:
#Add classification report to the summary table 
name = 'Naive Random Oversampling'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)

summary_df

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0


## SMOTE Oversampling

In [17]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)
Counter(y_resampled)

Counter({0: 6330, 1: 6330})

In [18]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BAS_list.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.5996616541353383

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1286,  842],
       [ 162,  238]], dtype=int64)

In [21]:
#add confusion matrix to CM list
CM_list.append(confusion_matrix(y_test, y_pred))

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.60      0.59      0.72      0.60      0.36      2128
          1       0.22      0.59      0.60      0.32      0.60      0.36       400

avg / total       0.78      0.60      0.60      0.66      0.60      0.36      2528



In [23]:
#Add classification report to summary table 
name = 'SMOTE'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0
SMOTE,0.22,0.6,0.6,0.32,0.6,0.36,400.0


## Undersampling

In [24]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
ccus = ClusterCentroids(random_state=1)
X_resampled, y_resampled = ccus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 1251, 1: 1251})

In [25]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [26]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BAS_list.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.5828947368421052

In [27]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1204,  924],
       [ 160,  240]], dtype=int64)

In [28]:
#add confusion matrix to CM list
CM_list.append(confusion_matrix(y_test, y_pred))

In [29]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.57      0.60      0.69      0.58      0.34      2128
          1       0.21      0.60      0.57      0.31      0.58      0.34       400

avg / total       0.78      0.57      0.59      0.63      0.58      0.34      2528



In [30]:
#Add classification report to summary table 
name = 'Undersampling with Cluster Centroids'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0
SMOTE,0.22,0.6,0.6,0.32,0.6,0.36,400.0
Undersampling with Cluster Centroids,0.21,0.6,0.57,0.31,0.58,0.34,400.0


## Combination (Over and Under) Sampling (SMOTEENN)

In [31]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 4657, 1: 6713})

In [32]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
BAS_list.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.5593609022556391

In [34]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 907, 1221],
       [ 123,  277]], dtype=int64)

In [35]:
#add confusion matrix to CM list
CM_list.append(confusion_matrix(y_test, y_pred))

In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.43      0.69      0.57      0.54      0.29      2128
          1       0.18      0.69      0.43      0.29      0.54      0.30       400

avg / total       0.77      0.47      0.65      0.53      0.54      0.29      2528



In [37]:
#Add classification report to summary table 
name = 'SMOTEENN'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0
SMOTE,0.22,0.6,0.6,0.32,0.6,0.36,400.0
Undersampling with Cluster Centroids,0.21,0.6,0.57,0.31,0.58,0.34,400.0
SMOTEENN,0.18,0.69,0.43,0.29,0.54,0.3,400.0


# Random Forest 
Again using "placed" as the dependant variable

In [38]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
clf = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [39]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test)
BAS_list.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.7270770676691729

In [40]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1541,  587],
       [ 108,  292]], dtype=int64)

In [41]:
#Add confusion matrix to CM list
CM_list.append(confusion_matrix(y_test, y_pred))

In [42]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.72      0.73      0.82      0.73      0.53      2128
          1       0.33      0.73      0.72      0.46      0.73      0.53       400

avg / total       0.84      0.73      0.73      0.76      0.73      0.53      2528



In [43]:
# List the features sorted in descending order by feature importance
features_df = pd.DataFrame(sorted(zip(clf.feature_importances_, X_test.columns), reverse=True))

features_df.rename(columns={0: "Feature Importance", 1: "Feature"},inplace=True)

features_df

Unnamed: 0,Feature Importance,Feature
0,0.263138,gdp_per_capita
1,0.172349,weight
2,0.167969,height
3,0.156555,age
4,0.027062,sex_athletics
5,0.014377,sex_aquatics
6,0.011845,sport_female
7,0.011758,sex_hockey
8,0.011218,sport_male
9,0.010396,sex_football


In [44]:
#Add classification report to summary table 
name = 'Random Forest'
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0
SMOTE,0.22,0.6,0.6,0.32,0.6,0.36,400.0
Undersampling with Cluster Centroids,0.21,0.6,0.57,0.31,0.58,0.34,400.0
SMOTEENN,0.18,0.69,0.43,0.29,0.54,0.3,400.0
Random Forest,0.33,0.73,0.72,0.46,0.73,0.53,400.0


## Random Forest using Easy Ensemble Classifier 

In [45]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
clf = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train,y_train)

In [46]:
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6431015037593986

In [47]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test)
BAS_list.append(balanced_accuracy_score(y_test, y_pred))
balanced_accuracy_score(y_test, y_pred)

0.6431015037593986

In [48]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1290,  838],
       [ 128,  272]], dtype=int64)

In [49]:
#Add confusion matrix to CM list
CM_list.append(confusion_matrix(y_test, y_pred))

In [50]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.61      0.68      0.73      0.64      0.41      2128
          1       0.25      0.68      0.61      0.36      0.64      0.42       400

avg / total       0.80      0.62      0.67      0.67      0.64      0.41      2528



In [51]:
#Add classification report to summary table 
name = 'Random Forest w/ Easy Ensemble Classifier '
class_dict= classification_report_imbalanced(y_test, y_pred,output_dict=True)
class_df= pd.DataFrame(class_dict).round(decimals=2)
summary_df = summary_df.append(class_df[1])
summary_df.rename(index={1:name},inplace=True)
summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0
SMOTE,0.22,0.6,0.6,0.32,0.6,0.36,400.0
Undersampling with Cluster Centroids,0.21,0.6,0.57,0.31,0.58,0.34,400.0
SMOTEENN,0.18,0.69,0.43,0.29,0.54,0.3,400.0
Random Forest,0.33,0.73,0.72,0.46,0.73,0.53,400.0
Random Forest w/ Easy Ensemble Classifier,0.25,0.68,0.61,0.36,0.64,0.42,400.0


## Summary Table 

In [52]:
#Add BAS and CM lists to the summary Dataframe
summary_df["BAS"] = BAS_list
summary_df["BAS"] = summary_df["BAS"].round(decimals=2)
summary_df["Confusion Matrix"] = CM_list

summary_df

Unnamed: 0,pre,rec,spe,f1,geo,iba,sup,BAS,Confusion Matrix
Naive Random Oversampling,0.0,0.0,1.0,0.0,0.0,0.0,400.0,0.5,"[[2128, 0], [400, 0]]"
SMOTE,0.22,0.6,0.6,0.32,0.6,0.36,400.0,0.6,"[[1286, 842], [162, 238]]"
Undersampling with Cluster Centroids,0.21,0.6,0.57,0.31,0.58,0.34,400.0,0.58,"[[1204, 924], [160, 240]]"
SMOTEENN,0.18,0.69,0.43,0.29,0.54,0.3,400.0,0.56,"[[907, 1221], [123, 277]]"
Random Forest,0.33,0.73,0.72,0.46,0.73,0.53,400.0,0.73,"[[1541, 587], [108, 292]]"
Random Forest w/ Easy Ensemble Classifier,0.25,0.68,0.61,0.36,0.64,0.42,400.0,0.64,"[[1290, 838], [128, 272]]"


In [53]:
#drop unneccessary columns
summary_df.drop(columns=["geo","iba","sup"],inplace=True)

#reorder the columns of the Summary df 
cols = summary_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
summary_df = summary_df[cols]

summary_df

Unnamed: 0,BAS,Confusion Matrix,pre,rec,spe,f1
Naive Random Oversampling,0.5,"[[2128, 0], [400, 0]]",0.0,0.0,1.0,0.0
SMOTE,0.6,"[[1286, 842], [162, 238]]",0.22,0.6,0.6,0.32
Undersampling with Cluster Centroids,0.58,"[[1204, 924], [160, 240]]",0.21,0.6,0.57,0.31
SMOTEENN,0.56,"[[907, 1221], [123, 277]]",0.18,0.69,0.43,0.29
Random Forest,0.73,"[[1541, 587], [108, 292]]",0.33,0.73,0.72,0.46
Random Forest w/ Easy Ensemble Classifier,0.64,"[[1290, 838], [128, 272]]",0.25,0.68,0.61,0.36


In [54]:
#export csvs
summary_df.to_csv("summary.csv")
features_df.to_csv("features.csv")