In [66]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

## Importing data

In [67]:
df_4 = pd.read_csv('../data/final_data.csv', index_col=[0])
df_4.head()

Unnamed: 0,id,age,year,sex_F,sex_M,noc_AFG,noc_AHO,noc_ALB,noc_ALG,noc_AND,...,"event_Wrestling Men's Super-Heavyweight, Greco-Roman","event_Wrestling Men's Welterweight, Freestyle","event_Wrestling Men's Welterweight, Greco-Roman","event_Wrestling Women's Featherweight, Freestyle","event_Wrestling Women's Flyweight, Freestyle","event_Wrestling Women's Heavyweight, Freestyle","event_Wrestling Women's Light-Heavyweight, Freestyle","event_Wrestling Women's Lightweight, Freestyle","event_Wrestling Women's Middleweight, Freestyle",medal
0,18289.0,18.0,2008.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
1,57077.0,27.0,1980.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
2,8528.0,24.0,2012.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
3,82322.0,22.0,2004.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
4,10102.0,18.0,2000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal


In [68]:
df_2 = pd.read_csv('../data/final_data_two_classes.csv', index_col=[0])
df_2.head()

Unnamed: 0,id,year,sex_F,sex_M,noc_AFG,noc_AHO,noc_ALB,noc_ALG,noc_AND,noc_ANG,...,"event_Wrestling Men's Super-Heavyweight, Greco-Roman","event_Wrestling Men's Welterweight, Freestyle","event_Wrestling Men's Welterweight, Greco-Roman","event_Wrestling Women's Featherweight, Freestyle","event_Wrestling Women's Flyweight, Freestyle","event_Wrestling Women's Heavyweight, Freestyle","event_Wrestling Women's Light-Heavyweight, Freestyle","event_Wrestling Women's Lightweight, Freestyle","event_Wrestling Women's Middleweight, Freestyle",medal
0,18289.0,2008.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
1,57077.0,1980.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
2,8528.0,2012.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
3,82322.0,2004.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal
4,10102.0,2000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No medal


In [69]:
X_4 = df_4.drop('medal', axis=1)
y_4 = df_4.medal

data_4 = (X_train_4, X_test_4, y_train_4, y_test_4) = train_test_split(X_4, y_4, test_size=0.3, stratify=y_4, random_state=42)

In [70]:
X_2 = df_2.drop('medal', axis=1)
y_2 = df_2.medal

data_2 = (X_train_2, X_test_2, y_train_2, y_test_2) = train_test_split(X_2, y_2, test_size=0.3, stratify=y_2, random_state=42)

## Utility functions

In [71]:
def make_model(model, **kwargs):
  return model(**kwargs)

def make_pipeline(preprocessor, model):
  return Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
  ])

def evaluate_model(pipeline, data):
  X_train, X_test, y_train, y_test = data
  pipeline.fit(X_train, y_train)
  y_hat = pipeline.predict(X_test)
  score = f1_score(y_test, y_hat, average='macro')
  return (score, y_hat)

## 4 classes prediction

In [18]:
model_not_tuned = make_model(ExtraTreesClassifier, bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                      criterion='gini', max_depth=100, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

0.664885440314426

In [34]:
param_grid = {
  'bootstrap': [False],
  'max_depth': [10, 20, 100],
  'max_features': ['sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'class_weight': ['balanced'],
  'n_estimators': [400, 1800]
}

grid_search = GridSearchCV(ExtraTreesClassifier(), param_grid=param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
grid_search.fit(X_train_4, y_train_4)

GridSearchCV(cv=5, estimator=ExtraTreesClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'class_weight': ['balanced'],
                         'max_depth': [10, 20, 100], 'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [400, 1800]},
             scoring='f1_macro')

In [35]:
grid_search.best_params_, grid_search.best_score_

({'bootstrap': False,
  'class_weight': 'balanced',
  'max_depth': 100,
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 1800},
 0.6757893838630664)

In [72]:
model = grid_search.best_estimator_
f1_score_4, y_hat_4 = evaluate_model(model, data_4)

In [73]:
f1_score_4

0.6773705921372625

In [44]:
model

ExtraTreesClassifier(class_weight='balanced', max_depth=100,
                     max_features='sqrt', n_estimators=1800)

In [46]:
print(classification_report(y_test_4, y_hat_4))

              precision    recall  f1-score   support

      Bronze       0.69      0.64      0.66      3116
        Gold       0.72      0.69      0.70      3036
    No medal       0.61      0.75      0.67      3300
      Silver       0.71      0.63      0.67      2991

    accuracy                           0.68     12443
   macro avg       0.69      0.68      0.68     12443
weighted avg       0.68      0.68      0.68     12443



## 2 classes prediction
with the same model

In [74]:
f1_score_2, y_hat_2 = evaluate_model(model, data_2)

In [75]:
f1_score_2

0.80804724554566

In [76]:
print(classification_report(y_test_2, y_hat_2))

              precision    recall  f1-score   support

       Medal       0.83      0.79      0.81      7601
    No medal       0.79      0.83      0.81      7278

    accuracy                           0.81     14879
   macro avg       0.81      0.81      0.81     14879
weighted avg       0.81      0.81      0.81     14879

