In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
import csv
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../data/processed_players.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56510 entries, 0 to 56509
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   club              56510 non-null  object 
 1   age               56510 non-null  int64  
 2   position          56510 non-null  object 
 3   mins              56510 non-null  int64  
 4   goals             56510 non-null  float64
 5   assists           56510 non-null  float64
 6   motm              56510 non-null  float64
 7   rating            56510 non-null  float64
 8   league            56510 non-null  object 
 9   traded            56510 non-null  int64  
 10  w_shots           56510 non-null  float64
 11  w_yel             56510 non-null  float64
 12  w_red             56510 non-null  float64
 13  w_aerials_won     56510 non-null  float64
 14  w_tackles         56510 non-null  float64
 15  w_interceptions   56510 non-null  float64
 16  w_fouls           56510 non-null  float6

We have to encode the categorical data, I will use one hot encoding

In [4]:
# To float
to_float = df.select_dtypes(include="float64").columns

for x in to_float:
    df[x] = df[x].astype("float16")
    


In [5]:
df= pd.get_dummies(df, columns = ['club','position','league','apps_cat'])
# Delete n - 1
df = df.drop(['apps_cat_10','league_Premier League','position_MIDFIELDER','club_Wolfsburg'],axis=1)

In [6]:
X = df.loc[:,df.columns != 'traded'].copy()
y = df.loc[:,df.columns == 'traded'].copy()

I will select a few models to see wich ones perform well.


In [7]:
models = [LogisticRegression(max_iter=1000), KNeighborsClassifier(), 
          RandomForestClassifier(random_state=33,max_depth=20),xgb.XGBClassifier(),AdaBoostClassifier()]

### Holdout

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=33)

In [8]:
scaler_train= MinMaxScaler()
scaled_X_train = scaler_train.fit_transform(X_train)

scaler_test= MinMaxScaler()
scaled_X_test = scaler_test.fit_transform(X_test)

In [10]:
for model_a in models:
        model = model_a
        model.fit(scaled_X_train,y_train.values.ravel())
        y_pred = model.predict(scaled_X_test)
        print('Model :',model_a)
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        print('---------------------------------------------------------------')

Model : LogisticRegression(max_iter=1000)
              precision    recall  f1-score   support

           0       0.74      0.96      0.84      8241
           1       0.51      0.11      0.18      3061

    accuracy                           0.73     11302
   macro avg       0.62      0.54      0.51     11302
weighted avg       0.68      0.73      0.66     11302

[[7910  331]
 [2722  339]]
---------------------------------------------------------------
Model : KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.79      0.88      0.84      8241
           1       0.55      0.39      0.45      3061

    accuracy                           0.75     11302
   macro avg       0.67      0.63      0.65     11302
weighted avg       0.73      0.75      0.73     11302

[[7287  954]
 [1881 1180]]
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
              precision    recall  f1-score

I will try to manage the imbalance using 3 techniques and observe wich one perfoms better. This 3 techniques will be SMOTE, SMOTETomek and StratifiedKFold

### SMOTE

I will scale down the values of X

In [39]:
scaler= MinMaxScaler()
scaled_X = scaler.fit_transform(X)


In [29]:
def compare_models_smote(smote,x,y):
    scoring = ['precision', 'recall', 'f1']
    for model in models:
        if smote:
            steps = [('over', SMOTE()), ('model', model)]
        else:
            steps = [('over', SMOTETomek(sampling_strategy=0.75)), ('model', model)]
            
        pipeline = Pipeline(steps=steps)
        
        kfold = RepeatedStratifiedKFold(n_splits=5,n_repeats=3,random_state=33)
            
        results = cross_validate(pipeline, x, y.values.ravel(), cv=kfold,scoring=scoring)
       
        precision = results['test_precision']
        recall = results['test_recall']
        f1 = results['test_f1']
        
        print('Model :',model)
        print("Precision mean =", np.mean(precision), "std =",np.std(precision))
        print("Recall mean =", np.mean(recall), "std =",np.std(recall))
        print("F1-Score mean =", np.mean(f1), "std =",np.std(f1))
        print('---------------------------------------------------------------')

In [37]:
def compare_models(x,y):
    scoring = ['precision', 'recall', 'f1']
    for model in models:
        
        kfold = RepeatedStratifiedKFold(n_splits=5,n_repeats=3,random_state=33)
            
        results = cross_validate(model, x, y.values.ravel(), cv=kfold,scoring=scoring)
       
        precision = results['test_precision']
        recall = results['test_recall']
        f1 = results['test_f1']
        
        print('Model :',model)
        print("Precision mean =", np.mean(precision), "std =",np.std(precision))
        print("Recall mean =", np.mean(recall), "std =",np.std(recall))
        print("F1-Score mean =", np.mean(f1), "std =",np.std(f1))
        print('---------------------------------------------------------------')

In [18]:
compare_models_smote(smote=True,x=scaled_X,y=y)

Model : LogisticRegression(max_iter=1000)
Precision mean = 0.3679508100902268 std = 0.002763481146665256
Recall mean = 0.6077137505988937 std = 0.008246111599569948
F1-Score mean = 0.4583583445645678 std = 0.003935814824372998
---------------------------------------------------------------
Model : KNeighborsClassifier()
Precision mean = 0.40480077523101654 std = 0.003240666700309056
Recall mean = 0.7216995513741887 std = 0.00305947285662894
F1-Score mean = 0.5186729775023988 std = 0.0033217711766695673
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
Precision mean = 0.8819563777977755 std = 0.004864389375421596
Recall mean = 0.7531033581601986 std = 0.005909012964638708
F1-Score mean = 0.812440390494457 std = 0.004595866648493326
---------------------------------------------------------------
Model : XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsa

### SMOTETomek

In [30]:
compare_models_smote(smote=False,x=scaled_X,y=y)

Model : LogisticRegression(max_iter=1000)
Precision mean = 0.4109217359241721 std = 0.00678392736948578
Recall mean = 0.43166120795298557 std = 0.010941761325529493
F1-Score mean = 0.420990639150891 std = 0.007796358040504468
---------------------------------------------------------------
Model : KNeighborsClassifier()
Precision mean = 0.43779702618547767 std = 0.004476855182752391
Recall mean = 0.6726774540721562 std = 0.007132776426834519
F1-Score mean = 0.5303753453350407 std = 0.004345688237348155
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
Precision mean = 0.9370012376564391 std = 0.004437663830113479
Recall mean = 0.7997080838254144 std = 0.009911009599979186
F1-Score mean = 0.8628880953845979 std = 0.005906226997868003
---------------------------------------------------------------
Model : XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsa

### Stratified KFold

In [33]:
compare_models(x=scaled_X,y=y)

Model : LogisticRegression(max_iter=1000)
Precision mean = 0.49050079370906347 std = 0.018642417420074858
Recall mean = 0.06559524518262917 std = 0.004110766819022617
F1-Score mean = 0.11566875540682088 std = 0.006539502634903284
---------------------------------------------------------------
Model : KNeighborsClassifier()
Precision mean = 0.5581861604399022 std = 0.0065149074002714205
Recall mean = 0.39158956931535016 std = 0.011564295721753505
F1-Score mean = 0.4601965057301159 std = 0.009152033881228915
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
Precision mean = 0.985017653413404 std = 0.0033144227076752387
Recall mean = 0.8170872070498031 std = 0.01113676422819419
F1-Score mean = 0.8931858934429454 std = 0.006847838680280675
---------------------------------------------------------------
Model : XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, c

Based on the results, I would want to develop a Random Forest Classifier. I will use StratifiedKFold since the results are the best but primarly because there is not a modification of the original dataset unlike with SMOTE or SMOTETomek. Similar results, I am always goign with no modification methods.

Anyways, I think some overfitti is happening and I guess the reason may be the max depth of the trees taht by default is set to None.

### Hyperparameter Tuning

#### Random Forest

Random Search CV to get clue of where to start.

In [41]:

param_grid = {
    'max_depth': [30,50,100,200],
    'min_samples_leaf': [1, 2, 3,4],
    'min_samples_split': [2, 5, 10,20],
    'n_estimators': [100, 500, 700,1200],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}
kfold = StratifiedKFold(n_splits=3,shuffle=True,random_state=33)
rf = RandomForestClassifier()
grid_search = RandomizedSearchCV(estimator = rf,n_iter= 25 ,param_distributions = param_grid,scoring='f1', 
                          cv = kfold, n_jobs = -1, verbose = 2)

In [42]:
best_model_rf = grid_search.fit(scaled_X_train,y_train.values.ravel())

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [47]:
print("Best params: ",best_model_rf.best_params_)
print("Best score: ",best_model_rf.best_score_)

Best params:  {'n_estimators': 1200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 50, 'criterion': 'gini', 'bootstrap': False}
Best score:  0.7187652609744731


### Evaluation

In [59]:
rf_ev = RandomForestClassifier(n_estimators=1200, min_samples_split= 2, min_samples_leaf= 1, max_depth= 50, criterion= 'gini', bootstrap= False)
rf_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = rf_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      8874
           1       0.78      0.98      0.87      2428

    accuracy                           0.94     11302
   macro avg       0.89      0.95      0.91     11302
weighted avg       0.95      0.94      0.94     11302

[[8203  671]
 [  38 2390]]


In [58]:
y_train_pred = rf_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32964
           1       1.00      1.00      1.00     12244

    accuracy                           1.00     45208
   macro avg       1.00      1.00      1.00     45208
weighted avg       1.00      1.00      1.00     45208

[[32963     1]
 [    0 12244]]


The results are really good but there is some overfiting, the model fits perfectly the train set but the precision gap between the 2 predictions is really high. I guess it is because the max depth still being really high. I will try with lower values.

In [21]:
rf_ev = RandomForestClassifier(class_weight='balanced',n_estimators=1200, min_samples_split= 2, min_samples_leaf = 1, max_depth = 8, criterion= 'gini', bootstrap= False)
rf_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = rf_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.65      0.82      0.72      6523
           1       0.61      0.39      0.48      4779

    accuracy                           0.64     11302
   macro avg       0.63      0.60      0.60     11302
weighted avg       0.63      0.64      0.62     11302

[[5326 1197]
 [2915 1864]]


In [18]:
y_train_pred = rf_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       0.66      0.84      0.74     26027
           1       0.65      0.42      0.51     19181

    accuracy                           0.66     45208
   macro avg       0.66      0.63      0.62     45208
weighted avg       0.66      0.66      0.64     45208

[[21750  4277]
 [11213  7968]]


This looks good, now not overfitting is happening but I would like to do a last check

# PROBAR SMOTETOMKEK CON TRAIN DATASET Y PREDECIR Y (TEST SIN SMOTETEK)

#### KNN

In [31]:
param_grid = {
    'leaf_size' : [5,9,13,15,21],
    'n_neighbors' : [3,7,13,23,29],
    'p':[1,2],
}
kfold = StratifiedKFold(n_splits=3,shuffle=True,random_state=33)
knn= KNeighborsClassifier()
grid_search = RandomizedSearchCV(estimator = knn,n_iter= 25 ,param_distributions = param_grid,scoring='f1', 
                          cv = kfold, n_jobs = -1, verbose = 2)

In [32]:
best_model_knn = grid_search.fit(scaled_X_train,y_train.values.ravel())

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [33]:
print("Best params: ",best_model_knn.best_params_)
print("Best score: ",best_model_knn.best_score_)

Best params:  {'p': 2, 'n_neighbors': 3, 'leaf_size': 5}
Best score:  0.39201130704609416


In [34]:
knn_ev = KNeighborsClassifier(p= 2,n_neighbors= 3, leaf_size=5)
knn_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = knn_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      8490
           1       0.36      0.39      0.38      2812

    accuracy                           0.68     11302
   macro avg       0.58      0.58      0.58     11302
weighted avg       0.69      0.68      0.68     11302

[[6535 1955]
 [1706 1106]]


In [35]:
y_train_pred = knn_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96     33200
           1       0.88      0.89      0.89     12008

    accuracy                           0.94     45208
   macro avg       0.92      0.92      0.92     45208
weighted avg       0.94      0.94      0.94     45208

[[31695  1505]
 [ 1268 10740]]


It is overfitting but we will not try to solve because we can already see that it won´t perform better than the Random Forest.

### Final Model

We will go for a Random Forest trained with StratifiedKFold since gives the best perfomance without overfitting.

In [None]:
rf_ev = RandomForestClassifier(class_weight='balanced',n_estimators=1200, min_samples_split= 2, min_samples_leaf = 1, max_depth = 8, criterion= 'gini', bootstrap= False)
rf_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = rf_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))