# Model Selection

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import csv
import joblib

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', None)

In [7]:
df = pd.read_csv('../data/processed_players.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56510 entries, 0 to 56509
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   club              56510 non-null  object 
 1   age               56510 non-null  int64  
 2   position          56510 non-null  object 
 3   mins              56510 non-null  int64  
 4   goals             56510 non-null  float64
 5   assists           56510 non-null  float64
 6   motm              56510 non-null  float64
 7   rating            56510 non-null  float64
 8   league            56510 non-null  object 
 9   traded            56510 non-null  int64  
 10  w_shots           56510 non-null  float64
 11  w_yel             56510 non-null  float64
 12  w_red             56510 non-null  float64
 13  w_aerials_won     56510 non-null  float64
 14  w_tackles         56510 non-null  float64
 15  w_interceptions   56510 non-null  float64
 16  w_fouls           56510 non-null  float6

We have to encode the categorical data, I will use one hot encoding

In [14]:
df

Unnamed: 0,age,mins,goals,assists,motm,rating,traded,w_shots,w_yel,w_red,w_aerials_won,w_tackles,w_interceptions,w_fouls,w_offsides_won,w_clearances,w_dribbled,w_blocks,w_own_goals,w_key_passes,w_dribblings,w_fouled,w_offsides,w_dispossed,w_bad_controls,w_avg_passes,w_crosses,w_long_passes,w_through_passes,w_ps_avg_passes,club_AC Milan,club_Almeria,club_Amiens,club_Angers,club_Arminia Bielefeld,club_Arsenal,club_Aston Villa,club_Atalanta,club_Athletic Bilbao,club_Atletico,club_Augsburg,club_Auxerre,club_Barcelona,club_Bari,club_Bayern,club_Birmingham,club_Blackburn,club_Bochum,club_Bologna,club_Bolton,club_Bordeaux,club_Borussia Dortmund,club_Borussia M.Gladbach,club_Bournemouth,club_Brentford,club_Brest,club_Brighton,club_Burnley,club_Cadiz,club_Caen,club_Cagliari,club_Catania,club_Celta Vigo,club_Cesena,club_Chelsea,club_Chievo,club_Clermont Foot,club_Crotone,club_Crystal Palace,club_Darmstadt,club_Deportivo,club_Deportivo Alaves,club_Dijon,club_Eibar,club_Eintracht Frankfurt,club_Elche,club_Empoli,club_Espanyol,club_Everton,club_Evian,club_FC Koln,club_Fiorentina,club_Fortuna Duesseldorf,club_Freiburg,club_Fulham,club_Genoa,club_Getafe,club_Girona,club_Granada,club_Guingamp,club_Hamburg,club_Hannover,club_Hertha Berlin,club_Hoffenheim,club_Huddersfield,club_Hull,club_Ingolstadt,club_Inter,club_Juventus,club_Kaiserslautern,club_Las Palmas,club_Lazio,club_Lecce,club_Leeds,club_Leganes,club_Leicester,club_Lens,club_Levante,club_Leverkusen,club_Lille,club_Liverpool,club_Lorient,club_Lyon,club_Mainz,club_Malaga,club_Mallorca,club_Man City,club_Man Utd,club_Marseille,club_Metz,club_Monaco,club_Montpellier,club_Nancy,club_Nantes,club_Napoli,club_Newcastle,club_Nice,club_Nimes,club_Norwich,club_Nuernberg,club_Osasuna,club_PSG,club_Palermo,club_Parma Calcio 1913,club_QPR,club_RBL,club_Racing Santander,club_Rayo Vallecano,club_Real Betis,club_Real Madrid,club_Real Sociedad,club_Real Valladolid,club_Real Zaragoza,club_Reims,club_Rennes,club_Roma,club_SC Bastia,club_SPAL 2013,club_Saint-Etienne,club_Salernitana,club_Sampdoria,club_Sassuolo,club_Schalke,club_Sevilla,club_Sheff Utd,club_Siena,club_Sochaux,club_Southampton,club_Spezia,club_Sporting Gijon,club_Stoke,club_Strasbourg,club_Stuttgart,club_Sunderland,club_Swansea,club_Torino,club_Tottenham,club_Toulouse,club_Troyes,club_Udinese,club_Union Berlin,club_Valencia,club_Valenciennes,club_Verona,club_Villarreal,club_Watford,club_Werder Bremen,club_West Brom,club_West Ham,club_Wigan,club_Wolfsburg,club_Wolves,position_AM(CL),position_AM(CLR),position_AM(CR),position_AM(L),position_AM(LR),position_AM(R),position_D(C),position_D(CL),position_D(CLR),position_D(CR),position_D(L),position_D(LR),position_D(R),position_DEFENDER,position_DMC,position_FORWARD,position_FW,position_M(C),position_M(CL),position_M(CLR),position_M(CR),position_M(L),position_M(LR),position_M(R),position_MIDFIELDER,league_LaLiga,league_Ligue 1,league_Premier League,league_Serie A,apps_cat_1,apps_cat_2,apps_cat_3,apps_cat_4,apps_cat_6,apps_cat_7,apps_cat_8,apps_cat_9,apps_cat_10
0,26,1779,16.0,6.0,8.0,8.179688,0,2.146484,0.631348,0.0,0.252686,0.378906,0.315674,0.568359,0.0,0.000000,0.442139,0.000000,0.0,1.136719,2.525391,1.389648,0.189453,1.326172,0.000000,21.656250,0.947266,1.262695,0.126343,28.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,26,1779,16.0,6.0,8.0,8.179688,0,2.146484,0.631348,0.0,0.252686,0.378906,0.315674,0.568359,0.0,0.000000,0.442139,0.000000,0.0,1.136719,2.525391,1.389648,0.189453,1.326172,0.000000,21.656250,0.947266,1.262695,0.126343,28.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,26,2924,21.0,5.0,6.0,7.789062,0,2.605469,2.605469,0.0,1.823242,1.041992,0.607910,2.171875,0.0,0.260498,0.434326,0.086853,0.0,1.389648,0.955078,2.691406,1.563477,1.910156,0.000000,28.218750,0.173706,0.347412,0.086853,23.968750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,26,2924,21.0,5.0,6.0,7.789062,0,2.605469,2.605469,0.0,1.823242,1.041992,0.607910,2.171875,0.0,0.260498,0.434326,0.086853,0.0,1.389648,0.955078,2.691406,1.563477,1.910156,0.000000,28.218750,0.173706,0.347412,0.086853,23.968750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,28,2768,10.0,13.0,8.0,7.738281,1,1.958008,5.710938,0.0,0.163208,1.549805,0.489502,1.223633,0.0,0.163208,0.979004,0.081604,0.0,2.691406,1.223633,1.712891,0.244751,2.529297,0.000000,41.687500,2.039062,4.648438,0.407959,40.343750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56505,20,12,0.0,0.0,0.0,5.859375,1,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.026321,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.052643,0.000000,0.000000,0.000000,1.018555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56506,26,85,0.0,0.0,0.0,5.859375,1,0.000000,0.000000,0.0,0.026321,0.026321,0.026321,0.000000,0.0,0.026321,0.078918,0.000000,0.0,0.000000,0.000000,0.000000,0.026321,0.000000,0.052643,1.079102,0.000000,0.052643,0.000000,17.828125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56507,26,85,0.0,0.0,0.0,5.859375,1,0.000000,0.000000,0.0,0.026321,0.026321,0.026321,0.000000,0.0,0.026321,0.078918,0.000000,0.0,0.000000,0.000000,0.000000,0.026321,0.000000,0.052643,1.079102,0.000000,0.052643,0.000000,17.828125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56508,32,136,0.0,0.0,0.0,5.308594,0,0.026321,0.052643,0.0,0.000000,0.026321,0.000000,0.026321,0.0,0.105286,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.552734,0.000000,0.000000,0.000000,8.148438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# To float
to_float = df.select_dtypes(include="float64").columns

for x in to_float:
    df[x] = df[x].astype("float16")
    


In [9]:
# One hot encoder for cat variables
onehot_encoder_club = OneHotEncoder(handle_unknown='ignore',drop='first')
onehot_encoder_pos = OneHotEncoder(handle_unknown='ignore',drop='first')
onehot_encoder_league = OneHotEncoder(handle_unknown='ignore',drop='first')
onehot_encoder_apps = OneHotEncoder(handle_unknown='ignore',drop='first')

clubs_df = pd.DataFrame(onehot_encoder_club.fit_transform(df[['club']]).toarray() , 
                        columns=onehot_encoder_club.get_feature_names_out(['club']))

pos_df = pd.DataFrame(onehot_encoder_pos.fit_transform(df[['position']]).toarray(),
                        columns=onehot_encoder_pos.get_feature_names_out(['position']))

league_df = pd.DataFrame(onehot_encoder_league.fit_transform(df[['league']]).toarray(),
                        columns=onehot_encoder_league.get_feature_names_out(['league']))

apps_df = pd.DataFrame(onehot_encoder_apps.fit_transform(df[['apps_cat']]).toarray(),
                        columns=onehot_encoder_apps.get_feature_names_out(['apps_cat']))

df = df.join(clubs_df)
df = df.join(pos_df)
df = df.join(league_df)
df = df.join(apps_df)

df = df.drop(['club','position','league','apps_cat'],axis=1)

In [10]:
X = df.loc[:,df.columns != 'traded'].copy()
y = df.loc[:,df.columns == 'traded'].copy()

I will select a few models to see wich ones perform well.


In [None]:
models = [LogisticRegression(max_iter=1000), KNeighborsClassifier(), 
          RandomForestClassifier(random_state=33,max_depth=50),xgb.XGBClassifier(),AdaBoostClassifier()]

### Holdout

I will do a first attemp with the holdout method to quickly see which models would perform better.

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=33)

In [23]:
#Scale down both datasets
scaler_train= MinMaxScaler()
scaled_X_train = scaler_train.fit_transform(X_train)

scaler_test= MinMaxScaler()
scaled_X_test = scaler_test.fit_transform(X_test)

In [None]:
for model_a in models:
        model = model_a
        model.fit(scaled_X_train,y_train.values.ravel())
        y_pred = model.predict(scaled_X_test)
        print('Model :',model_a)
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        print('---------------------------------------------------------------')

Model : LogisticRegression(max_iter=1000)
              precision    recall  f1-score   support

           0       0.74      0.96      0.84      8241
           1       0.51      0.11      0.18      3061

    accuracy                           0.73     11302
   macro avg       0.62      0.54      0.51     11302
weighted avg       0.68      0.73      0.66     11302

[[7910  331]
 [2722  339]]
---------------------------------------------------------------
Model : KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.79      0.88      0.84      8241
           1       0.55      0.39      0.45      3061

    accuracy                           0.75     11302
   macro avg       0.67      0.63      0.65     11302
weighted avg       0.73      0.75      0.73     11302

[[7287  954]
 [1881 1180]]
---------------------------------------------------------------
Model : RandomForestClassifier(max_depth=50, random_state=33)
              precision    re

The best models are Random Forest and KNN. Probably they are overfitting but this is a problem that we would solve later. For now we will study deeper this 2 models.

Firstly we want to try which technique wouls work better to solve the imbalance data problematic. We are going to try SMOTE, SMOTETomek and just Stratified K Fold.

### SMOTE

In [None]:
models = [ KNeighborsClassifier(), RandomForestClassifier(random_state=33,max_depth=50)]

I will scale down the values of X

In [11]:
scaler_X= MinMaxScaler()
scaled_X = scaler_X.fit_transform(X)


In [None]:
def compare_models_smote(smote,x,y):
    scoring = ['precision', 'recall', 'f1']
    for model in models:
        if smote:
            steps = [('over', SMOTE()), ('model', model)]
        else:
            steps = [('over', SMOTETomek(sampling_strategy=0.75)), ('model', model)]
            
        pipeline = Pipeline(steps=steps)
        
        kfold = RepeatedStratifiedKFold(n_splits=5,n_repeats=3,random_state=33)
            
        results = cross_validate(pipeline, x, y.values.ravel(), cv=kfold,scoring=scoring)
       
        precision = results['test_precision']
        recall = results['test_recall']
        f1 = results['test_f1']
        
        print('Model :',model)
        print("Precision mean =", np.mean(precision), "std =",np.std(precision))
        print("Recall mean =", np.mean(recall), "std =",np.std(recall))
        print("F1-Score mean =", np.mean(f1), "std =",np.std(f1))
        print('---------------------------------------------------------------')

In [None]:
def compare_models(x,y):
    scoring = ['precision', 'recall', 'f1']
    for model in models:
        
        kfold = RepeatedStratifiedKFold(n_splits=5,n_repeats=3,random_state=33)
            
        results = cross_validate(model, x, y.values.ravel(), cv=kfold,scoring=scoring)
       
        precision = results['test_precision']
        recall = results['test_recall']
        f1 = results['test_f1']
        
        print('Model :',model)
        print("Precision mean =", np.mean(precision), "std =",np.std(precision))
        print("Recall mean =", np.mean(recall), "std =",np.std(recall))
        print("F1-Score mean =", np.mean(f1), "std =",np.std(f1))
        print('---------------------------------------------------------------')

In [None]:
compare_models_smote(smote=True,x=scaled_X,y=y)

Model : KNeighborsClassifier()
Precision mean = 0.4344121228063725 std = 0.004999282303671559
Recall mean = 0.7390566073836634 std = 0.00882745624528864
F1-Score mean = 0.5471701492268769 std = 0.005487026676754981
---------------------------------------------------------------
Model : RandomForestClassifier(max_depth=50, random_state=33)
Precision mean = 0.9351575199386986 std = 0.005998913367374764
Recall mean = 0.8534343236262901 std = 0.008963745809873507
F1-Score mean = 0.8924055322864948 std = 0.006367887341456651
---------------------------------------------------------------


### SMOTETomek

In [None]:
compare_models_smote(smote=False,x=scaled_X,y=y)

Model : KNeighborsClassifier()
Precision mean = 0.43744494597917455 std = 0.004471207391905086
Recall mean = 0.6733742944908638 std = 0.009754629247805674
F1-Score mean = 0.5303277839321489 std = 0.005178178968429531
---------------------------------------------------------------
Model : RandomForestClassifier(max_depth=50, random_state=33)
Precision mean = 0.9384977136334952 std = 0.006304899918911443
Recall mean = 0.8016466117886165 std = 0.010308873031732436
F1-Score mean = 0.8646571821395276 std = 0.007234219453985614
---------------------------------------------------------------


### Stratified KFold

In [None]:
compare_models(x=scaled_X,y=y)

Model : KNeighborsClassifier()
Precision mean = 0.5581685850601972 std = 0.006503354388943728
Recall mean = 0.39158956931535016 std = 0.011564295721753505
F1-Score mean = 0.46019057831575033 std = 0.009150903851754537
---------------------------------------------------------------
Model : RandomForestClassifier(max_depth=50, random_state=33)
Precision mean = 0.9848104227409387 std = 0.004430961509284331
Recall mean = 0.8182195638392102 std = 0.010198446642670686
F1-Score mean = 0.8937832031097089 std = 0.006539042313802806
---------------------------------------------------------------


Based on the results, I would want to develop a Random Forest Classifier. I will use StratifiedKFold since the results are the best but primarly because there is not a modification of the original dataset unlike with SMOTE or SMOTETomek. Similar results, I am always going with no modification methods.

Anyways, I think some overfitting is happening.

### Hyperparameter Tuning

#### Random Forest

Random Search CV to get clue of where to start.

In [None]:

param_grid = {
    'max_depth': [30,50,100,200],
    'min_samples_leaf': [1, 2, 3,4],
    'min_samples_split': [2, 5, 10,20],
    'n_estimators': [100, 500, 700,1200],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}
kfold = StratifiedKFold(n_splits=3,shuffle=True,random_state=33)
rf = RandomForestClassifier()
grid_search = RandomizedSearchCV(estimator = rf,n_iter= 25 ,param_distributions = param_grid,scoring='f1', 
                          cv = kfold, n_jobs = -1, verbose = 2)

In [None]:
best_model_rf = grid_search.fit(scaled_X_train,y_train.values.ravel())

Fitting 3 folds for each of 25 candidates, totalling 75 fits




In [None]:
print("Best params: ",best_model_rf.best_params_)
print("Best score: ",best_model_rf.best_score_)

Best params:  {'n_estimators': 700, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 100, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.7185585455402602


### Evaluation

In [None]:
rf_ev = RandomForestClassifier(n_estimators=700, min_samples_split= 10, min_samples_leaf= 1, max_depth= 100, criterion= 'entropy', bootstrap= False)
rf_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = rf_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92      9555
           1       0.56      0.97      0.71      1747

    accuracy                           0.88     11302
   macro avg       0.78      0.92      0.81     11302
weighted avg       0.93      0.88      0.89     11302

[[8196 1359]
 [  45 1702]]


In [None]:
y_train_pred = rf_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32968
           1       1.00      1.00      1.00     12240

    accuracy                           1.00     45208
   macro avg       1.00      1.00      1.00     45208
weighted avg       1.00      1.00      1.00     45208

[[32963     5]
 [    0 12240]]


The model is overfitting and I runed a deeper tuning in another machine and It gave the next results:

In [None]:
rf_ev = RandomForestClassifier(n_estimators=1200, min_samples_split= 2, min_samples_leaf= 1, max_depth= 50, criterion= 'gini', bootstrap= False)
rf_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = rf_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.92      0.96      8876
           1       0.78      0.98      0.87      2426

    accuracy                           0.94     11302
   macro avg       0.89      0.95      0.91     11302
weighted avg       0.95      0.94      0.94     11302

[[8199  677]
 [  42 2384]]


In [None]:
y_train_pred = rf_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32964
           1       1.00      1.00      1.00     12244

    accuracy                           1.00     45208
   macro avg       1.00      1.00      1.00     45208
weighted avg       1.00      1.00      1.00     45208

[[32963     1]
 [    0 12244]]


The results are really good, better than teh first one but there is also some overfiting, the model fits perfectly the train set but the precision gap between the 2 predictions is really high. I guess it is because the max depth still being really high. I will try with lower values.

In [None]:
rf_ev = RandomForestClassifier(class_weight='balanced',n_estimators=1200, min_samples_split= 2, min_samples_leaf = 1, max_depth = 8, criterion= 'gini', bootstrap= False)
rf_ev.fit(scaled_X_train,y_train.values.ravel())
y_pred = rf_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.64      0.82      0.72      6506
           1       0.61      0.39      0.48      4796

    accuracy                           0.64     11302
   macro avg       0.63      0.60      0.60     11302
weighted avg       0.63      0.64      0.62     11302

[[5313 1193]
 [2928 1868]]


In [None]:
y_train_pred = rf_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       0.66      0.83      0.74     25973
           1       0.65      0.41      0.51     19235

    accuracy                           0.66     45208
   macro avg       0.65      0.62      0.62     45208
weighted avg       0.65      0.66      0.64     45208

[[21682  4291]
 [11281  7954]]


After some test I found out that the best max depth is 8, this as we can see doesn´t overfit using a Holdout method. To confirm this I will use this model with Stratified K Fold and check wether overfits or not.

### Stratified k fold

In [None]:

clf = RandomForestClassifier(class_weight='balanced',n_estimators=1200, min_samples_split= 2, min_samples_leaf= 1, max_depth= 8, criterion= 'gini', bootstrap= False)


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

y_train=np.array(y_train)
# Loop through the folds
for train_index, test_index in skf.split(scaled_X_train, y_train):

    X_train_skf, X_test_skf = scaled_X_train[train_index], scaled_X_train[test_index]
    y_train_skf, y_test_skf = y_train[train_index], y_train[test_index]


    clf.fit(X_train_skf, y_train_skf.ravel())


    y_pred = clf.predict(X_test_skf)


    cm = confusion_matrix(y_pred, y_test_skf)
    cr =classification_report(y_pred, y_test_skf)

    print('TEST')
    print(cr)
    print(cm)
    y_pred_train = clf.predict(X_train_skf)
    print('----------------------------------------------------------')
    print('TRAIN')

    cm2 = confusion_matrix(y_pred_train, y_train_skf)
    cr2 =classification_report(y_pred_train, y_train_skf)

    print(cr2)
    print(cm2)

TEST
              precision    recall  f1-score   support

           0       0.64      0.81      0.71      2594
           1       0.60      0.38      0.46      1927

    accuracy                           0.63      4521
   macro avg       0.62      0.60      0.59      4521
weighted avg       0.62      0.63      0.61      4521

[[2102  492]
 [1195  732]]
----------------------------------------------------------
TRAIN
              precision    recall  f1-score   support

           0       0.66      0.84      0.74     23464
           1       0.65      0.42      0.51     17223

    accuracy                           0.66     40687
   macro avg       0.66      0.63      0.62     40687
weighted avg       0.66      0.66      0.64     40687

[[19631  3833]
 [10035  7188]]
TEST
              precision    recall  f1-score   support

           0       0.66      0.83      0.73      2600
           1       0.64      0.41      0.50      1921

    accuracy                           0.65      

In [None]:
# Make predictions on the test data
y_pred = clf.predict(scaled_X_test)
# Compute the confusion matrix
cm = confusion_matrix(y_pred, y_test)
cr =classification_report(y_pred, y_test)
print(cr)
print(cm)

              precision    recall  f1-score   support

           0       0.65      0.82      0.72      6543
           1       0.61      0.39      0.48      4759

    accuracy                           0.64     11302
   macro avg       0.63      0.60      0.60     11302
weighted avg       0.63      0.64      0.62     11302

[[5340 1203]
 [2901 1858]]


As we can see, this model give a good realation between bias and variance. ESCRIBIR MAS ACERCA DE LOS RESULTADOS.

### EXTRA TEST

Even though we saw that without outsampling or oversampling the results are really similar, I would like to test this out with this model. As I said before, at similar perfomarnce we don´t want to use any modification technique.

In [None]:
rf_smt = RandomForestClassifier(class_weight='balanced',n_estimators=1200, min_samples_split= 2, min_samples_leaf = 1, max_depth = 8, criterion= 'gini', bootstrap= False)
smt=SMOTETomek(sampling_strategy=0.75)
scaled_X_train_smt,y_train_smt = smt.fit_resample(scaled_X_train,y_train)
rf_smt.fit(scaled_X_train_smt,y_train_smt.ravel())
y_pred = rf_smt.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.67      0.81      0.73      6747
           1       0.59      0.40      0.47      4555

    accuracy                           0.65     11302
   macro avg       0.63      0.60      0.60     11302
weighted avg       0.63      0.65      0.63     11302

[[5488 1259]
 [2753 1802]]


In [None]:
y_pred_train = rf_smt.predict(scaled_X_train)
print(classification_report(y_pred_train, y_train))
print(confusion_matrix(y_pred_train, y_train))

              precision    recall  f1-score   support

           0       0.69      0.83      0.75     27400
           1       0.61      0.42      0.50     17808

    accuracy                           0.67     45208
   macro avg       0.65      0.62      0.62     45208
weighted avg       0.66      0.67      0.65     45208

[[22635  4765]
 [10328  7480]]


The results are quite similar as we expected.

#### KNN

The second best model was KNN so I think it is a good idea to try KNN out. If both models perfoms similar we would want to use the quickest one if the gap between both process times is big enough.

In [25]:
param_grid = {
    'leaf_size' : [5,9,13,15,21],
    'n_neighbors' : [3,7,13,23,29],
    'p':[1,2],
}
kfold = StratifiedKFold(n_splits=3,shuffle=True,random_state=33)
knn= KNeighborsClassifier()
grid_search = RandomizedSearchCV(estimator = knn,n_iter= 25 ,param_distributions = param_grid,scoring='f1', 
                          cv = kfold, n_jobs = -1, verbose = 2)

In [28]:
best_model_knn = grid_search.fit(scaled_X_train,y_train.ravel())

Fitting 3 folds for each of 25 candidates, totalling 75 fits




In [29]:
print("Best params: ",best_model_knn.best_params_)
print("Best score: ",best_model_knn.best_score_)

Best params:  {'p': 2, 'n_neighbors': 3, 'leaf_size': 15}
Best score:  0.39201130704609416


In [31]:
knn_ev = KNeighborsClassifier(p= 2,n_neighbors= 3, leaf_size=15)
knn_ev.fit(scaled_X_train,y_train.ravel())
y_pred = knn_ev.predict(scaled_X_test)
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      8490
           1       0.36      0.39      0.38      2812

    accuracy                           0.68     11302
   macro avg       0.58      0.58      0.58     11302
weighted avg       0.69      0.68      0.68     11302

[[6535 1955]
 [1706 1106]]


In [32]:
y_train_pred = knn_ev.predict(scaled_X_train)
print(classification_report(y_train_pred, y_train))
print(confusion_matrix(y_train_pred, y_train))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96     33200
           1       0.88      0.89      0.89     12008

    accuracy                           0.94     45208
   macro avg       0.92      0.92      0.92     45208
weighted avg       0.94      0.94      0.94     45208

[[31695  1505]
 [ 1268 10740]]


It is overfitting but we will not try to solve because we can already see that it won´t perform better than the Random Forest.

### Final Model

We will go for a Random Forest trained with StratifiedKFold since gives the best perfomance without overfitting.

In [None]:
rf_final = RandomForestClassifier(class_weight='balanced',n_estimators=1200, min_samples_split= 2, min_samples_leaf= 1, max_depth= 8, criterion= 'gini', bootstrap= False)
rf_final.fit(scaled_X,y.values.ravel())

In [None]:
joblib.dump(rf_final, 'random_forest.pkl')

We also need to export the encoders and the scaler

In [12]:
# export encoders
joblib.dump(onehot_encoder_club, 'encoder_club.joblib')
joblib.dump(onehot_encoder_pos, 'encoder_position.joblib')
joblib.dump(onehot_encoder_league, 'encoder_league.joblib')
joblib.dump(onehot_encoder_apps, 'encoder_apps.joblib')

['encoder_apps.joblib']

In [13]:
# export scaler
joblib.dump(scaler_X, 'scaler.joblib')

['scaler.joblib']