In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../data/processed_players.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56510 entries, 0 to 56509
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   club              56510 non-null  object 
 1   age               56510 non-null  int64  
 2   position          56510 non-null  object 
 3   mins              56510 non-null  int64  
 4   goals             56510 non-null  float64
 5   assists           56510 non-null  float64
 6   motm              56510 non-null  float64
 7   rating            56510 non-null  float64
 8   league            56510 non-null  object 
 9   traded            56510 non-null  int64  
 10  w_shots           56510 non-null  float64
 11  w_yel             56510 non-null  float64
 12  w_red             56510 non-null  float64
 13  w_aerials_won     56510 non-null  float64
 14  w_tackles         56510 non-null  float64
 15  w_interceptions   56510 non-null  float64
 16  w_fouls           56510 non-null  float6

We have to encode the categorical data, I will use one hot encoding

In [4]:
# To float
to_float = df.select_dtypes(include="float64").columns

for x in to_float:
    df[x] = df[x].astype("float16")
    


In [5]:
df= pd.get_dummies(df, columns = ['club','position','league','apps_cat'])
# Delete n - 1
df = df.drop(['apps_cat_10','league_Premier League','position_MIDFIELDER','club_Wolfsburg'],axis=1)

In [6]:
X = df.loc[:,df.columns != 'traded'].copy()
y = df.loc[:,df.columns == 'traded'].copy()

I will select a few models to see wich ones perform well.


In [7]:
models = [LogisticRegression(max_iter=1000), KNeighborsClassifier(), 
          RandomForestClassifier(random_state=33),xgb.XGBClassifier(),AdaBoostClassifier()]

I will try to manage the imbalance using 2 techniques and observe wich one perfoms better. This 2 techniques will be SMOTE and StratifiedKFold. I would prefer to don´t use Smote because oversampling tend to overfit.

### SMOTE

In [8]:
oversample = SMOTE()
X_smote, y_smote = oversample.fit_resample(X, y)


I will scale down the values of X

In [9]:
scaler= MinMaxScaler()
scaled_X_smote = scaler.fit_transform(X_smote)

In [20]:
def compare_models(kfold,x,y):
    scoring = ['precision', 'recall', 'f1']
    for model in models:
        if kfold:
            kfold = KFold(5)
        else:
            kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=33)
            
        results = cross_validate(model, x, y.values.ravel(), cv=kfold,scoring=scoring)
       
        precision = results['test_precision']
        recall = results['test_recall']
        f1 = results['test_f1']
        
        print('Model :',model)
        print("Precision mean =", np.mean(precision), "std =",np.std(precision))
        print("Recall mean =", np.mean(recall), "std =",np.std(recall))
        print("F1-Score mean =", np.mean(f1), "std =",np.std(f1))
        print('---------------------------------------------------------------')

In [14]:
compare_models(kfold=True,x=scaled_X_smote,y=y_smote)

Model : LogisticRegression(max_iter=1000)
Precision mean = 0.5217696066801734 std = 0.319274194421967
Recall mean = 0.7939486795987744 std = 0.017469401052132304
F1-Score mean = 0.5758157935027822 std = 0.22024035271752884
---------------------------------------------------------------
Model : KNeighborsClassifier()
Precision mean = 0.5176330996896358 std = 0.3133827362173911
Recall mean = 0.9064004314067386 std = 0.015570910255175137
F1-Score mean = 0.6089142351487975 std = 0.24441097072309204
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
Precision mean = 0.5643639459303016 std = 0.3106424011189775
Recall mean = 0.9364556764622692 std = 0.010762482089829458
F1-Score mean = 0.6579395984517982 std = 0.2348559276421165
---------------------------------------------------------------
Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
             

### Stratified KFold

In [15]:
scaler= MinMaxScaler()
scaled_X = scaler.fit_transform(X)


In [21]:
compare_models(kfold=False,x=scaled_X,y=y)

Model : LogisticRegression(max_iter=1000)
Precision mean = 0.4907451716578676 std = 0.02039466949708226
Recall mean = 0.0655295940949016 std = 0.003962203025390826
F1-Score mean = 0.11559527831681046 std = 0.006542088451075534
---------------------------------------------------------------
Model : KNeighborsClassifier()
Precision mean = 0.3301110862215878 std = 0.024543586001164755
Recall mean = 0.19431472569256664 std = 0.03153334544650518
F1-Score mean = 0.24354193880443695 std = 0.029919061365110074
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
Precision mean = 0.5032479841545274 std = 0.02078901467911748
Recall mean = 0.04943223864171571 std = 0.006809517148895987
F1-Score mean = 0.08980213786886987 std = 0.011055086904675653
---------------------------------------------------------------
Model : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,


### Holdout

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=33)

In [24]:
scaler_train= MinMaxScaler()
scaled_X_train = scaler_train.fit_transform(X_train)

scaler_test= MinMaxScaler()
scaled_X_test = scaler_test.fit_transform(X_test)

In [26]:
for model_a in models:
        model = model_a
        model.fit(scaled_X_train,y_train.values.ravel())
        y_pred = model.predict(scaled_X_test)
        print('Model :',model_a)
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        print('---------------------------------------------------------------')

Model : LogisticRegression(max_iter=1000)
              precision    recall  f1-score   support

           0       0.74      0.96      0.84      8241
           1       0.51      0.11      0.18      3061

    accuracy                           0.73     11302
   macro avg       0.62      0.54      0.51     11302
weighted avg       0.68      0.73      0.66     11302

[[7910  331]
 [2722  339]]
---------------------------------------------------------------
Model : KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.79      0.88      0.84      8241
           1       0.55      0.39      0.45      3061

    accuracy                           0.75     11302
   macro avg       0.67      0.63      0.65     11302
weighted avg       0.73      0.75      0.73     11302

[[7287  954]
 [1881 1180]]
---------------------------------------------------------------
Model : RandomForestClassifier(random_state=33)
              precision    recall  f1-score

Estudiar bien que hacer aqui