In [66]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
from sklearn.utils.validation import column_or_1d
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [57]:
%run ./ml_model_init.ipynb

In [21]:
#
yc_data= data[(data['minutes_5'] >= 300) & (data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'yellow_cards', 'yellow_cards_3', 'yellow_cards_5', 'whh', 'whd', 'wha']]

yc_data_tar = data_tar[(data_tar['minutes_5'] >= 300) & (data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'yellow_cards', 'yellow_cards_3', 'yellow_cards_5', 'whh', 'whd', 'wha']]

# players = data_225 #.drop('position', axis=1) #  & (data['position'] == 'FWD')
# # players


In [None]:

yc = yc_data[['yellow_cards']]
feats = yc_data.drop('yellow_cards', axis=1)
feats_train, feats_test, yc_train, yc_test = train_test_split(feats, yc , test_size=0.2, random_state=42)


## Classifiers


### Logistic Regression


In [50]:
preds = Logistic_regression(feats_train, feats_test, column_or_1d(yc_train), column_or_1d(yc_test))

Class distribution: {0: 1320, 1: 266}


In [53]:
print(preds['class_report'])

              precision    recall  f1-score   support

           0       0.86      0.55      0.67      1320
           1       0.20      0.56      0.29       266

    accuracy                           0.55      1586
   macro avg       0.53      0.55      0.48      1586
weighted avg       0.75      0.55      0.61      1586



In [None]:
tn, fp, fn, tp = preds['conf_mat'].ravel()
(tn, fp, fn, tp)

(649, 547, 184, 232)

## RandomForestClassifier


In [60]:
rf_preds = Random_Forest_Classifier(feats_train, feats_test, column_or_1d(yc_train), column_or_1d(yc_test))

In [61]:
print(rf_preds['class_report'])

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      1320
           1       0.24      0.08      0.12       266

    accuracy                           0.80      1586
   macro avg       0.54      0.51      0.51      1586
weighted avg       0.74      0.80      0.76      1586



In [64]:
tn, fp, fn, tp = rf_preds['conf_mat'].ravel()
(tn, fp, fn, tp)

(1250, 70, 244, 22)

### Hyperparameter Tuning


In [68]:
# Create a RandomForestClassifier model
rf = RandomForestClassifier(random_state=42)

encoder = LabelEncoder()
yc_train_ = column_or_1d(encoder.fit_transform(yc_train))
yc_test_ = column_or_1d(encoder.transform(yc_test))

# bool_cols = feats_train.drop(columns=['was_home']).columns.tolist()
# categorical_cols = ['was_home']
bool_cols = feats_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = feats_train.select_dtypes(include=['object', 'category']).columns.tolist()

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(sparse=False)),
    # ('to_dense', ToDense())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, bool_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', BorderlineSMOTE(sampling_strategy='auto', random_state=42)),  # Apply SMOTE to the data
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))  # Random Forest Classifier
])


In [70]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__bootstrap': [True, False]  # Use 'model__' to specify the model's hyperparameters
}

# Setting up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy')

# Fit the model to the data
grid_search.fit(feats_train, column_or_1d(yc_train))

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")


Best Hyperparameters: {'classifier__bootstrap': False, 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}


## Baseline Model


#### Linear Model


In [6]:

lin_reg = Linear_regression(feats_train, feats_test, yc_train, yc_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = pd.DataFrame({"lin_reg": [lin_reg['train_MAE'], lin_reg['test_MAE'], lin_reg['train_RMSE'], lin_reg['test_RMSE'], lin_reg['cv_rmse'], lin_reg['R2_train'], lin_reg['R2_test']]},
                                                        index=(['train_MAE','test_MAE', 'train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

evaluation_stats

Training set RMSE: 0.36177540628641097
Test set RMSE: 0.3717311332760197
Training set R2: 0.008920007585137446
Test set R2: 0.010060463583897672


Unnamed: 0,lin_reg
train_MAE,0.261763
test_MAE,0.268943
train_RMSE,0.361775
test_RMSE,0.371731
cv_rmse,0.362387
R2_train,0.00892
R2_test,0.01006


#### DecisionTree Model


In [7]:
dt_reg = DecisionTreeRegression(feats_train, feats_test, yc_train, yc_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(dt_reg = [dt_reg['train_MAE'], dt_reg['test_MAE'], dt_reg['train_RMSE'], dt_reg['test_RMSE'], dt_reg['cv_rmse'], dt_reg['R2_train'], dt_reg['R2_test']])

evaluation_stats



Unnamed: 0,lin_reg,dt_reg
train_MAE,0.261763,6.118743000000001e-17
test_MAE,0.268943,0.2982346
train_RMSE,0.361775,8.382287000000001e-17
test_RMSE,0.371731,0.5461086
cv_rmse,0.362387,0.5297596
R2_train,0.00892,1.0
R2_test,0.01006,-1.136529


#### RandomForest Model


In [8]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
rf_reg = RandomForestRegression(feats_train, feats_test, yc_train, yc_test, hyperparameters)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(rf_reg = [rf_reg['train_MAE'], rf_reg['test_MAE'], rf_reg['train_RMSE'], rf_reg['test_RMSE'], rf_reg['cv_rmse'], rf_reg['R2_train'], rf_reg['R2_test']])

evaluation_stats

Unnamed: 0,lin_reg,dt_reg,rf_reg
train_MAE,0.261763,6.118743000000001e-17,0.240647
test_MAE,0.268943,0.2982346,0.266954
train_RMSE,0.361775,8.382287000000001e-17,0.336668
test_RMSE,0.371731,0.5461086,0.374074
cv_rmse,0.362387,0.5297596,0.364464
R2_train,0.00892,1.0,0.141709
R2_test,0.01006,-1.136529,-0.002455


#### XgBoost Model


In [9]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
xgb_reg = XGBoostRegression(feats_train, feats_test, yc_train, yc_test, hyperparameters)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(xgb_reg = [xgb_reg['train_MAE'], xgb_reg['test_MAE'], xgb_reg['train_RMSE'], xgb_reg['test_RMSE'], xgb_reg['cv_rmse'], xgb_reg['R2_train'], xgb_reg['R2_test']])
evaluation_stats

Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg
train_MAE,0.261763,6.118743000000001e-17,0.240647,0.252313
test_MAE,0.268943,0.2982346,0.266954,0.269216
train_RMSE,0.361775,8.382287000000001e-17,0.336668,0.34941
test_RMSE,0.371731,0.5461086,0.374074,0.372123
cv_rmse,0.362387,0.5297596,0.364464,0.36349
R2_train,0.00892,1.0,0.141709,0.075511
R2_test,0.01006,-1.136529,-0.002455,0.007975


# Feature engineering


### VarianceThreshold


In [10]:
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd

# Feature preprocessing pipeline
# Split into numerical and categorical columns
bool_cols = feats_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = feats_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
col_trans = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, bool_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Scale and remove low-variance features
preprocessor = Pipeline(steps=[
    ('col_trans', col_trans),
    ('var_threshold', VarianceThreshold(threshold=0.1))
])




#### Model(VT)


In [None]:
def scores(model, feats_train, feats_test, yc_train, yc_test):
    # fit the transofrmer on the train data
    model.fit(feats_train, yc_train)

    support = model.regressor_.named_steps['preprocessor'].named_steps['var_threshold'].get_support()
    # print(f"Selected features: {feats_train.columns[support]}")
    print(len(support), len(feats_train.columns))

    # With the model fitted, we can predict the total_points given the feature_train and feature_test set
    pred_train = model.predict(feats_train)
    pred_test = model.predict(feats_test)

    # Evaluate the performance of the model on both sets using the mean absolute error
    train_MAE = mean_absolute_error(yc_train, pred_train)
    test_MAE = mean_absolute_error(yc_test, pred_test)

    # Evaluate the performance of the model on both sets using the mean square error
    train_MSE = mean_squared_error(yc_train, pred_train)
    test_MSE = mean_squared_error(yc_test, pred_test)

    # Evaluate the performance of the model on both sets using the root mean square error
    train_RMSE = mean_squared_error(yc_train, pred_train, squared=False)
    test_RMSE = mean_squared_error(yc_test, pred_test, squared=False)

    # Get the score of the model or the coeeficient of determination i.e how much of the target value can be explained by the model.
    # In this case, 0.6 implies that 60% of the variations in the target value can be explained by the model and 40% is not explainable
    R2_train = model.score(feats_train, yc_train)
    R2_test = model.score(feats_test, yc_test)

    # If the test error significantly differs from the train error, then there is either overfitting or underfitting
    # RMSE, just like the squared loss function that it derives from, effectively penalizes larger errors more severely.
    print('Training set RMSE: {}'.format(train_RMSE))
    print('Test set RMSE: {}'.format(test_RMSE))

    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))

    # Carry out cross validation of the model.
    # The evaluation method is the root mean square error
    # The method expects a utility function (greater is better) and so the scoring function is the opposite of the the RMSE. Hence the -ve
    tree_rmses = -cross_val_score(model, feats_train, yc_train,
                                    scoring="neg_root_mean_squared_error", cv=10)

    return {'train_MAE': train_MAE, 'test_MAE': test_MAE, 'train_MSE': train_MSE, 'test_MSE': test_MSE, 'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


##### Linear Model


In [None]:

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# TransformedTargetRegressor for target scaling
model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

# Model predictions
VarT_lin_reg = scores(model, feats_train, feats_test, yc_train, yc_test)
# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_lin_reg =  [VarT_lin_reg['train_MAE'], VarT_lin_reg['test_MAE'],  VarT_lin_reg['train_RMSE'], VarT_lin_reg['test_RMSE'], VarT_lin_reg['cv_rmse'], VarT_lin_reg['R2_train'], VarT_lin_reg['R2_test']])

evaluation_stats

20 19
Training set RMSE: 0.36177540628641097
Test set RMSE: 0.37173113327601964
Training set R2: 0.008920007585137446
Test set R2: 0.010060463583897783


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg
train_MAE,0.261763,6.118743000000001e-17,0.240647,0.252313,0.261763
test_MAE,0.268943,0.2982346,0.266954,0.269216,0.268943
train_RMSE,0.361775,8.382287000000001e-17,0.336668,0.34941,0.361775
test_RMSE,0.371731,0.5461086,0.374074,0.372123,0.371731
cv_rmse,0.362387,0.5297596,0.364464,0.36349,0.362387
R2_train,0.00892,1.0,0.141709,0.075511,0.00892
R2_test,0.01006,-1.136529,-0.002455,0.007975,0.01006


##### DecisionTree Model


In [None]:
# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])

# TransformedTargetRegressor for target scaling
dt_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)




VarT_dt_reg = scores(dt_model, feats_train, feats_test, yc_train, yc_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_dt_reg = [VarT_dt_reg['train_MAE'], VarT_dt_reg['test_MAE'], VarT_dt_reg['train_RMSE'], VarT_dt_reg['test_RMSE'], VarT_dt_reg['cv_rmse'], VarT_dt_reg['R2_train'], VarT_dt_reg['R2_test']])

evaluation_stats



20 19
Training set RMSE: 8.343796253561663e-17
Test set RMSE: 0.5449527670220545
Training set R2: 1.0
Test set R2: -1.1274948735475059


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg
train_MAE,0.261763,6.118743000000001e-17,0.240647,0.252313,0.261763,6.291176e-17
test_MAE,0.268943,0.2982346,0.266954,0.269216,0.268943,0.2969735
train_RMSE,0.361775,8.382287000000001e-17,0.336668,0.34941,0.361775,8.343796e-17
test_RMSE,0.371731,0.5461086,0.374074,0.372123,0.371731,0.5449528
cv_rmse,0.362387,0.5297596,0.364464,0.36349,0.362387,0.5296273
R2_train,0.00892,1.0,0.141709,0.075511,0.00892,1.0
R2_test,0.01006,-1.136529,-0.002455,0.007975,0.01006,-1.127495


##### RandomForest Model


In [None]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
                        n_estimators=hyperparameters['n_estimators'],
                        max_depth=hyperparameters['max_depth'],
                        criterion=hyperparameters['criterion'], random_state=18
                        ))
])

# TransformedTargetRegressor for target scaling
rf_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)
# Store the model evaluation details in a DataFrame
VarT_rf_reg = scores(rf_model, feats_train, feats_test, yc_train, yc_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_rf_reg = [VarT_rf_reg['train_MAE'], VarT_rf_reg['test_MAE'], VarT_rf_reg['train_RMSE'], VarT_rf_reg['test_RMSE'], VarT_rf_reg['cv_rmse'], VarT_rf_reg['R2_train'], VarT_rf_reg['R2_test']])
evaluation_stats

20 19
Training set RMSE: 0.33662473171136953
Test set RMSE: 0.37347457286494884
Training set R2: 0.14193009638860232
Test set R2: 0.0007529458307865555


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg,VarT_rf_reg
train_MAE,0.261763,6.118743000000001e-17,0.240647,0.252313,0.261763,6.291176e-17,0.240604
test_MAE,0.268943,0.2982346,0.266954,0.269216,0.268943,0.2969735,0.266348
train_RMSE,0.361775,8.382287000000001e-17,0.336668,0.34941,0.361775,8.343796e-17,0.336625
test_RMSE,0.371731,0.5461086,0.374074,0.372123,0.371731,0.5449528,0.373475
cv_rmse,0.362387,0.5297596,0.364464,0.36349,0.362387,0.5296273,0.364487
R2_train,0.00892,1.0,0.141709,0.075511,0.00892,1.0,0.14193
R2_test,0.01006,-1.136529,-0.002455,0.007975,0.01006,-1.127495,0.000753


##### XgBoost Model


In [None]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle'))
])

# TransformedTargetRegressor for target scaling
rf_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

VarT_xgb_reg = scores(rf_model, feats_train, feats_test, yc_train, yc_test)



# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_xgb_reg = [VarT_xgb_reg['train_MAE'], VarT_xgb_reg['test_MAE'], VarT_xgb_reg['train_RMSE'], VarT_xgb_reg['test_RMSE'], VarT_xgb_reg['cv_rmse'], VarT_xgb_reg['R2_train'], VarT_xgb_reg['R2_test']])
evaluation_stats

20 19
Training set RMSE: 0.3493987922037957
Test set RMSE: 0.3723094699972207
Training set R2: 0.0755712839171584
Test set R2: 0.006977785301359285


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg,VarT_rf_reg,VarT_xgb_reg
train_MAE,0.261763,6.118743000000001e-17,0.240647,0.252313,0.261763,6.291176e-17,0.240604,0.252298
test_MAE,0.268943,0.2982346,0.266954,0.269216,0.268943,0.2969735,0.266348,0.269218
train_RMSE,0.361775,8.382287000000001e-17,0.336668,0.34941,0.361775,8.343796e-17,0.336625,0.349399
test_RMSE,0.371731,0.5461086,0.374074,0.372123,0.371731,0.5449528,0.373475,0.372309
cv_rmse,0.362387,0.5297596,0.364464,0.36349,0.362387,0.5296273,0.364487,0.363474
R2_train,0.00892,1.0,0.141709,0.075511,0.00892,1.0,0.14193,0.075571
R2_test,0.01006,-1.136529,-0.002455,0.007975,0.01006,-1.127495,0.000753,0.006978
