In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

In [2]:
%run ./ml_model_init.ipynb

## Baseline Model


In [3]:
#
sv_data= data[(data['minutes_5'] >= 300) & (data['position'] == 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5', 'saves',
                                                                          'saves_3', 'saves_5', 'whh', 'whd', 'wha']]

sv_data_tar = data_tar[(data_tar['minutes_5'] >= 300) & (data_tar['position'] == 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5', 'saves',
                                                                          'saves_3', 'saves_5', 'whh', 'whd', 'wha']]

# players = data_225 #.drop('position', axis=1) #  & (data['position'] == 'FWD')
# # players


In [4]:

sv = sv_data[['saves']]
feats = sv_data.drop('saves', axis=1)
feats_train, feats_test, sv_train, sv_test = train_test_split(feats, sv , test_size=0.2, random_state=42)

In [5]:
# Split the gk data into train and test sets
# for_splits = split_data(fwd_player_data)

#### Linear Model


In [6]:

lin_reg = Linear_regression(feats_train, feats_test, sv_train, sv_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = pd.DataFrame({"lin_reg": [lin_reg['train_MAE'], lin_reg['test_MAE'], lin_reg['train_RMSE'], lin_reg['test_RMSE'], lin_reg['cv_rmse'], lin_reg['R2_train'], lin_reg['R2_test']]},
                                                        index=(['train_MAE','test_MAE', 'train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

evaluation_stats

Training set RMSE: 1.9133430530505469
Test set RMSE: 2.0881089310854497
Training set R2: 0.05128844815360334
Test set R2: -0.03761861351636875


Unnamed: 0,lin_reg
train_MAE,1.536606
test_MAE,1.589365
train_RMSE,1.913343
test_RMSE,2.088109
cv_rmse,1.951444
R2_train,0.051288
R2_test,-0.037619


#### DecisionTree Model


In [7]:
dt_reg = DecisionTreeRegression(feats_train, feats_test, sv_train, sv_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(dt_reg = [dt_reg['train_MAE'], dt_reg['test_MAE'], dt_reg['train_RMSE'], dt_reg['test_RMSE'], dt_reg['cv_rmse'], dt_reg['R2_train'], dt_reg['R2_test']])

evaluation_stats



Unnamed: 0,lin_reg,dt_reg
train_MAE,1.536606,1.5659e-18
test_MAE,1.589365,2.123596
train_RMSE,1.913343,1.8646710000000002e-17
test_RMSE,2.088109,2.792446
cv_rmse,1.951444,2.851261
R2_train,0.051288,1.0
R2_test,-0.037619,-0.8556707


#### RandomForest Model


In [8]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
rf_reg = RandomForestRegression(feats_train, feats_test, sv_train, sv_test, hyperparameters)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(rf_reg = [rf_reg['train_MAE'], rf_reg['test_MAE'], rf_reg['train_RMSE'], rf_reg['test_RMSE'], rf_reg['cv_rmse'], rf_reg['R2_train'], rf_reg['R2_test']])

evaluation_stats

Unnamed: 0,lin_reg,dt_reg,rf_reg
train_MAE,1.536606,1.5659e-18,1.181917
test_MAE,1.589365,2.123596,1.586407
train_RMSE,1.913343,1.8646710000000002e-17,1.460013
test_RMSE,2.088109,2.792446,2.089285
cv_rmse,1.951444,2.851261,1.954339
R2_train,0.051288,1.0,0.44759
R2_test,-0.037619,-0.8556707,-0.038787


#### XgBoost Model


In [9]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
xgb_reg = XGBoostRegression(feats_train, feats_test, sv_train, sv_test, hyperparameters)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(xgb_reg = [xgb_reg['train_MAE'], xgb_reg['test_MAE'], xgb_reg['train_RMSE'], xgb_reg['test_RMSE'], xgb_reg['cv_rmse'], xgb_reg['R2_train'], xgb_reg['R2_test']])
evaluation_stats

Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg
train_MAE,1.536606,1.5659e-18,1.181917,1.297228
test_MAE,1.589365,2.123596,1.586407,1.559228
train_RMSE,1.913343,1.8646710000000002e-17,1.460013,1.625432
test_RMSE,2.088109,2.792446,2.089285,2.063509
cv_rmse,1.951444,2.851261,1.954339,1.967945
R2_train,0.051288,1.0,0.44759,0.315322
R2_test,-0.037619,-0.8556707,-0.038787,-0.013315


# Feature engineering


### VarianceThreshold


In [10]:
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd

# Feature preprocessing pipeline
# Split into numerical and categorical columns
bool_cols = feats_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = feats_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
col_trans = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, bool_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Scale and remove low-variance features
preprocessor = Pipeline(steps=[
    ('col_trans', col_trans),
    ('var_threshold', VarianceThreshold(threshold=0.1))
])




#### Model(VT)


In [None]:
def scores(model, feats_train, feats_test, sv_train, sv_test):
    # fit the transofrmer on the train data
    model.fit(feats_train, sv_train)

    support = model.regressor_.named_steps['preprocessor'].named_steps['var_threshold'].get_support()
    # print(f"Selected features: {feats_train.columns[support]}")
    print(len(support), len(feats_train.columns))

    # With the model fitted, we can predict the total_points given the feature_train and feature_test set
    pred_train = model.predict(feats_train)
    pred_test = model.predict(feats_test)

    # Evaluate the performance of the model on both sets using the mean absolute error
    train_MAE = mean_absolute_error(sv_train, pred_train)
    test_MAE = mean_absolute_error(sv_test, pred_test)

    # Evaluate the performance of the model on both sets using the mean square error
    train_MSE = mean_squared_error(sv_train, pred_train)
    test_MSE = mean_squared_error(sv_test, pred_test)

    # Evaluate the performance of the model on both sets using the root mean square error
    train_RMSE = mean_squared_error(sv_train, pred_train, squared=False)
    test_RMSE = mean_squared_error(sv_test, pred_test, squared=False)

    # Get the score of the model or the coeeficient of determination i.e how much of the target value can be explained by the model.
    # In this case, 0.6 implies that 60% of the variations in the target value can be explained by the model and 40% is not explainable
    R2_train = model.score(feats_train, sv_train)
    R2_test = model.score(feats_test, sv_test)

    # If the test error significantly differs from the train error, then there is either overfitting or underfitting
    # RMSE, just like the squared loss function that it derives from, effectively penalizes larger errors more severely.
    print('Training set RMSE: {}'.format(train_RMSE))
    print('Test set RMSE: {}'.format(test_RMSE))

    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))

    # Carry out cross validation of the model.
    # The evaluation method is the root mean square error
    # The method expects a utility function (greater is better) and so the scoring function is the opposite of the the RMSE. Hence the -ve
    tree_rmses = -cross_val_score(model, feats_train, sv_train,
                                    scoring="neg_root_mean_squared_error", cv=10)

    return {'train_MAE': train_MAE, 'test_MAE': test_MAE, 'train_MSE': train_MSE, 'test_MSE': test_MSE, 'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


##### Linear Model


In [None]:

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# TransformedTargetRegressor for target scaling
model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

# Model predictions
VarT_lin_reg = scores(model, feats_train, feats_test, sv_train, sv_test)
# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_lin_reg =  [VarT_lin_reg['train_MAE'], VarT_lin_reg['test_MAE'],  VarT_lin_reg['train_RMSE'], VarT_lin_reg['test_RMSE'], VarT_lin_reg['cv_rmse'], VarT_lin_reg['R2_train'], VarT_lin_reg['R2_test']])

evaluation_stats

18 19
Training set RMSE: 1.9133430530505469
Test set RMSE: 2.0881089310854497
Training set R2: 0.05128844815360334
Test set R2: -0.037618613516368526


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg
train_MAE,1.536606,1.5659e-18,1.181917,1.297228,1.536606
test_MAE,1.589365,2.123596,1.586407,1.559228,1.589365
train_RMSE,1.913343,1.8646710000000002e-17,1.460013,1.625432,1.913343
test_RMSE,2.088109,2.792446,2.089285,2.063509,2.088109
cv_rmse,1.951444,2.851261,1.954339,1.967945,1.951444
R2_train,0.051288,1.0,0.44759,0.315322,0.051288
R2_test,-0.037619,-0.8556707,-0.038787,-0.013315,-0.037619


##### DecisionTree Model


In [None]:
# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])

# TransformedTargetRegressor for target scaling
dt_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)




VarT_dt_reg = scores(dt_model, feats_train, feats_test, sv_train, sv_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_dt_reg = [VarT_dt_reg['train_MAE'], VarT_dt_reg['test_MAE'], VarT_dt_reg['train_RMSE'], VarT_dt_reg['test_RMSE'], VarT_dt_reg['cv_rmse'], VarT_dt_reg['R2_train'], VarT_dt_reg['R2_test']])

evaluation_stats



18 19
Training set RMSE: 1.8646705426058906e-17
Test set RMSE: 2.809494336525043
Training set R2: 1.0
Test set R2: -0.8783986780832209


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg
train_MAE,1.536606,1.5659e-18,1.181917,1.297228,1.536606,1.5659e-18
test_MAE,1.589365,2.123596,1.586407,1.559228,1.589365,2.117978
train_RMSE,1.913343,1.8646710000000002e-17,1.460013,1.625432,1.913343,1.8646710000000002e-17
test_RMSE,2.088109,2.792446,2.089285,2.063509,2.088109,2.809494
cv_rmse,1.951444,2.851261,1.954339,1.967945,1.951444,2.803004
R2_train,0.051288,1.0,0.44759,0.315322,0.051288,1.0
R2_test,-0.037619,-0.8556707,-0.038787,-0.013315,-0.037619,-0.8783987


##### RandomForest Model


In [None]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
                        n_estimators=hyperparameters['n_estimators'],
                        max_depth=hyperparameters['max_depth'],
                        criterion=hyperparameters['criterion'], random_state=18
                        ))
])

# TransformedTargetRegressor for target scaling
rf_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)
# Store the model evaluation details in a DataFrame
VarT_rf_reg = scores(rf_model, feats_train, feats_test, sv_train, sv_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_rf_reg = [VarT_rf_reg['train_MAE'], VarT_rf_reg['test_MAE'], VarT_rf_reg['train_RMSE'], VarT_rf_reg['test_RMSE'], VarT_rf_reg['cv_rmse'], VarT_rf_reg['R2_train'], VarT_rf_reg['R2_test']])
evaluation_stats

18 19
Training set RMSE: 1.463619161821428
Test set RMSE: 2.088110549152591
Training set R2: 0.4448572476016084
Test set R2: -0.03762022160990153


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg,VarT_rf_reg
train_MAE,1.536606,1.5659e-18,1.181917,1.297228,1.536606,1.5659e-18,1.184535
test_MAE,1.589365,2.123596,1.586407,1.559228,1.589365,2.117978,1.584379
train_RMSE,1.913343,1.8646710000000002e-17,1.460013,1.625432,1.913343,1.8646710000000002e-17,1.463619
test_RMSE,2.088109,2.792446,2.089285,2.063509,2.088109,2.809494,2.088111
cv_rmse,1.951444,2.851261,1.954339,1.967945,1.951444,2.803004,1.958046
R2_train,0.051288,1.0,0.44759,0.315322,0.051288,1.0,0.444857
R2_test,-0.037619,-0.8556707,-0.038787,-0.013315,-0.037619,-0.8783987,-0.03762


##### XgBoost Model


In [None]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle'))
])

# TransformedTargetRegressor for target scaling
rf_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

VarT_xgb_reg = scores(rf_model, feats_train, feats_test, sv_train, sv_test)



# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_xgb_reg = [VarT_xgb_reg['train_MAE'], VarT_xgb_reg['test_MAE'], VarT_xgb_reg['train_RMSE'], VarT_xgb_reg['test_RMSE'], VarT_xgb_reg['cv_rmse'], VarT_xgb_reg['R2_train'], VarT_xgb_reg['R2_test']])
evaluation_stats

18 19
Training set RMSE: 1.6254320961538076
Test set RMSE: 2.0635094379284618
Training set R2: 0.315322331170406
Test set R2: -0.01331476576612678


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg,VarT_rf_reg,VarT_xgb_reg
train_MAE,1.536606,1.5659e-18,1.181917,1.297228,1.536606,1.5659e-18,1.184535,1.297228
test_MAE,1.589365,2.123596,1.586407,1.559228,1.589365,2.117978,1.584379,1.559228
train_RMSE,1.913343,1.8646710000000002e-17,1.460013,1.625432,1.913343,1.8646710000000002e-17,1.463619,1.625432
test_RMSE,2.088109,2.792446,2.089285,2.063509,2.088109,2.809494,2.088111,2.063509
cv_rmse,1.951444,2.851261,1.954339,1.967945,1.951444,2.803004,1.958046,1.967945
R2_train,0.051288,1.0,0.44759,0.315322,0.051288,1.0,0.444857,0.315322
R2_test,-0.037619,-0.8556707,-0.038787,-0.013315,-0.037619,-0.8783987,-0.03762,-0.013315
