In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

In [2]:
%run ./ml_model_init.ipynb

## Baseline Model


In [4]:

xa_data= data[(data['minutes_5'] >= 300) & (data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'was_home', 'expected_assists', 'xA_3', 'xA_5',  'expected_assists_3', 'expected_assists_5', 'whh', 'whd', 'wha']]

xa_data_tar = data_tar[(data_tar['minutes_5'] >= 300) & (data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5','was_home', 'expected_assists', 'xA_3', 'xA_5',  'expected_assists_3',
                    'expected_assists_5', 'whh', 'whd', 'wha']]



In [7]:

xa = xa_data[['expected_assists']]
feats = xa_data.drop('expected_assists', axis=1)
feats_train, feats_test, xa_train, xa_test = train_test_split(feats, xa , test_size=0.2, random_state=42)

#### Linear Model


In [8]:

lin_reg = Linear_regression(feats_train, feats_test, xa_train, xa_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = pd.DataFrame({"lin_reg": [lin_reg['train_MAE'], lin_reg['test_MAE'], lin_reg['train_RMSE'], lin_reg['test_RMSE'], lin_reg['cv_rmse'], lin_reg['R2_train'], lin_reg['R2_test']]},
                                                        index=(['train_MAE','test_MAE', 'train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

evaluation_stats

Training set RMSE: 0.1266742071679424
Test set RMSE: 0.14692623714889996
Training set R2: 0.1577771611469324
Test set R2: 0.16303631470072555


Unnamed: 0,lin_reg
train_MAE,0.079274
test_MAE,0.086561
train_RMSE,0.126674
test_RMSE,0.146926
cv_rmse,0.126988
R2_train,0.157777
R2_test,0.163036


#### DecisionTree Model


In [9]:
dt_reg = DecisionTreeRegression(feats_train, feats_test, xa_train, xa_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(dt_reg = [dt_reg['train_MAE'], dt_reg['test_MAE'], dt_reg['train_RMSE'], dt_reg['test_RMSE'], dt_reg['cv_rmse'], dt_reg['R2_train'], dt_reg['R2_test']])

evaluation_stats



Unnamed: 0,lin_reg,dt_reg
train_MAE,0.079274,3.2919260000000002e-18
test_MAE,0.086561,0.1154098
train_RMSE,0.126674,9.153437e-18
test_RMSE,0.146926,0.2009834
cv_rmse,0.126988,0.186121
R2_train,0.157777,1.0
R2_test,0.163036,-0.5661316


#### RandomForest Model


In [10]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
rf_reg = RandomForestRegression(feats_train, feats_test, xa_train, xa_test, hyperparameters)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(rf_reg = [rf_reg['train_MAE'], rf_reg['test_MAE'], rf_reg['train_RMSE'], rf_reg['test_RMSE'], rf_reg['cv_rmse'], rf_reg['R2_train'], rf_reg['R2_test']])

evaluation_stats

Unnamed: 0,lin_reg,dt_reg,rf_reg
train_MAE,0.079274,3.2919260000000002e-18,0.069212
test_MAE,0.086561,0.1154098,0.087604
train_RMSE,0.126674,9.153437e-18,0.105411
test_RMSE,0.146926,0.2009834,0.148274
cv_rmse,0.126988,0.186121,0.129732
R2_train,0.157777,1.0,0.416792
R2_test,0.163036,-0.5661316,0.147613


#### XgBoost Model


In [11]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
xgb_reg = XGBoostRegression(feats_train, feats_test, xa_train, xa_test, hyperparameters)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(xgb_reg = [xgb_reg['train_MAE'], xgb_reg['test_MAE'], xgb_reg['train_RMSE'], xgb_reg['test_RMSE'], xgb_reg['cv_rmse'], xgb_reg['R2_train'], xgb_reg['R2_test']])
evaluation_stats

Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg
train_MAE,0.079274,3.2919260000000002e-18,0.069212,0.075582
test_MAE,0.086561,0.1154098,0.087604,0.086849
train_RMSE,0.126674,9.153437e-18,0.105411,0.11862
test_RMSE,0.146926,0.2009834,0.148274,0.147311
cv_rmse,0.126988,0.186121,0.129732,0.127439
R2_train,0.157777,1.0,0.416792,0.26147
R2_test,0.163036,-0.5661316,0.147613,0.158644


# Feature engineering


### VarianceThreshold


In [12]:
# Split into numerical and categorical columns
bool_cols = feats_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = feats_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, bool_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Scale and remove low-variance features
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # ('var_threshold', VarianceThreshold(threshold=0.1))
])

# Apply the pipeline to the training data
pipeline.fit(feats_train)
preprocessed_train = pipeline.transform(feats_train)
preprocessed_test = pipeline.transform(feats_test)


# # VT_scaler = StandardScaler()
# train_scaled = pd.DataFrame(VT_scaler.fit_transform(feats_train), columns=feats_train.columns)
# test_scaled = pd.DataFrame(VT_scaler.transform(feats_test), columns=feats_test.columns)

# # selector = VarianceThreshold(threshold = 0.1)
# # selector.fit_transform(df_for_train_scaled)

# # for_threshold_columns = df_for_train_scaled.columns[selector.get_support()]

# # for_threshold_train = df_for_train_scaled[for_threshold_columns]
# # for_threshold_test = df_for_test_scaled[for_threshold_columns]

# # for_threshold_train.shape, for_threshold_test.shape


columns=feats_train.columns
train_scaled = pd.DataFrame(preprocessed_train)
test_scaled = pd.DataFrame(preprocessed_test)

# # selector = VarianceThreshold(threshold = 0.1)
# # selector.fit_transform(train_scaled)

# # for_threshold_columns = train_scaled.columns[selector.get_support()]

# # for_threshold_train = train_scaled[for_threshold_columns]
# # for_threshold_test = test_scaled[for_threshold_columns]

# # for_threshold_train.shape, for_threshold_test.shape
# preprocessed_train
# # print(preprocessed_train.shape, preprocessed_test.shape)
train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.801413,1.187325,0.307868,0.945016,-0.352707,-0.547552,0.965375,0.953965,-0.752291,-0.963706,...,-0.645971,-0.819601,-0.788172,-0.849969,0.361531,0.158689,-0.405807,1.0,0.0,0.0
1,0.801413,-0.815882,0.307868,-0.968246,-0.926488,-1.233359,-0.796929,-1.188621,-0.588584,-0.820251,...,0.520671,0.006540,-0.280020,-0.569443,-0.582818,0.574902,0.407225,0.0,0.0,1.0
2,0.801413,1.187325,-0.763757,-0.011615,-0.489916,-0.539085,0.581371,0.534544,-1.025135,-0.969153,...,-0.788825,-0.887041,-0.822048,-0.873347,-0.284602,0.783008,0.082013,1.0,0.0,0.0
3,0.801413,1.187325,-0.763757,1.901647,-0.203026,-0.496751,0.848802,0.504224,-0.700450,-0.814804,...,-0.693589,-0.853321,-0.686541,-0.662952,2.051418,-2.546696,-1.489849,1.0,0.0,0.0
4,-0.496284,-0.444153,-0.763757,-0.968246,0.283440,-0.395150,-0.220923,-0.946064,-0.231159,-0.611426,...,-0.503117,-0.718441,-0.618788,-0.803215,-0.483413,1.407328,0.136215,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6337,-1.690166,-0.795231,0.307868,-0.968246,-0.627124,-0.073413,-1.174076,-0.445791,-0.438520,-0.660455,...,-0.312644,-0.583561,-0.313897,-0.499311,-0.731925,0.574902,0.732438,1.0,0.0,0.0
6338,-1.793982,-1.641947,-0.763757,1.901647,-0.564757,-0.462884,-1.249505,-1.466551,0.251775,0.142162,...,-0.526926,-0.465541,-0.652664,-0.662952,2.200526,-2.962909,-1.652456,0.0,0.0,1.0
6339,0.801413,1.187325,-0.763757,-0.011615,0.083865,-0.251215,0.725373,0.519384,0.478236,0.069527,...,-0.193599,-0.364380,-0.313897,-0.522688,-0.085792,0.991115,-0.243200,0.0,0.0,1.0
6340,0.801413,1.187325,-0.763757,1.901647,-0.527336,-0.242748,-0.982074,-0.728774,0.418210,0.852169,...,0.687334,0.815821,0.533022,0.879945,2.051418,-2.754803,-1.598253,0.0,0.0,1.0


In [13]:
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd

# Feature preprocessing pipeline
# Split into numerical and categorical columns
bool_cols = feats_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = feats_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
col_trans = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, bool_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Scale and remove low-variance features
preprocessor = Pipeline(steps=[
    ('col_trans', col_trans),
    ('var_threshold', VarianceThreshold(threshold=0.1))
])

# Apply the pipeline to the training data
# pipeline.fit(feats_train)
# preprocessed_train = pipeline.transform(feats_train)
# preprocessed_test = pipeline.transform(feats_test)


# preprocessor = Pipeline(steps=[
#     ('scaler', StandardScaler()),
#     ('var_threshold', VarianceThreshold(threshold=0.1))
# ])




#### Model(VT)


In [None]:
def scores(model, feats_train, feats_test, xa_train, xa_test):
    # fit the transofrmer on the train data
    model.fit(feats_train, xa_train)

    support = model.regressor_.named_steps['preprocessor'].named_steps['var_threshold'].get_support()
    # print(f"Selected features: {feats_train.columns[support]}")
    print(len(support), len(feats_train.columns))

    # With the model fitted, we can predict the total_points given the feature_train and feature_test set
    pred_train = model.predict(feats_train)
    pred_test = model.predict(feats_test)

    # Evaluate the performance of the model on both sets using the mean absolute error
    train_MAE = mean_absolute_error(xa_train, pred_train)
    test_MAE = mean_absolute_error(xa_test, pred_test)

    # Evaluate the performance of the model on both sets using the mean square error
    train_MSE = mean_squared_error(xa_train, pred_train)
    test_MSE = mean_squared_error(xa_test, pred_test)

    # Evaluate the performance of the model on both sets using the root mean square error
    train_RMSE = mean_squared_error(xa_train, pred_train, squared=False)
    test_RMSE = mean_squared_error(xa_test, pred_test, squared=False)

    # Get the score of the model or the coeeficient of determination i.e how much of the target value can be explained by the model.
    # In this case, 0.6 implies that 60% of the variations in the target value can be explained by the model and 40% is not explainable
    R2_train = model.score(feats_train, xa_train)
    R2_test = model.score(feats_test, xa_test)

    # If the test error significantly differs from the train error, then there is either overfitting or underfitting
    # RMSE, just like the squared loss function that it derives from, effectively penalizes larger errors more severely.
    print('Training set RMSE: {}'.format(train_RMSE))
    print('Test set RMSE: {}'.format(test_RMSE))

    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))

    # Carry out cross validation of the model.
    # The evaluation method is the root mean square error
    # The method expects a utility function (greater is better) and so the scoring function is the opposite of the the RMSE. Hence the -ve
    tree_rmses = -cross_val_score(model, feats_train, xa_train,
                                    scoring="neg_root_mean_squared_error", cv=10)

    return {'train_MAE': train_MAE, 'test_MAE': test_MAE, 'train_MSE': train_MSE, 'test_MSE': test_MSE, 'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


##### Linear Model


In [None]:

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# TransformedTargetRegressor for target scaling
model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

# Model predictions
VarT_lin_reg = scores(model, feats_train, feats_test, xa_train, xa_test)
# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_lin_reg =  [VarT_lin_reg['train_MAE'], VarT_lin_reg['test_MAE'],  VarT_lin_reg['train_RMSE'], VarT_lin_reg['test_RMSE'], VarT_lin_reg['cv_rmse'], VarT_lin_reg['R2_train'], VarT_lin_reg['R2_test']])

evaluation_stats

22 21
Training set RMSE: 0.1266742071679424
Test set RMSE: 0.14692623714889996
Training set R2: 0.1577771611469324
Test set R2: 0.16303631470072555


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg
train_MAE,0.079274,3.2919260000000002e-18,0.069212,0.075582,0.079274
test_MAE,0.086561,0.1154098,0.087604,0.086849,0.086561
train_RMSE,0.126674,9.153437e-18,0.105411,0.11862,0.126674
test_RMSE,0.146926,0.2009834,0.148274,0.147311,0.146926
cv_rmse,0.126988,0.186121,0.129732,0.127439,0.126988
R2_train,0.157777,1.0,0.416792,0.26147,0.157777
R2_test,0.163036,-0.5661316,0.147613,0.158644,0.163036


##### DecisionTree Model


In [None]:
# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])

# TransformedTargetRegressor for target scaling
dt_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)




VarT_dt_reg = scores(dt_model, feats_train, feats_test, xa_train, xa_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_dt_reg = [VarT_dt_reg['train_MAE'], VarT_dt_reg['test_MAE'], VarT_dt_reg['train_RMSE'], VarT_dt_reg['test_RMSE'], VarT_dt_reg['cv_rmse'], VarT_dt_reg['R2_train'], VarT_dt_reg['R2_test']])

evaluation_stats



22 21
Training set RMSE: 9.098948518184233e-18
Test set RMSE: 0.1953533352645692
Training set R2: 1.0
Test set R2: -0.479617853905266


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg
train_MAE,0.079274,3.2919260000000002e-18,0.069212,0.075582,0.079274,3.247068e-18
test_MAE,0.086561,0.1154098,0.087604,0.086849,0.086561,0.1131021
train_RMSE,0.126674,9.153437e-18,0.105411,0.11862,0.126674,9.098949000000001e-18
test_RMSE,0.146926,0.2009834,0.148274,0.147311,0.146926,0.1953533
cv_rmse,0.126988,0.186121,0.129732,0.127439,0.126988,0.1859547
R2_train,0.157777,1.0,0.416792,0.26147,0.157777,1.0
R2_test,0.163036,-0.5661316,0.147613,0.158644,0.163036,-0.4796179


##### RandomForest Model


In [None]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
                        n_estimators=hyperparameters['n_estimators'],
                        max_depth=hyperparameters['max_depth'],
                        criterion=hyperparameters['criterion'], random_state=18
                        ))
])

# TransformedTargetRegressor for target scaling
rf_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)
# Store the model evaluation details in a DataFrame
VarT_rf_reg = scores(rf_model, feats_train, feats_test, xa_train, xa_test)

# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_rf_reg = [VarT_rf_reg['train_MAE'], VarT_rf_reg['test_MAE'], VarT_rf_reg['train_RMSE'], VarT_rf_reg['test_RMSE'], VarT_rf_reg['cv_rmse'], VarT_rf_reg['R2_train'], VarT_rf_reg['R2_test']])
evaluation_stats

22 21
Training set RMSE: 0.10536218175831404
Test set RMSE: 0.14827729367069528
Training set R2: 0.4173333854588386
Test set R2: 0.14757298669030539


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg,VarT_rf_reg
train_MAE,0.079274,3.2919260000000002e-18,0.069212,0.075582,0.079274,3.247068e-18,0.069181
test_MAE,0.086561,0.1154098,0.087604,0.086849,0.086561,0.1131021,0.087674
train_RMSE,0.126674,9.153437e-18,0.105411,0.11862,0.126674,9.098949000000001e-18,0.105362
test_RMSE,0.146926,0.2009834,0.148274,0.147311,0.146926,0.1953533,0.148277
cv_rmse,0.126988,0.186121,0.129732,0.127439,0.126988,0.1859547,0.129628
R2_train,0.157777,1.0,0.416792,0.26147,0.157777,1.0,0.417333
R2_test,0.163036,-0.5661316,0.147613,0.158644,0.163036,-0.4796179,0.147573


##### XgBoost Model


In [None]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}

# Full pipeline including the regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle'))
])

# TransformedTargetRegressor for target scaling
rf_model = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler()
)

VarT_xgb_reg = scores(rf_model, feats_train, feats_test, xa_train, xa_test)



# Store the model evaluation details in a DataFrame
evaluation_stats = evaluation_stats.assign(VarT_xgb_reg = [VarT_xgb_reg['train_MAE'], VarT_xgb_reg['test_MAE'], VarT_xgb_reg['train_RMSE'], VarT_xgb_reg['test_RMSE'], VarT_xgb_reg['cv_rmse'], VarT_xgb_reg['R2_train'], VarT_xgb_reg['R2_test']])
evaluation_stats

22 21
Training set RMSE: 0.11869160138627577
Test set RMSE: 0.1473067354844204
Training set R2: 0.26058100446451715
Test set R2: 0.1586956923411792


Unnamed: 0,lin_reg,dt_reg,rf_reg,xgb_reg,VarT_lin_reg,VarT_dt_reg,VarT_rf_reg,VarT_xgb_reg
train_MAE,0.079274,3.2919260000000002e-18,0.069212,0.075582,0.079274,3.247068e-18,0.069181,0.07556
test_MAE,0.086561,0.1154098,0.087604,0.086849,0.086561,0.1131021,0.087674,0.086822
train_RMSE,0.126674,9.153437e-18,0.105411,0.11862,0.126674,9.098949000000001e-18,0.105362,0.118692
test_RMSE,0.146926,0.2009834,0.148274,0.147311,0.146926,0.1953533,0.148277,0.147307
cv_rmse,0.126988,0.186121,0.129732,0.127439,0.126988,0.1859547,0.129628,0.127438
R2_train,0.157777,1.0,0.416792,0.26147,0.157777,1.0,0.417333,0.260581
R2_test,0.163036,-0.5661316,0.147613,0.158644,0.163036,-0.4796179,0.147573,0.158696
