In [None]:
# PARAMETERS

# Prediction horizon: which future game are you trying to predict? shift_param=1 means the
# next game, 2 means the one after that, etc. It is used further down as the number of rows
# by which the lagged feature columns are shifted within each player's group.
shift_param = 1

In [None]:
# IMPORTS

import pandas as pd
import numpy as np
from pathlib import Path

import catboost 
import shap
import optuna

from sklearn import linear_model
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Raise both pandas display limits (columns and rows) to 200 in a single call.
pd.set_option('display.max_columns', 200, 'display.max_rows', 200)

# Functions

In [None]:
def calculate_performance_metrics(y_true, y_predicted):
    '''
    Compute regression error metrics for a set of predictions.

    Inputs:
        y_true (array-like): Observed target values.
        y_predicted (array-like): Predicted target values.

    Output:
        (mae, rmse, r2) (tuple): Mean absolute error, root mean squared error and R^2 score.
    '''
    mae = mean_absolute_error(y_true, y_predicted)
    # Take the square root explicitly instead of passing `squared=False`: that keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6. For a single target column the
    # result is the same value.
    rmse = np.sqrt(mean_squared_error(y_true, y_predicted))
    r2 = r2_score(y_true, y_predicted)
    return (mae, rmse, r2)


In [None]:
# DEFINE OPTUNA OBJECTIVE FOR HYPER-PARAMETER OPTIMIZATION
def optuna_objective(trial, model_name, optuna_params, X_train, y_train, folds, test_metric):
    '''
    Objective function for Optuna to optimize.

    Samples CatBoost hyper-parameters from the trial, cross-validates them with
    catboost.cv and returns the lowest mean test score reached during boosting.

    Inputs:
        trial: Optuna trial object.
        model_name (str): String to specify which model type to optimize. Current options: {catboost}.
        optuna_params (dict): Optuna-specific parameters. The keys read here are
            'cat_learning_rate_bounds' and 'cat_depth_bounds' ([low, high] pairs) and
            'cat_objective_list' (candidate CatBoost objectives).
        X_train (array-like): Training data inputs.
        y_train (array-like): Training data target values.
        folds (sklearn KFold): Cross-validation folds.
        test_metric (str): Chosen metric for evaluating model performance in the test folds.
            It is passed to CatBoost as eval_metric.

    Output:
        best_result (float): Lowest mean of the test-metric across cross-validation folds,
            taken over all boosting iterations.

    Raises:
        ValueError: If model_name is not a supported model type.
    '''

    # Fail loudly on an unknown model: returning None would only make every Optuna trial
    # fail and surface later as a confusing "no completed trials" error.
    if model_name != 'catboost':
        raise ValueError(f"Incorrect model name: {model_name!r}. Supported models: 'catboost'.")

    learning_rate_low, learning_rate_high = optuna_params['cat_learning_rate_bounds'][:2]
    depth_low, depth_high = optuna_params['cat_depth_bounds'][:2]

    params = {
        # Learning rate is sampled on a log scale between the two bounds.
        'learning_rate': trial.suggest_float('learning_rate', learning_rate_low,
                                             learning_rate_high, log=True),
        'depth': trial.suggest_int('depth', depth_low, depth_high),
        'objective': trial.suggest_categorical('objective', optuna_params['cat_objective_list']),
        'eval_metric': test_metric,
    }

    cv_data = catboost.Pool(
        data=X_train,
        label=y_train,
    )

    cv_df = catboost.cv(
        pool=cv_data,
        params=params,
        folds=folds,
        num_boost_round=1000,
        early_stopping_rounds=50,
        logging_level='Silent',
        shuffle=False,
    )

    # One row per boosting iteration: keep the best (lowest) mean test score.
    best_result = cv_df[f'test-{test_metric}-mean'].min()

    return best_result

In [None]:
def cat_optuna(X_train, y_train, X_test, y_test, cat_optuna_params, 
               best_params, metrics, folds, test_metric, n_optimization_trials=100,
               random_state=None):
    '''
    Performs hyperparameter optimization for the CatBoost regressor using Optuna.

    After the search, the best hyper-parameters are cross-validated once more to pick the
    number of boosting iterations, a final model is fitted on the whole training set,
    scored on the test set and explained with SHAP. The function also prints progress
    information and shows the Optuna optimization-history plot and a SHAP bar plot.

    Input:
        X_train (pandas DataFrame): The training input samples.
        y_train (array-like): The target values for the training set.
        X_test (pandas DataFrame): The test input samples; its index labels the predictions.
        y_test (array-like): The target values for the test set.
        cat_optuna_params (dict): Parameters for Optuna optimization (search space passed on
            to optuna_objective).
        best_params (dict): A dictionary to store the best hyperparameters found during
            optimization. Updated in place under the key "catboost".
        metrics (dict): A dictionary for saving model performance results. Updated in place
            under the key "test" with the tuple returned by calculate_performance_metrics.
        folds (sklearn KFold): Cross-validation folds passed to catboost.cv.
        test_metric (str): CatBoost eval metric used to score the cross-validation folds.
        n_optimization_trials (int): Number of optimization trials to be performed with Optuna.
        random_state (int or None): Seed for Optuna's TPE sampler, for a reproducible search.
            None (default) leaves Optuna's unseeded default sampler in place.

    Output:
        metrics (dict): A dictionary for saving model performance results.
        best_params (dict): A dictionary to store the best hyperparameters found during optimization.
        cat_model (CatBoostRegressor): Model fitted on the whole training set with the best parameters.
        shap_values: SHAP explanation computed on (a sample of) the training data.
        study (optuna Study): The finished Optuna study.
    '''

    # Only build a sampler when a seed is requested, so the default call path is unchanged.
    sampler = None if random_state is None else optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(sampler=sampler)
    study.optimize(
        lambda trial: optuna_objective(trial,
                                       model_name='catboost',
                                       optuna_params=cat_optuna_params,
                                       X_train=X_train,
                                       y_train=y_train,
                                       folds=folds,
                                       test_metric=test_metric),
        n_trials=n_optimization_trials,
    )

    tuned_params = study.best_params

    print('Best params:')
    print(tuned_params)
    # The study value is the mean score on the cross-validation test folds, not a training score.
    print(f'Cross-validated {test_metric} for best params:')
    print(study.best_value)

    # Visualize optimization run
    fig = optuna.visualization.plot_optimization_history(study)
    fig.show()

    # Redo cross-validation with best params to get the optimal n_estimators
    cv_data = catboost.Pool(
        data=X_train,
        label=y_train,
    )

    cat_params = {name: tuned_params[name] for name in ('learning_rate', 'depth', 'objective')}
    cat_params['eval_metric'] = test_metric

    cv_df = catboost.cv(
        pool=cv_data,
        params=cat_params,
        folds=folds,
        num_boost_round=1000,
        early_stopping_rounds=50,
        logging_level='Silent',
        shuffle=False,
    )
    # 'iterations' is zero-based, hence the +1. Cast to a plain int so that neither CatBoost
    # nor best_params receives a numpy integer.
    nr_iterations = int(cv_df.sort_values(f'test-{test_metric}-mean')['iterations'].iloc[0]) + 1

    # train best catboost model with whole training data
    cat_model = catboost.CatBoostRegressor(iterations=nr_iterations, verbose=False, **cat_params)
    cat_model.fit(X_train, y_train)

    # test predictions
    y_predicted = pd.Series(
        cat_model.predict(X_test),
        index=X_test.index
    )

    print('CATBOOST')
    metrics["test"] = calculate_performance_metrics(y_test, y_predicted)
    best_params["catboost"] = dict(tuned_params)
    best_params["catboost"]['nr_estimators'] = nr_iterations

    # Explain the model on at most 10000 training rows to keep SHAP affordable.
    explainer = shap.Explainer(cat_model)
    shap_sample = X_train.sample(10000, random_state=42) if len(X_train) > 10000 else X_train
    shap_values = explainer(shap_sample)
    shap.plots.bar(shap_values, max_display=10)

    print()

    return metrics, best_params, cat_model, shap_values, study

# Data processing

In [None]:
# Load the modelling dataset; the first CSV column becomes the row index.
filepath = Path('../../data/modelling/fpl_df.csv')
df = pd.read_csv(filepath, index_col=0)
display(df.head(), df.shape)

In [None]:
# Columns used as-is (they are not lagged).
features_no_shift = ['element_type']

# Building blocks for the generated column names below.
_ewm_spans = (5, 10, 20, 40)
_player_stats = [
    'gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points',
    'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
    'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA',
    'gameweek_minutes', 'gameweek_xPoints',
]

# Columns that get lagged before modelling. The order matters (it is the column order
# of X), so the pieces are concatenated in a fixed sequence.
features_shift = (
    # raw per-player columns
    [
        'corners_and_indirect_freekicks_order', 'creativity_rank',
        'direct_freekicks_order', 'ict_index_rank', 'influence_rank',
        'minutes', 'now_cost', 'penalties_order', 'points_per_game',
        'selected_by_percent', 'threat_rank',
    ]
    # team / opponent xG and xGA, exponentially weighted over each span
    + [
        f'{side}_{stat}_ewm_{span}'
        for side in ('team', 'opponent')
        for stat in ('xG', 'xGA')
        for span in _ewm_spans
    ]
    # player stats, exponentially weighted: all stats for span 5, then 10, 20, 40
    + [f'{stat}_ewm_{span}' for span in _ewm_spans for stat in _player_stats]
    # player stats, expanding window
    + [f'{stat}_expanding' for stat in _player_stats]
    # expanding per-90 versions (there is no per-90 column for minutes)
    + [f'{stat}_expanding_per90' for stat in _player_stats if stat != 'gameweek_minutes']
    + ['xG_overperformance']
)

target = ['event_points']

In [None]:
# Shift the given features by `shift_param` rows within each web_name group.
# (Presumably rows are in chronological order per player - TODO confirm.)
# NOTE: the columns are overwritten in place, so re-running this cell shifts them again.
df[features_shift] = df.groupby('web_name')[features_shift].shift(periods=shift_param)
display(df.head(), df.tail(), df.shape)

In [None]:
# Spot-check the rows of a single player.
df.loc[df['web_name'] == 'Kane']

In [None]:
# Share of missing values per column.
df.isna().sum().div(len(df))

In [None]:
# Share of rows that have more than 4 missing values.
df.isna().sum(axis=1).gt(4).sum() / len(df)

In [None]:
# Keep only rows with at most 4 missing values and renumber the index from 0.
df = df.loc[df.isna().sum(axis=1).le(4)].reset_index(drop=True)
display(df.shape)

In [None]:
# Feature matrix (unshifted columns first, then the shifted ones) and target frame,
# both taken as independent copies of the df columns.
X = df.loc[:, [*features_no_shift, *features_shift]].copy()
y = df.loc[:, target].copy()

display(X.shape, y.shape)

# Split data into train and test sets

In [None]:
# Share of all rows that belong to season 22-23
len(df.loc[df['season'] == '22-23']) / len(df)

Use season 22-23 for testing, rest for training.

In [None]:
# Row labels of the training rows (every season except 22-23) and of the test rows (22-23).
train_ix = df.loc[df['season'].ne('22-23')].index
test_ix = df.loc[df['season'].eq('22-23')].index
print(f'Train data size: {len(train_ix)}', f'Test data size: {len(test_ix)}', sep='\n')

In [None]:
# Show both index objects, train first.
display(train_ix, test_ix)

In [None]:
# Select the train / test rows of the features and the target by index label.
X_train, X_test = X.loc[train_ix], X.loc[test_ix]
y_train, y_test = y.loc[train_ix], y.loc[test_ix]

display(X_train, X_test, y_train, y_test)

# Baseline models

In [None]:
# Baseline: use the (shifted) points_per_game column itself as the prediction.
train_metrics, test_metrics = (
    calculate_performance_metrics(actual, inputs['points_per_game'])
    for actual, inputs in ((y_train, X_train), (y_test, X_test))
)

results = pd.DataFrame(
    [train_metrics, test_metrics],
    index=['train', 'test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the (shifted) gameweek_xPoints_ewm_5 column itself as the prediction.
train_metrics, test_metrics = (
    calculate_performance_metrics(actual, inputs['gameweek_xPoints_ewm_5'])
    for actual, inputs in ((y_train, X_train), (y_test, X_test))
)

results = pd.DataFrame(
    [train_metrics, test_metrics],
    index=['train', 'test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the (shifted) gameweek_xPoints_ewm_10 column itself as the prediction.
train_metrics, test_metrics = (
    calculate_performance_metrics(actual, inputs['gameweek_xPoints_ewm_10'])
    for actual, inputs in ((y_train, X_train), (y_test, X_test))
)

results = pd.DataFrame(
    [train_metrics, test_metrics],
    index=['train', 'test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the (shifted) gameweek_xPoints_ewm_20 column itself as the prediction.
train_metrics, test_metrics = (
    calculate_performance_metrics(actual, inputs['gameweek_xPoints_ewm_20'])
    for actual, inputs in ((y_train, X_train), (y_test, X_test))
)

results = pd.DataFrame(
    [train_metrics, test_metrics],
    index=['train', 'test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the (shifted) gameweek_xPoints_ewm_40 column itself as the prediction.
train_metrics, test_metrics = (
    calculate_performance_metrics(actual, inputs['gameweek_xPoints_ewm_40'])
    for actual, inputs in ((y_train, X_train), (y_test, X_test))
)

results = pd.DataFrame(
    [train_metrics, test_metrics],
    index=['train', 'test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

# Ridge regression

In [None]:
# Ridge pipeline: standardize the features, mean-impute what is still missing, then fit a
# ridge regression whose penalty is picked from 13 log-spaced alphas by 4-fold
# (unshuffled) cross-validation.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('mean_imputer', SimpleImputer(strategy='mean')),
    ('regression', linear_model.RidgeCV(
        alphas=np.logspace(start=-6, stop=6, num=13),
        cv=KFold(n_splits=4, shuffle=False),
    )),
])

model.fit(X_train, y_train.to_numpy().reshape(-1, 1))

In [None]:
# Score the fitted pipeline on the test rows.
y_pred = model.predict(X_test)
test_metrics = calculate_performance_metrics(y_test, y_pred)
results = pd.DataFrame(
    np.asarray(test_metrics).reshape(1, -1),
    index=['test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

# Random forest

# CatBoost

In [None]:
# TEST DEFAULT PARAMS

# Fit CatBoost with its default settings as a reference point for the tuned model.
cat_model = catboost.CatBoostRegressor()
cat_model.fit(X_train, y_train)

# test predictions, labelled with the test rows' index
y_predicted = pd.Series(data=cat_model.predict(X_test), index=X_test.index)

test_metrics = calculate_performance_metrics(y_test, y_predicted)
results = pd.DataFrame(
    np.asarray(test_metrics).reshape(1, -1),
    index=['test'],
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# PARAMETERS

# Search space handed to Optuna: [low, high] bounds for the learning rate and the tree
# depth, plus the candidate CatBoost objectives.
cat_optuna_params = {
    'cat_learning_rate_bounds': [0.001, 0.5],
    'cat_depth_bounds': [3, 10],
    'cat_objective_list': ['RMSE', 'MAE'],
}

In [None]:
# OPTIMIZATION
n_trials = 2

folds = KFold(n_splits=4, shuffle=False) 
test_metric='RMSE'

best_params = {}
metrics = {}
metrics, best_params, cat_model, shap_values, study = cat_optuna(X_train, y_train, X_test, y_test, 
               cat_optuna_params, best_params, metrics, folds, test_metric, n_optimization_trials=n_trials)

print('OPTIMIZATION TRIALS DATA')
# study.get_trials() deep-copies every trial on each call, so fetch the list once instead
# of once per index. `.value` is the single-objective score (None for a trial without one).
all_trials = study.get_trials()
test_scores = [trial.value for trial in all_trials]
params = [pd.DataFrame(trial.params, index=[i]) for i, trial in enumerate(all_trials)]
trial_data = pd.concat(params)
trial_data[f'test {test_metric}'] = test_scores
display(trial_data)

print('BEST PARAMETERS TEST SCORES')
results = pd.DataFrame(np.array([metrics['test']]), index=['test'], columns=('MAE', 'RMSE', 'r2'))
display(results)

In [None]:
# Rebuild the per-trial summary table: one row of sampled parameters per trial plus its score.
# study.get_trials() deep-copies every trial on each call, so fetch the list once instead
# of once per index. `.value` is the single-objective score (None for a trial without one).
all_trials = study.get_trials()
test_scores = [trial.value for trial in all_trials]
params = [pd.DataFrame(trial.params, index=[i]) for i, trial in enumerate(all_trials)]
trial_data = pd.concat(params)
trial_data[f'test {test_metric}'] = test_scores
display(trial_data)


In [None]:
# Stack the single-row per-trial parameter frames into one table (no score column).
pd.concat(params)

In [None]:
# Parameters sampled in the first trial of the study.
study.trials[0].params