In [None]:
# Notebook-level parameters (kept as plain one-per-line assignments).
# NOTE(review): latest_gameweek is not referenced in any of the visible cells —
# confirm whether it is still needed (e.g. injected by an external runner).
latest_gameweek = 0
# Number of rows the columns in features_shift are shifted by within each
# player group (passed to .shift() in the "shift given features" cell).
shift_param = 1

In [None]:
import pandas as pd
import numpy as np
import optuna
import shap
from pathlib import Path
import datetime as dt
import json
import pickle

In [None]:
from src.model_training import cross_validation, optuna_objective_xgboost, plot_optuna_study, train_xgboost

In [None]:
# Load the modelling dataset; the first CSV column becomes the row index and
# the retrieval timestamp is parsed to datetime so it can be compared to dates.
filepath = Path('../data/modeling/fpl_df.csv')
fpl_df = pd.read_csv(filepath, index_col=0, low_memory=False).assign(
    data_retrieved_datetime=lambda frame: pd.to_datetime(frame['data_retrieved_datetime'])
)
display(fpl_df.head(), fpl_df.shape)

In [None]:
# Number of rows available per season.
fpl_df['season'].value_counts()

In [None]:
# Window sizes that appear as the suffix of every *_ewm_<span> column.
ewm_spans = (5, 10, 20, 40)

# Features used as-is (they are not shifted in the "shift given features" cell).
features_no_shift = ['element_type', 'home'] + [
    f'opponent_{metric}_ewm_{span}'
    for metric in ('xG', 'xGA')
    for span in ewm_spans
]

# Per-player statistics that exist in ewm / expanding / expanding_per90 variants.
player_stat_columns = [
    'gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points',
    'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
    'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA',
    'gameweek_minutes', 'gameweek_xPoints',
]

# Features that get shifted per player by shift_param rows before modelling.
# The concatenation order below reproduces the original column order exactly.
features_shift = (
    [
        'corners_and_indirect_freekicks_order',
        'creativity_rank',
        'direct_freekicks_order',
        'ict_index_rank',
        'influence_rank',
        'minutes',
        'now_cost',
        'penalties_order',
        'points_per_game',
        'selected_by_percent',
        'threat_rank',
    ]
    + [f'team_{metric}_ewm_{span}' for metric in ('xG', 'xGA') for span in ewm_spans]
    + [f'{stat}_ewm_{span}' for span in ewm_spans for stat in player_stat_columns]
    + [f'{stat}_expanding' for stat in player_stat_columns]
    # there is no per-90 variant of the minutes column
    + [f'{stat}_expanding_per90' for stat in player_stat_columns if stat != 'gameweek_minutes']
    + ['xG_overperformance']
)

# Regression target (kept as a list, so selecting it yields a one-column frame).
target = ['event_points']

In [None]:
# Shift the columns in features_shift by shift_param rows within each player,
# working on a copy so fpl_df stays untouched.
# NOTE(review): players are keyed by (first_name, second_name) and the shift
# follows the existing row order — presumably rows are already chronological
# per player; confirm against the data preparation step.
df = fpl_df.copy()
rows_by_player = df.groupby(['first_name', 'second_name'])
df[features_shift] = rows_by_player[features_shift].shift(shift_param)
display(df.head(), df.tail(), df.shape)

In [None]:
# Histogram of the number of missing values per row.
df.isna().sum(axis=1).plot.hist()

In [None]:
# Keep only rows with at most 90 missing values, then renumber the index from 0.
missing_per_row = df.isna().sum(axis=1)
df = df.loc[missing_per_row <= 90].reset_index(drop=True)
display(df.shape)

## Train-test split

In [None]:
# Time-based split: rows retrieved after the cutoff form the test set; every
# other row (the negated mask) goes to the training set.
is_test_row = df['data_retrieved_datetime'] > '1-1-2024'
train_index = df.loc[~is_test_row].index
test_index = df.loc[is_test_row].index
display(train_index)
display(test_index)

In [None]:
# Model inputs: unshifted features first, then the shifted ones.
model_features = features_no_shift + features_shift

# Full-data frames (used for the final refit) ...
X = df[model_features].copy()
y = df[target].copy()

# ... and the train / test partitions selected by index label.
X_train = df.loc[train_index, model_features].copy()
X_test = df.loc[test_index, model_features].copy()
y_train = df.loc[train_index, target].copy()
y_test = df.loc[test_index, target].copy()

## Hyper-parameter optimization

In [None]:
# Two-element [low, high] bounds per hyper-parameter, handed to
# optuna_objective_xgboost in the optimisation cell.
optuna_bounds = {
    'learning_rate': [0.001, 0.1],
    'max_depth': [4, 10],
    'min_child_weight': [0.01, 1],
    'gamma': [0, 0.5],
    'subsample': [0.5, 1],
    'colsample_bytree': [0.5, 1],
}

In [None]:
# choose max number of trials for optimization
n_optimization_trials = 200

# Seed the sampler so a re-run proposes the same sequence of hyper-parameters
# (full reproducibility also requires the objective itself to be deterministic).
# 'minimize' is Optuna's default direction; it is spelled out because the
# best value reported below is the smallest objective value found.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(
    lambda trial: optuna_objective_xgboost(trial, optuna_bounds, X_train, y_train),
    n_trials=n_optimization_trials,
)

plot_optuna_study(study)

print('Best params:')
print(study.best_params)
# The label used to say "logloss", but this notebook fits a regression target
# and selects on RMSE in the next cell; the metric actually returned by
# optuna_objective_xgboost is defined in src.model_training, so the label is
# kept metric-neutral.
print('Objective value for best params:')
print(study.best_value)

In [None]:
# Find the number of estimators for the best params: pick the row of the
# cross-validation frame with the lowest mean test RMSE ('test-rmse-mean').
# NOTE(review): presumably cv_df has one row per boosting round with a 0-based
# integer index, hence the +1 — confirm against src.model_training.cross_validation.
cv_df = cross_validation(X_train, y_train, study.best_params)
# idxmin returns the label of the first minimum directly, avoiding a full sort
# whose tie-breaking order is not guaranteed.
n_estimators = cv_df['test-rmse-mean'].idxmin() + 1
cv_df[['train-rmse-mean', 'test-rmse-mean']].plot();
print(f'Number of estimators: {n_estimators}')

## Test best model

In [None]:
# Final parameter set: the tuned hyper-parameters plus the number of estimators
# selected from cross-validation (cast to a plain int).
xgb_params = {**study.best_params, 'n_estimators': int(n_estimators)}

# Fit on the training split and score on the test split.
model, results = train_xgboost(xgb_params, X_train, y_train, X_test, y_test)

In [None]:
# Collect the parameters and the train/test metrics in one flat dict; this is
# what gets written to the JSON results file later on.
results_dict = {
    **xgb_params,
    **{
        metric: results[metric]
        for metric in ('train_rmse', 'test_rmse', 'train_r2', 'test_r2')
    },
}
results_dict

In [None]:
# SHAP feature importance for the model fitted on the training split.
explainer = shap.Explainer(model.predict, X_train)
# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
shap_sample = X_train.sample(min(2000, len(X_train)), random_state=42)
shap_values = explainer(shap_sample)
shap.plots.bar(shap_values, max_display=20)

## Train model with full data and save model

In [None]:
# Refit on the full dataset (X, y) with the tuned parameters; no separate
# evaluation split is passed here. This rebinds `model` and `results`, so the
# cells below (SHAP analysis, saving) operate on the full-data model.
model, results = train_xgboost(xgb_params, X, y)

In [None]:
# SHAP feature importance for the model refitted on the full dataset.
explainer = shap.Explainer(model.predict, X)
# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
shap_sample = X.sample(min(2000, len(X)), random_state=42)
shap_values = explainer(shap_sample)
shap.plots.bar(shap_values, max_display=20)

In [None]:
# SAVE MODEL AND RESULTS
# One timestamp is shared by both artefacts so the model and its results file
# can be matched by name.
time_stamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S")

# save model as pickle file; the context manager guarantees the handle is
# flushed and closed even if pickling fails
model_path = Path(f"../models/xgboost_{time_stamp}.pkl")
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# save results (parameters plus train/test metrics) as JSON
results_path = Path(f'../training_results/xgboost_{time_stamp}.json')
with open(results_path, 'w') as json_file:
    json.dump(results_dict, json_file)

## Further SHAP analysis

In [None]:
# Goalkeepers
# NOTE(review): assumes element_type == 1 encodes goalkeepers, as the cell
# title suggests — confirm against the data dictionary.
explainer = shap.Explainer(model.predict, X)
goalkeeper_rows = X.loc[X['element_type'] == 1]  # all matching rows, no sub-sampling
shap_values = explainer(goalkeeper_rows)
shap.plots.bar(shap_values, max_display=20)

## Misc: Model performance check

In [None]:
# Load a previously saved model from ../models for a performance check.
# NOTE(review): season_folder is not used in the visible cells — confirm
# whether it is still needed.
season_folder = 'season24_25'
model_file_name = 'xgboost_20240813-184939.pkl'
path = Path(f'../models/{model_file_name}')
# NOTE(security): unpickling executes arbitrary code, so only load model files
# that were produced by this project.
# The context manager closes the file handle once the model is loaded.
with open(path, 'rb') as model_file:
    model = pickle.load(model_file)
display(model)

In [None]:
from sklearn.metrics import root_mean_squared_error, r2_score

In [None]:
# Score the currently bound `model` on the training split.
# NOTE(review): if this model was fitted on the full dataset (as the
# "Train model with full data" section does), the test rows below were seen
# during training and the test metrics are not a true hold-out estimate.
train_predictions = model.predict(X_train)
rmse_train = root_mean_squared_error(y_train, train_predictions)
r2_train = r2_score(y_train, train_predictions)
print(f'RMSE (train): {rmse_train}')
print(f'R^2 (train): {r2_train}')

# Same two metrics on the test split.
test_predictions = model.predict(X_test)
rmse_test = root_mean_squared_error(y_test, test_predictions)
r2_test = r2_score(y_test, test_predictions)
print(f'RMSE (test): {rmse_test}')
print(f'R^2 (test): {r2_test}')