In [None]:
# Gameweek number used to name the MLflow artifact of the re-fitted model
# (presumably the most recent gameweek present in the data — update before each run).
latest_gameweek = 13

In [None]:
# PARAMETERS

# Prediction horizon: which future game to predict.
# shift_param=1 means the next game, 2 the game after that, and so on.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
shift_param = 1

In [None]:
# IMPORTS

import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
import os

import catboost 
import shap
import optuna
import mlflow

from sklearn import linear_model
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from src.utils import calculate_performance_metrics

# Raise pandas' display limits to 200 columns / 200 rows so wide frames are not truncated.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

# Fetch data

In [None]:
# Read the FPL dataset; the first CSV column is used as the row index.
filepath = Path('../data/fpl_df.csv')
fpl_df = pd.read_csv(filepath, low_memory=False, index_col=0)

# Preview the first rows together with the overall (rows, columns) shape.
display(fpl_df.head(), fpl_df.shape)

# Fetch model

In [None]:
# Load the prediction model and recover its creation date from the file name.
model = catboost.CatBoostRegressor()
path = Path('../models/catboost_20230809-201635.cbm')

# The file stem looks like 'catboost_<YYYYMMDD>-<HHMMSS>'. Parse the timestamp from
# the stem instead of slicing the full path string at fixed character offsets, which
# breaks silently as soon as the directory or the file prefix changes length.
# strptime raises ValueError if the name does not follow the expected pattern.
model_timestamp = path.stem.rsplit('_', 1)[-1]
model_creation_date = dt.datetime.strptime(model_timestamp, '%Y%m%d-%H%M%S').date()

# Kept as separate names for any later cell that still reads them.
model_creation_year = model_creation_date.year
model_creation_month = model_creation_date.month
model_creation_day = model_creation_date.day

model.load_model(path)
print(f'Model created on {model_creation_date}')

# Test model predictions

Test model predictions on new data collected after the model was trained.

In [None]:
# Column names fed to the model. The order below is significant: it reproduces the
# original hand-written list exactly, so do not reorder the building blocks.
# NOTE(review): the "shift" / "no_shift" split presumably refers to lagging applied
# when the dataset was built — confirm against the data-preparation code.
features_no_shift = ['element_type', 'home']

# Rolling-window sizes used by the exponentially weighted ("ewm") features.
_ewm_windows = (5, 10, 20, 40)

# Per-player snapshot columns.
_player_snapshot = [
    'corners_and_indirect_freekicks_order', 'creativity_rank',
    'direct_freekicks_order', 'ict_index_rank', 'influence_rank',
    'minutes', 'now_cost', 'penalties_order', 'points_per_game',
    'selected_by_percent', 'threat_rank',
]

# Team / opponent xG and xGA for every window: side, then stat, then window.
_team_strength = [
    f'{side}_{stat}_ewm_{window}'
    for side in ('team', 'opponent')
    for stat in ('xG', 'xGA')
    for window in _ewm_windows
]

# Per-gameweek player statistics that get aggregated in several ways below.
_gameweek_stats = [
    'gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points',
    'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
    'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA',
    'gameweek_minutes', 'gameweek_xPoints',
]

# Window-major order: all stats for window 5, then 10, 20 and 40.
_player_ewm = [
    f'{stat}_ewm_{window}'
    for window in _ewm_windows
    for stat in _gameweek_stats
]

_player_expanding = [f'{stat}_expanding' for stat in _gameweek_stats]

# Minutes has no per-90 variant in the feature set.
_player_expanding_per90 = [
    f'{stat}_expanding_per90'
    for stat in _gameweek_stats
    if stat != 'gameweek_minutes'
]

features_shift = (
    _player_snapshot
    + _team_strength
    + _player_ewm
    + _player_expanding
    + _player_expanding_per90
    + ['xG_overperformance']
)

features = features_no_shift + features_shift

target = ['event_points']

In [None]:
# Keep only rows whose retrieval date is strictly after the model's creation date.
data_retrieval_times = pd.to_datetime(fpl_df['data_retrieved_datetime']).dt.date
collected_after_training = data_retrieval_times > model_creation_date

# Renumber rows 0..n-1 in the filtered frame.
data_for_evaluation = fpl_df.loc[collected_after_training].reset_index(drop=True)
display(data_for_evaluation.shape)

In [None]:
# Split the evaluation rows into model inputs and the observed target,
# then score them with the loaded model.
X = data_for_evaluation.loc[:, features].copy()
y_true = data_for_evaluation.loc[:, target]
y_predicted = model.predict(X)

### All new data

In [None]:
# Error metrics over all evaluation rows; the single-column target frame is
# flattened to a 1-D array before being passed to the helper.
y_true_flat = y_true.to_numpy().flatten()
mae, rmse, r2 = calculate_performance_metrics(y_true_flat, y_predicted)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'r^2: {r2}')

### Gameweek-by-gameweek results

In [None]:
# Evaluate the model separately on every gameweek present in the evaluation data.
gameweeks = data_for_evaluation.gameweek.unique()

# Flatten the single-column target frame once so targets and predictions are both
# plain 1-D arrays with the same row order.
y_true_values = y_true.to_numpy().ravel()

metrics_list = []
for my_gameweek in gameweeks:
    # A boolean mask selects rows by position in both arrays. Indexing the NumPy
    # prediction array with the frame's index labels would only be correct while
    # the frame has a fresh 0..n-1 index.
    gameweek_mask = (data_for_evaluation.gameweek == my_gameweek).to_numpy()
    mae, rmse, r2 = calculate_performance_metrics(
        y_true_values[gameweek_mask], y_predicted[gameweek_mask], plot=False
    )
    metrics_list.append([mae, rmse, r2])
    # TODO: optionally log these per-gameweek metrics to MLflow (step=my_gameweek).

metrics = pd.DataFrame(metrics_list, columns=['mae', 'rmse', 'r2'], index=gameweeks)
display(metrics)

px.line(x=metrics.index, y=metrics.r2, labels={'x':'gameweek', 'y':'r^2'}, markers=True)
    

# Re-fit existing model with new data added

In [None]:
# Fit the model on every row of the dataset and log the fitted model to MLflow
# under an artifact path named after the latest gameweek.
X = fpl_df.loc[:, features].copy()
y = fpl_df.loc[:, target]

with mlflow.start_run() as run:
    model.fit(X, y)
    mlflow.catboost.log_model(model, f'gameweek_{latest_gameweek}')