In [1]:
# Gameweek number used at the end of this notebook to name the re-fitted model
# artifact logged to MLflow (`gameweek_<n>`).
# NOTE(review): presumably the latest gameweek present in the data -- update before each run.
latest_gameweek = 12

In [2]:
# PARAMETERS

# Prediction horizon: which future game the prediction is for.
# shift_param=1 means the next game, 2 means the one after that, and so on.
# NOTE(review): shift_param is not referenced by any other cell shown in this
# notebook -- confirm whether the feature shifting happens upstream or is missing here.
shift_param = 1

In [3]:
# IMPORTS

import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
import os

import catboost 
import shap
import optuna
import mlflow

from sklearn import linear_model
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from src.utils import calculate_performance_metrics

# Raise pandas' display limits so that wide and long frames are rendered in
# full (up to 200 columns / 200 rows) instead of being truncated.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


# Fetch data

In [4]:
# Read the prepared FPL dataset from disk. The first CSV column is used as the
# row index; low_memory=False makes pandas read the file in one pass.
filepath = Path('../data/fpl_df.csv')
fpl_df = pd.read_csv(filepath, low_memory=False, index_col=0)

# Show a preview of the frame followed by its (rows, columns) shape.
display(fpl_df.head(), fpl_df.shape)

Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes,team_xG,team_xGA,team_xG_ewm_5,team_xG_ewm_10,team_xG_ewm_20,team_xG_ewm_40,team_xGA_ewm_5,team_xGA_ewm_10,team_xGA_ewm_20,team_xGA_ewm_40,opponent_xG,opponent_xGA,opponent_xG_ewm_5,opponent_xG_ewm_10,opponent_xG_ewm_20,opponent_xG_ewm_40,opponent_xGA_ewm_5,opponent_xGA_ewm_10,opponent_xGA_ewm_20,opponent_xGA_ewm_40,home,gameweek_assists,gameweek_bps,gameweek_creativity,gameweek_goals_scored,gameweek_goals_conceded,gameweek_own_goals,gameweek_penalties_saved,gameweek_red_cards,gameweek_saves,gameweek_threat,gameweek_yellow_cards,gameweek_xPoints,gameweek_assists_ewm_5,gameweek_bps_ewm_5,gameweek_creativity_ewm_5,event_points_ewm_5,gameweek_goals_scored_ewm_5,gameweek_goals_conceded_ewm_5,gameweek_saves_ewm_5,gameweek_threat_ewm_5,gameweek_xG_ewm_5,gameweek_xA_ewm_5,gameweek_xGA_ewm_5,gameweek_minutes_ewm_5,gameweek_xPoints_ewm_5,gameweek_assists_ewm_10,gameweek_bps_ewm_10,gameweek_creativity_ewm_10,event_points_ewm_10,gameweek_goals_scored_ewm_10,gameweek_goals_conceded_ewm_10,gameweek_saves_ewm_10,gameweek_threat_ewm_10,gameweek_xG_ewm_10,...,gameweek_goals_conceded_ewm_20,gameweek_saves_ewm_20,gameweek_threat_ewm_20,gameweek_xG_ewm_20,gameweek_xA_ewm_20,gameweek_xGA_ewm_20,gameweek_minutes_ewm_20,gameweek_xPoints_ewm_20,gameweek_assists_ewm_40,gameweek_bps_ewm_40,gameweek_creativity_ewm_40,event_points_ewm_40,gameweek_goals_scored_ewm_40,gameweek_goals_conceded_ew
m_40,gameweek_saves_ewm_40,gameweek_threat_ewm_40,gameweek_xG_ewm_40,gameweek_xA_ewm_40,gameweek_xGA_ewm_40,gameweek_minutes_ewm_40,gameweek_xPoints_ewm_40,gameweek_assists_expanding,gameweek_bps_expanding,gameweek_creativity_expanding,event_points_expanding,gameweek_goals_scored_expanding,gameweek_goals_conceded_expanding,gameweek_saves_expanding,gameweek_threat_expanding,gameweek_xG_expanding,gameweek_xA_expanding,gameweek_xGA_expanding,gameweek_minutes_expanding,gameweek_xPoints_expanding,gameweek_assists_expanding_per90,gameweek_bps_expanding_per90,gameweek_creativity_expanding_per90,event_points_expanding_per90,gameweek_goals_scored_expanding_per90,gameweek_goals_conceded_expanding_per90,gameweek_saves_expanding_per90,gameweek_threat_expanding_per90,gameweek_xG_expanding_per90,gameweek_xA_expanding_per90,gameweek_xGA_expanding_per90,gameweek_minutes_expanding_per90,gameweek_xPoints_expanding_per90,xG_overperformance,chance_of_playing_next_round,chance_of_playing_this_round,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,ep_next,ep_this,form,id,in_dreamteam,news,news_added,photo,special,squad_number,status,team,team_code,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,corners_and_indirect_freekicks_text,direct_freekicks_text,penalties_text,expected_goals_per_90,saves_per_90,expected_assists_per_90,expected_goal_involvements_per_90,expected_goals_conceded_per_90,goals_conceded_per_90,now_cost_rank,now_cost_rank_type,form_rank,form_rank_type,points_per_game_rank,points_per_game_rank_type,selected_rank,selected_rank_type,starts_per_90,clean_sheets_per_90,name,data_retrieved_datetime
0,0,0,3,0,,0.0,493,188,4.0,0,2,1,David,0,0,0.0,497,188,0.0,490,188,1,55,0,0,,0,1.0,0,0,Luiz Moreira Marinho,0.9,0.0,479,186,1,David Luiz,0,Arsenal,2,20-21,0.0,0.0,1.9,1.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,270.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,171.0,90.0,89.445204,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,0,9,0,,5.4,176,43,,0,2,2,Sead,1,0,1.3,259,91,3.6,252,92,90,49,0,0,,0,2.0,0,0,Kolasinac,0.3,4.0,172,48,2,Kolasinac,0,Arsenal,2,20-21,0.0,0.0,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,9.0,5.4,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,2.033159,0.0,9.0,5.4,2.0,0.0,1.0,0.0,4.0,0.0,0.0,1.9,90.0,2.033159,0.0,9.0,5.4,2.0,0.0,1.0,0.0,4.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.9,90.0,2.033159,0.0,9.0,5.4,2.0,0.0,1.0,0.0,4.0,0.0,0.0,1.9,90.0,2.033159,0.0,9.0,5.4,2.0,0.0,1.0,0.0,4.0,0.0,0.0,1.9,90.0,2.033159,0.0,9.0,5.4,2.0,0.0,1.0,0.0,4.0,0.0,0.0,1.9,90.0,2.033159,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0,0,6,0,1.0,17.6,97,64,3.0,0,3,1,Nicolas,0,0,3.4,184,86,3.0,257,115,41,78,0,0,2.0,0,1.0,0,0,Pépé,1.1,14.0,123,69,2,Pépé,0,Arsenal,2,20-21,0.1,0.0,1.9,41.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,6.0,17.6,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,1.500366,0.0,6.0,17.6,1.0,0.0,0.0,0.0,14.0,0.1,0.0,1.9,41.0,1.500366,0.0,6.0,17.6,1.0,0.0,0.0,0.0,14.0,0.1,...,0.0,0.0,14.0,0.1,0.0,1.9,41.0,1.500366,0.0,6.0,17.6,1.0,0.0,0.0,0.0,14.0,0.1,0.0,1.9,41.0,1.500366,0.0,6.0,17.6,1.0,0.0,0.0,0.0,14.0,0.1,0.0,1.9,41.0,1.500366,0.0,13.170732,38.634146,2.195122,0.0,0.0,0.0,30.731707,0.219512,0.0,4.170732,90.0,3.293487,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0,3,34,0,,0.1,276,37,,0,4,8,Edward,0,1,5.5,118,23,34.0,95,16,16,59,0,0,,0,4.5,0,0,Nketiah,1.2,21.0,93,27,9,Nketiah,0,Arsenal,2,20-21,0.9,0.0,1.9,16.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,34.0,0.1,1.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,6.641532,0.0,34.0,0.1,8.0,1.0,0.0,0.0,21.0,0.9,0.0,1.9,16.0,6.641532,0.0,34.0,0.1,8.0,1.0,0.0,0.0,21.0,0.9,...,0.0,0.0,21.0,0.9,0.0,1.9,16.0,6.641532,0.0,34.0,0.1,8.0,1.0,0.0,0.0,21.0,0.9,0.0,1.9,16.0,6.641532,0.0,34.0,0.1,8.0,1.0,0.0,0.0,21.0,0.9,0.0,1.9,16.0,6.641532,0.0,191.25,0.5625,45.0,5.625,0.0,0.0,118.125,5.0625,0.0,10.6875,90.0,37.358617,1.111111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0,0,8,0,3.0,20.3,89,58,,0,3,2,Bukayo,1,0,5.8,110,58,7.4,222,88,88,54,0,0,,0,2.0,0,0,Saka,4.4,30.0,64,27,2,Saka,0,Arsenal,2,20-21,0.1,0.0,1.9,88.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,8.0,20.3,0.0,1.0,0.0,0.0,0.0,0.0,30.0,0.0,2.650318,0.0,8.0,20.3,2.0,0.0,1.0,0.0,30.0,0.1,0.0,1.9,88.0,2.650318,0.0,8.0,20.3,2.0,0.0,1.0,0.0,30.0,0.1,...,1.0,0.0,30.0,0.1,0.0,1.9,88.0,2.650318,0.0,8.0,20.3,2.0,0.0,1.0,0.0,30.0,0.1,0.0,1.9,88.0,2.650318,0.0,8.0,20.3,2.0,0.0,1.0,0.0,30.0,0.1,0.0,1.9,88.0,2.650318,0.0,8.181818,20.761364,2.045455,0.0,1.022727,0.0,30.681818,0.102273,0.0,1.943182,90.0,2.710552,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


(22441, 209)

# Fetch model

In [5]:
# Load the previously trained CatBoost regressor from disk.
model = catboost.CatBoostRegressor()
path = Path('../models/catboost_20230809-201635.cbm')

# The file stem is expected to look like '<prefix>_YYYYMMDD-HHMMSS'.
# Parse the timestamp from the stem instead of slicing str(path) at fixed
# character offsets, so that changing the directory or the filename prefix
# cannot silently shift the digits. A malformed name raises ValueError here.
model_timestamp = path.stem.rsplit('_', 1)[-1]
model_creation_date = dt.datetime.strptime(model_timestamp, '%Y%m%d-%H%M%S').date()

# Kept for backward compatibility with any cell that reads the individual parts.
model_creation_year = model_creation_date.year
model_creation_month = model_creation_date.month
model_creation_day = model_creation_date.day

model.load_model(path)
print(f'Model created on {model_creation_date}')

Model created on 2023-08-09


# Test model predictions

Test model predictions on new data collected after the model was trained.

In [6]:
# Columns listed separately from the "shift" group below.
features_no_shift = ['element_type', 'home']

# --- building blocks for features_shift -------------------------------------
# The names are generated rather than typed out; the resulting order is the
# same as the hand-written list this replaces, which matters because the
# feature matrix is built by selecting columns in this order.

# Player-level FPL columns (ranks, set-piece orders, cost, ownership, ...).
_player_snapshot = [
    'corners_and_indirect_freekicks_order', 'creativity_rank',
    'direct_freekicks_order', 'ict_index_rank', 'influence_rank',
    'minutes', 'now_cost', 'penalties_order', 'points_per_game',
    'selected_by_percent', 'threat_rank',
]

# Spans of the exponentially weighted ("ewm") columns.
_ewm_spans = (5, 10, 20, 40)

# team/opponent xG and xGA columns: team_xG_*, team_xGA_*, opponent_xG_*, opponent_xGA_*.
_team_strength = [
    f'{side}_{stat}_ewm_{span}'
    for side in ('team', 'opponent')
    for stat in ('xG', 'xGA')
    for span in _ewm_spans
]

# Base per-gameweek player statistics, in the order used by every aggregate below.
_gameweek_stats = [
    'gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points',
    'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
    'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA',
    'gameweek_minutes', 'gameweek_xPoints',
]

# All 13 stats for span 5, then span 10, 20 and 40.
_player_ewm = [
    f'{stat}_ewm_{span}'
    for span in _ewm_spans
    for stat in _gameweek_stats
]

# "_expanding" aggregates of the same 13 stats.
_player_expanding = [f'{stat}_expanding' for stat in _gameweek_stats]

# "_expanding_per90" variants; gameweek_minutes has no per-90 column in the feature set.
_player_expanding_per90 = [
    f'{stat}_expanding_per90'
    for stat in _gameweek_stats
    if stat != 'gameweek_minutes'
]

# Full "shift" feature group (105 columns). No shifting is performed in this cell.
features_shift = (
    _player_snapshot
    + _team_strength
    + _player_ewm
    + _player_expanding
    + _player_expanding_per90
    + ['xG_overperformance']
)

# Final model inputs: the two unshifted columns first, then the shift group.
features = features_no_shift + features_shift

# Target column, kept as a one-element list.
target = ['event_points']

In [7]:
# Calendar date on which each row was retrieved.
data_retrieval_times = pd.to_datetime(fpl_df['data_retrieved_datetime']).dt.date

# Hold-out set: rows retrieved strictly after the day the model was created.
retrieved_after_training = data_retrieval_times > model_creation_date
data_for_evaluation = fpl_df.loc[retrieved_after_training].copy()

# Renumber rows 0..n-1 so labels and positions coincide from here on.
data_for_evaluation = data_for_evaluation.reset_index(drop=True)
display(data_for_evaluation.shape)

(3602, 209)

In [8]:
# Model inputs and ground truth for the hold-out rows.
# y_true stays a one-column DataFrame because `target` is a list.
# NOTE(review): the feature columns are taken from the frame as stored; no
# shift is applied in this notebook -- confirm that matches how the model was trained.
X = data_for_evaluation.loc[:, features].copy()
y_true = data_for_evaluation.loc[:, target]

# Predictions of the loaded model, one per hold-out row.
y_predicted = model.predict(X)

### All new data

In [9]:
# Score the loaded model on every hold-out row at once.
# y_true is a one-column frame, so flatten it to a 1-D array first.
mae, rmse, r2 = calculate_performance_metrics(y_true.to_numpy().flatten(), y_predicted)

# Report the three metrics, one per line.
for metric_label, metric_value in (('MAE', mae), ('RMSE', rmse), ('r^2', r2)):
    print(f'{metric_label}: {metric_value}')

MAE: 1.862257407204785
RMSE: 2.633739571192478
r^2: 0.2070616668949261


### Gameweek-by-gameweek results

In [10]:
# Per-gameweek performance of the loaded model on the hold-out data.
gameweeks = data_for_evaluation.gameweek.unique()

# Work positionally on plain arrays: selecting rows of `y_predicted` (a NumPy
# array) with a boolean mask does not depend on data_for_evaluation having a
# freshly reset 0..n-1 index, unlike indexing it with index labels.
gameweek_values = data_for_evaluation.gameweek.to_numpy()
y_true_flat = y_true.to_numpy().flatten()

metrics_list = []
for my_gameweek in gameweeks:
    in_gameweek = gameweek_values == my_gameweek
    # Loop-local names so the overall mae/rmse/r2 computed earlier are not overwritten.
    gw_mae, gw_rmse, gw_r2 = calculate_performance_metrics(
        y_true_flat[in_gameweek], y_predicted[in_gameweek], plot=False
    )
    metrics_list.append([gw_mae, gw_rmse, gw_r2])

# One row per gameweek, in order of first appearance in the data.
metrics = pd.DataFrame(metrics_list, columns=['mae', 'rmse', 'r2'], index=gameweeks)
display(metrics)

# r^2 by gameweek; left as the last expression so the figure is rendered.
px.line(x=metrics.index, y=metrics.r2, labels={'x':'gameweek', 'y':'r^2'}, markers=True)
    

Unnamed: 0,mae,rmse,r2
1,1.55773,2.191583,0.329981
2,1.927052,2.612689,0.226927
3,1.812691,2.61317,0.22846
4,2.000013,2.82683,0.19093
5,1.732764,2.300048,0.162139
6,1.918272,2.737982,0.259489
7,1.896457,2.755213,0.174148
8,1.874056,2.619374,0.176024
9,1.864334,2.595108,0.250796
10,1.793368,2.517777,0.240856


# Re-fit existing model with new data added

In [11]:
# Training matrix and target built from the complete dataset (old and new rows).
X = fpl_df.loc[:, features].copy()
y = fpl_df.loc[:, target]

# Fit inside an MLflow run and log the result under a gameweek-specific name.
# NOTE(review): fit() is called on the same `model` object that was evaluated
# above, and X is rebound here; re-running the evaluation cells after this one
# would score the re-fitted model -- confirm that is intended.
with mlflow.start_run() as run:
    model.fit(X, y)
    artifact_name = f'gameweek_{latest_gameweek}'
    mlflow.catboost.log_model(model, artifact_name)