In [None]:
# PARAMETERS

# Prediction horizon, in games: shift_param=1 targets the next game, 2 the game after
# that, and so on. The shifted feature columns are lagged by this many rows per player name.
shift_param = 1

In [None]:
# IMPORTS

import pandas as pd
import numpy as np
from pathlib import Path

from catboost import CatBoostRegressor
import shap

from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Let pandas show up to 200 columns and 200 rows before truncating displayed frames.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)

# Functions

In [None]:
def calculate_performance_metrics(y_true, y_predicted):
    """Compute MAE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_predicted : array-like
        Predicted values, in the same row order as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``.
    """
    mae = mean_absolute_error(y_true, y_predicted)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
    # 1.6. Taking the square root of the MSE works on every version and, for a
    # single target column, gives the same value.
    rmse = np.sqrt(mean_squared_error(y_true, y_predicted))
    r2 = r2_score(y_true, y_predicted)
    return (mae, rmse, r2)


# Data processing

In [None]:
# Load the modelling dataset; the first CSV column is used as the row index.
filepath = Path('../../data/modelling/fpl_df.csv')
df = pd.read_csv(filepath, index_col=0)
display(df.head(), df.shape)

In [None]:
# Columns used as they are: the shift step does not touch these.
features_no_shift = ['element_type']


def _build_shift_features():
    """Return the names of the columns that are lagged before modelling.

    The names are generated from their repeating structure and are returned in a
    fixed order, which also determines the column order of the feature matrix.
    """
    ewm_windows = (5, 10, 20, 40)
    # Per-player gameweek statistics; each has `_ewm_<window>` and `_expanding` variants.
    player_stats = (
        'gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points',
        'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
        'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA',
        'gameweek_minutes', 'gameweek_xPoints',
    )

    names = [
        'corners_and_indirect_freekicks_order', 'creativity_rank',
        'direct_freekicks_order', 'ict_index_rank', 'influence_rank',
        'minutes', 'now_cost', 'penalties_order', 'points_per_game',
        'selected_by_percent', 'threat_rank',
    ]
    # Team / opponent metrics: every window of one metric before the next metric.
    for prefix in ('team_xG', 'team_xGA', 'opponent_xG', 'opponent_xGA'):
        names.extend(f'{prefix}_ewm_{window}' for window in ewm_windows)
    # Player statistics: every statistic of one window before the next window.
    for window in ewm_windows:
        names.extend(f'{stat}_ewm_{window}' for stat in player_stats)
    names.extend(f'{stat}_expanding' for stat in player_stats)
    # The per-90 group has no entry for minutes.
    names.extend(
        f'{stat}_expanding_per90' for stat in player_stats if stat != 'gameweek_minutes'
    )
    names.append('xG_overperformance')
    return names


features_shift = _build_shift_features()

# Prediction target, kept as a one-element list of column names.
target = ['event_points']

In [None]:
# Shift the given features: within each `web_name` group, move them by `shift_param`
# rows so that (for a positive value) a row carries the values of an earlier row.
# NOTE(review): this relies on rows being in chronological order within each group and
# on `web_name` identifying a single player; confirm both against the upstream data.
# NOTE: the columns are overwritten in `df`, so re-running this cell shifts them again.
df[features_shift] = df.groupby('web_name')[features_shift].shift(periods=shift_param)
display(df.head(), df.tail(), df.shape)

In [None]:
# Look at the rows of one player name ('Kane') to eyeball the result.
df.loc[df['web_name'] == 'Kane']

In [None]:
# Fraction of missing values in each column
df.isnull().mean()

In [None]:
# Fraction of rows that have more than 4 missing values
df.isnull().sum(axis=1).gt(4).mean()

In [None]:
# Keep only rows with at most 4 missing values (counted across every column of `df`),
# then renumber the index from 0.
nulls_per_row = df.isnull().sum(axis=1)
df = df.loc[nulls_per_row.le(4)].reset_index(drop=True)
display(df.shape)

In [None]:
# Feature matrix: the unshifted columns first, followed by the shifted ones.
feature_columns = [*features_no_shift, *features_shift]
X = df[feature_columns].copy()
# `target` is a list of column names, so `y` is a one-column DataFrame.
y = df[target].copy()

display(X.shape, y.shape)

# Split data to train and test sets

In [None]:
# Share of all rows that belong to season 22-23
len(df.loc[df['season'] == '22-23']) / len(df)

Use season 22-23 for testing, rest for training.

In [None]:
# Row labels for training (every season except 22-23) and for testing (season 22-23).
train_ix = df.loc[df['season'].ne('22-23')].index
test_ix = df.loc[df['season'].eq('22-23')].index
print(f'Train data size: {len(train_ix)}')
print(f'Test data size: {len(test_ix)}')

In [None]:
# Show both index objects to check the split visually.
display(train_ix, test_ix)

In [None]:
# Select the train and test rows of the features and the target by index label.
X_train, X_test = X.loc[train_ix], X.loc[test_ix]
y_train, y_test = y.loc[train_ix], y.loc[test_ix]

display(X_train, X_test, y_train, y_test)

# Baseline models

In [None]:
# Baseline: use the `points_per_game` feature column directly as the prediction.
baseline_feature = 'points_per_game'
train_metrics = calculate_performance_metrics(y_train, X_train[baseline_feature])
test_metrics = calculate_performance_metrics(y_test, X_test[baseline_feature])

# One row per data split, one column per metric.
results = pd.DataFrame.from_dict(
    {'train': train_metrics, 'test': test_metrics},
    orient='index',
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the `gameweek_xPoints_ewm_5` feature column directly as the prediction.
baseline_feature = 'gameweek_xPoints_ewm_5'
train_metrics = calculate_performance_metrics(y_train, X_train[baseline_feature])
test_metrics = calculate_performance_metrics(y_test, X_test[baseline_feature])

# One row per data split, one column per metric.
results = pd.DataFrame.from_dict(
    {'train': train_metrics, 'test': test_metrics},
    orient='index',
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the `gameweek_xPoints_ewm_10` feature column directly as the prediction.
baseline_feature = 'gameweek_xPoints_ewm_10'
train_metrics = calculate_performance_metrics(y_train, X_train[baseline_feature])
test_metrics = calculate_performance_metrics(y_test, X_test[baseline_feature])

# One row per data split, one column per metric.
results = pd.DataFrame.from_dict(
    {'train': train_metrics, 'test': test_metrics},
    orient='index',
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the `gameweek_xPoints_ewm_20` feature column directly as the prediction.
baseline_feature = 'gameweek_xPoints_ewm_20'
train_metrics = calculate_performance_metrics(y_train, X_train[baseline_feature])
test_metrics = calculate_performance_metrics(y_test, X_test[baseline_feature])

# One row per data split, one column per metric.
results = pd.DataFrame.from_dict(
    {'train': train_metrics, 'test': test_metrics},
    orient='index',
    columns=['MAE', 'RMSE', 'r2'],
)
results

In [None]:
# Baseline: use the `gameweek_xPoints_ewm_40` feature column directly as the prediction.
baseline_feature = 'gameweek_xPoints_ewm_40'
train_metrics = calculate_performance_metrics(y_train, X_train[baseline_feature])
test_metrics = calculate_performance_metrics(y_test, X_test[baseline_feature])

# One row per data split, one column per metric.
results = pd.DataFrame.from_dict(
    {'train': train_metrics, 'test': test_metrics},
    orient='index',
    columns=['MAE', 'RMSE', 'r2'],
)
results

# Ridge regression

# Random forest

# CatBoost