- Need to add data processing for gameweeks where one team has multiple games. This needs to be done in the "Add team data to FPL data" section. It should be doable by using fpl_df.data_retrieved to pick the right game (the most recent game played before the given date).
- Fix Son first_name, second_name issue

In [None]:
# number of rows (per team) by which the team ewm columns are shifted further below;
# the value is also embedded in the name of the saved shifted team data file
shift_param = 1

In [None]:
import pandas as pd
import numpy as np

from pathlib import Path
import os
import datetime as dt
import pickle

from src.utils import fetch_latest_fpl_data

from sklearn.linear_model import LogisticRegression
from scipy.stats import poisson

import matplotlib.pyplot as plt
import plotly.express as px

# let pandas show up to 500 columns and 100 rows when a frame is displayed
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [None]:
# pre-trained classifier used to estimate bonus points from a player's gameweek bps
model_path = Path("../models/logistic_regression_for_bonus_points.pkl")
with model_path.open("rb") as model_file:
    clf = pickle.load(model_file)

# Functions

In [None]:
def fpl_data_processing(df, columns):
    """Pick each row's own-gameweek xG/xA/xGA and return the requested columns.

    For every row the value of the 'gameweek' column selects which of the
    'xG_week<N>', 'xA_week<N>' and 'xGA_week<N>' columns to read. The picked
    values are written to `df` in place as 'gameweek_xG', 'gameweek_xA' and
    'gameweek_xGA'.

    df: frame with a 'gameweek' column and the per-week stat columns.
    columns: column labels to keep in the result.

    Returns a copy of df[columns].
    """
    stat_prefixes = ('xG', 'xA', 'xGA')
    picked = {prefix: [] for prefix in stat_prefixes}

    for _, record in df.iterrows():
        week = record['gameweek']
        for prefix in stat_prefixes:
            picked[prefix].append(record[f'{prefix}_week{week}'])

    for prefix in stat_prefixes:
        df[f'gameweek_{prefix}'] = picked[prefix]

    return df[columns].copy()

In [None]:
def my_fill_na(x, gameweek_col, diff_col):
    '''Return the row's per-gameweek value, falling back to the cumulative value when it is missing.

    The fallback (x[diff_col]) is used only when x[gameweek_col] is nan AND x['minutes'] is at most 90,
    i.e. for the first item of a grouped diff. Rows with more cumulative minutes keep their nan; this is
    meant for season 22-23, where data is missing for a number of weeks at the beginning of the season.'''
    gameweek_value = x[gameweek_col]
    is_missing = np.isnan(gameweek_value)
    at_most_one_match = x['minutes'] <= 90
    if is_missing & at_most_one_match:
        return x[diff_col]
    return gameweek_value
    

In [None]:
def calculate_xPoints(x,clf):
    """Expected points for a given gameweek given underlying stats for that gameweek.

    Parameters
    ----------
    x : row-like object indexable by column name (e.g. a pandas Series). Keys read:
        'element_type', 'gameweek_minutes', 'gameweek_xG', 'gameweek_xA', 'team_xGA',
        'gameweek_saves', 'gameweek_penalties_saved', 'gameweek_bps',
        'gameweek_red_cards', 'gameweek_yellow_cards', 'gameweek_own_goals'.
        'element_type' is used as a 1-based position code; codes 1 and 2 are treated as
        GK and DEF (presumably 3 = MID and 4 = FWD, as in the FPL API - confirm).
    clf : fitted classifier exposing predict_proba; it is called with the gameweek bps as
        a (1, 1) array and must return four class probabilities, which are weighted with
        0, 1, 2 and 3 bonus points.

    Returns
    -------
    float
        Sum of all expected point components, or nan when 'gameweek_bps' is nan
        (bonus points cannot be estimated then).
    """

    # points per clean sheet / per goal, indexed by element_type - 1
    clean_sheet_points = (4, 4, 1, 0)
    goal_points = (6, 6, 5, 4)

    # return nan if bonus points can't be estimated
    bps = x['gameweek_bps']
    if np.isnan(bps):
        return np.nan

    position = int(x['element_type']) - 1
    minutes = x['gameweek_minutes']
    xGA = x['team_xGA']

    # calculate expected points
    points_played = 1 if minutes > 0 else 0
    points_played_over_60 = 1 if minutes >= 60 else 0
    points_xG = goal_points[position] * x['gameweek_xG']
    points_xA = x['gameweek_xA'] * 3
    # probability that the team concedes zero goals when goals conceded ~ Poisson(team_xGA)
    clean_sheet_probability = poisson.pmf(0, xGA)
    points_clean_sheet = clean_sheet_points[position] * clean_sheet_probability if minutes >= 60 else 0
    points_saves = x['gameweek_saves'] // 3
    points_penalty_saves = x['gameweek_penalties_saved'] * 5 * 0.21 #points for save times approx. probability of penalty save
    #penalty_for_penalty_miss = x['Performance_PKatt'] * (-2*0.21) # this data only on fbref

    # estimate bonus points as the probability-weighted mean of 0..3 bonus points
    y_pred_prob = clf.predict_proba(np.array(bps).reshape(-1, 1))
    points_bonus = np.matmul(y_pred_prob, np.array([0,1,2,3]).reshape((4,1))).item()

    # penalty for possible points deductions based on goals conceded:
    # minus the probability of conceding between 2 and 11 goals.
    # (The previous expression had a misplaced parenthesis that ADDED the
    # P(10)+P(11) term instead of subtracting it.)
    # NOTE(review): every bracket (2-3, 4-5, ...) is weighted with -1; if the intent is
    # one point lost per two goals conceded, the brackets would need weights -1, -2, ... - confirm.
    xGA_conceded_penalty = -poisson.pmf(np.arange(2, 12), xGA).sum()
    # apply penalty only to GK and DEF
    if x['element_type'] not in (1, 2):
        xGA_conceded_penalty = 0
    # scale penalty with playing time
    xGA_conceded_penalty = (minutes / 90) * xGA_conceded_penalty

    # a red card outweighs a yellow card; only exactly one card of a kind is recognised
    if x['gameweek_red_cards'] == 1:
        penalty_for_cards = -3
    elif x['gameweek_yellow_cards'] == 1:
        penalty_for_cards = -1
    else:
        penalty_for_cards = 0
    penalty_for_own_goal = -2 * x['gameweek_own_goals']

    # add up all point components (all scalars, so the result converts cleanly to float)
    total_points = float(points_played + points_played_over_60 + points_xG + points_xA + points_clean_sheet + points_saves +\
                    points_penalty_saves + points_bonus + xGA_conceded_penalty +\
                    penalty_for_cards + penalty_for_own_goal)

    return total_points

# Fetch data

In [None]:
# fpl data from previous seasons
filepath = Path('../data/modeling/fpl_df.csv')
# the first csv column is used as the frame's index
fpl_df = pd.read_csv(filepath, index_col=0)
# preview the first rows and the (rows, columns) shape
display(fpl_df.head())
display(fpl_df.shape)

In [None]:
# fpl data from this season
fpl_df_new = fetch_latest_fpl_data()
# get gameweek data: difference of each cumulative stat within a player ('name');
# nans left by the diff (e.g. a player's first row) are filled with the cumulative value itself
cumulative_to_gameweek = {
    'expected_goals': 'gameweek_xG',
    'expected_assists': 'gameweek_xA',
    'expected_goals_conceded': 'gameweek_xGA',
}
for cumulative_col, gameweek_col in cumulative_to_gameweek.items():
    per_gameweek = fpl_df_new.groupby('name')[cumulative_col].diff()
    fpl_df_new[gameweek_col] = per_gameweek.fillna(fpl_df_new[cumulative_col])

display(fpl_df_new.head())
display(fpl_df_new.shape)

In [None]:
# concatenate new fpl data with old
# join='outer' keeps the union of the columns of both frames (missing values become nan);
# the index is rebuilt so that row labels are unique across old and new data
fpl_df = pd.concat([fpl_df, fpl_df_new], join='outer').reset_index(drop=True)
display(fpl_df.head())
display(fpl_df.shape)

In [None]:
# rolling team data from past seasons
filepath = Path('../data/modeling/team_data.csv')
# the first csv column is used as the frame's index
team_data = pd.read_csv(filepath, index_col=0)
# preview both ends of the data and its (rows, columns) shape
display(team_data.head())
display(team_data.tail())
display(team_data.shape)

In [None]:
# fpl fixtures data from this season
filepath = Path('../data/fixtures/fpl_fixtures.csv')
fixtures_fpl = pd.read_csv(filepath, index_col=0)
# keep finished fixtures only; take an explicit copy because new columns are assigned
# to fixtures_fpl later on, and assigning to a filtered slice triggers pandas'
# SettingWithCopyWarning
fixtures_fpl = fixtures_fpl[fixtures_fpl.finished].copy()
display(fixtures_fpl.head())
display(fixtures_fpl.shape)

In [None]:
# fbref fixtures data from this season
filepath = Path('../data/fixtures/fbref_fixtures.csv')
# the first csv column is used as the frame's index
fixtures_fbref = pd.read_csv(filepath, index_col=0)
# preview the first rows and the (rows, columns) shape
display(fixtures_fbref.head())
display(fixtures_fbref.shape)

# Data processing

### Fix certain player identification issues

In [None]:
# Heung-Min Son appears with varying first_name / second_name values in the data;
# set both to one fixed spelling on every 'Son' row so the player is identified consistently
son_rows = fpl_df.web_name=='Son'
fpl_df.loc[son_rows, ['first_name', 'second_name']] = ['Heung-Min', 'Son']

### Process FPL data

In [None]:
# minutes played in a given gameweek = change of the cumulative 'minutes' within a player and season
minutes_per_player_season = fpl_df.groupby(['first_name', 'second_name', 'season'])['minutes']
fpl_df['gameweek_minutes'] = minutes_per_player_season.diff()
# fill na caused at the start of each season by taking diff (but don't fill for season 22-23 where early season data is missing)
fpl_df['gameweek_minutes'] = fpl_df.apply(my_fill_na, axis=1, args=('gameweek_minutes', 'minutes'))

zero_minute_rows = fpl_df.gameweek_minutes==0
over_90_minute_rows = fpl_df.gameweek_minutes>90
print('Number of rows with zero minutes played in a gameweek:')
display(fpl_df[zero_minute_rows].shape[0])
print('Number of rows with over 90 minutes played in a gameweek:')
display(fpl_df[over_90_minute_rows].shape[0])

In [None]:
# check does the latest season have any problem data (ok if '23-24' does not appear here)
# seasons that contain rows with more than 90 gameweek minutes
display(fpl_df.loc[fpl_df.gameweek_minutes>90, 'season'].unique())
# seasons that contain rows with zero gameweek minutes
display(fpl_df.loc[fpl_df.gameweek_minutes==0, 'season'].unique())

In [None]:
# keep only rows where the player played more than 0 and at most 90 minutes in the gameweek
# (rows with nan gameweek minutes fail both comparisons and are dropped as well)
valid_minutes = fpl_df.gameweek_minutes.gt(0) & fpl_df.gameweek_minutes.le(90)
fpl_df = fpl_df.loc[valid_minutes].reset_index(drop=True)
display(fpl_df.head())
display(fpl_df.shape)

### Add xG data to FPL fixtures data

In [None]:
# sorted array of every distinct team name appearing as home or away team in the fpl fixtures
np.sort(pd.concat([fixtures_fpl.home_team, fixtures_fpl.away_team]).unique())

In [None]:
# map fbref team names to fpl team names
fbref_teams = np.sort(pd.concat([fixtures_fbref.Home, fixtures_fbref.Away]).unique())
fpl_teams = np.sort(pd.concat([fixtures_fpl.home_team, fixtures_fpl.away_team]).unique())
# the two sorted name lists are paired purely by position; zip() would silently truncate
# to the shorter list and shift the pairing, so refuse to continue if the counts differ
if len(fbref_teams) != len(fpl_teams):
    raise ValueError(
        f'Cannot pair team names by position: {len(fbref_teams)} fbref teams '
        f'but {len(fpl_teams)} fpl teams'
    )
team_name_dict = dict(zip(fbref_teams, fpl_teams))
# NOTE(review): positional pairing assumes both sources order the teams identically
# when sorted by name - verify the displayed mapping by eye
display(team_name_dict)

fixtures_fbref['Home'] = fixtures_fbref['Home'].apply(lambda x: team_name_dict[x])
fixtures_fbref['Away'] = fixtures_fbref['Away'].apply(lambda x: team_name_dict[x])
display(fixtures_fbref.head())

In [None]:
# look up each fpl fixture's xG values in the fbref fixtures by (home team, away team).
# Build the lookup once instead of filtering fixtures_fbref twice per fixture; the first
# fbref row is used if a pairing occurs more than once.
xg_lookup = (
    fixtures_fbref
    .drop_duplicates(subset=['Home', 'Away'])
    .set_index(['Home', 'Away'])[['xG_home', 'xG_away']]
)
fixture_keys = pd.MultiIndex.from_frame(fixtures_fpl[['home_team', 'away_team']])
# fixtures without an fbref match get nan (instead of raising), so that they show up
# in the null counts displayed below
matched_xg = xg_lookup.reindex(fixture_keys)

# assign by position: matched_xg has exactly one row per fpl fixture, in the same order
fixtures_fpl['xg_home'] = matched_xg['xG_home'].to_numpy()
fixtures_fpl['xg_away'] = matched_xg['xG_away'].to_numpy()

display(fixtures_fpl.head())
print('Nulls:')
display(fixtures_fpl[['xg_home', 'xg_away']].isnull().sum())

### Calculate exponentially weighted moving averages for each team's xG data

In [None]:
# reshape to one row per (fixture, side): 'variable' tells whether the row is the
# 'home_team' or the 'away_team' of the fixture and 'value' holds that team's name
fixtures_melt = fixtures_fpl.melt(id_vars=['xg_home', 'xg_away', 'team_h_score', 'team_a_score', 'event', 'kickoff_time', 'id'], value_vars=['home_team', 'away_team'])
# all fixtures loaded in this notebook are labelled as season 23-24
fixtures_melt['season'] = '23-24'
display(fixtures_melt)

In [None]:
# concatenate fixtures_melt with team data (previous seasons)
# previous seasons come first; ignore_index gives the combined frame a fresh 0..n-1 index
fixtures_melt = pd.concat([team_data, fixtures_melt], ignore_index=True)
display(fixtures_melt)

In [None]:
# get team's xG (home xG if at home, away xG if at an away game) and, mirrored, the xG it conceded
is_home_row = fixtures_melt['variable']=='home_team'
fixtures_melt['xG'] = np.where(is_home_row, fixtures_melt['xg_home'], fixtures_melt['xg_away'])
fixtures_melt['xGA'] = np.where(is_home_row, fixtures_melt['xg_away'], fixtures_melt['xg_home'])

# sort by date
fixtures_melt = fixtures_melt.sort_values(by='kickoff_time').reset_index(drop=True)

# calculate rolling averages (exponentially weighted, alpha = 1 / window) per team
rolling_windows = [5,10,20,40]

for i in rolling_windows:
    for stat in ('xG', 'xGA'):
        smoothed = fixtures_melt[['value', stat]].groupby(by='value').ewm(alpha=1/i).mean()
        # the grouped result is ordered by team; after reset_index 'level_1' holds the
        # original row label, so sorting on it restores the frame's row order
        fixtures_melt[f'{stat}_ewm_{i}'] = smoothed.reset_index().sort_values(by='level_1')[stat].values

display(fixtures_melt)

In [None]:
# save fixtures_melt
# NOTE(review): this writes to ../data/, whereas the past-season team data was read
# from ../data/modeling/ - confirm the differing folder is intended
filepath = Path('../data/team_data.csv')
fixtures_melt.to_csv(filepath)

In [None]:
# shift team xg data by one so that the target game result is not included
# every column whose name contains 'ewm' is shifted
cols_to_shift = [col for col in fixtures_melt if 'ewm' in col]
# shift within each team ('value') by shift_param rows; a team's first shift_param rows become nan
fixtures_melt[cols_to_shift] = fixtures_melt.groupby('value')[cols_to_shift].shift(shift_param)

In [None]:
# check what a given team's stats look like
# shows all rows of one example team
fixtures_melt[fixtures_melt.value=='Fulham']

In [None]:
# save fixtures_melt with shifted variables
# the shift size is part of the file name, e.g. team_data_shift1.csv for shift_param = 1
filepath = Path(f'../data/team_data_shift{shift_param}.csv')
fixtures_melt.to_csv(filepath)

### Add team data to FPL data

In [None]:
# columns to be fetched from team data
col_names = ['xG', 'xGA']
col_names += [f'xG_ewm_{i}' for i in rolling_windows]
col_names += [f'xGA_ewm_{i}' for i in rolling_windows]
nr_cols = len(col_names)

# only rows of season 23-24 are filled in here; select them once and reuse the selection
current_season_mask = fpl_df.season=='23-24'
current_season_rows = fpl_df[current_season_mask]

team_data = []
opponent_data = []
home_indicator = []
count_non_one_games = 0
# all players of a team share the same fixture in a gameweek, so each
# (team, gameweek, season) lookup is done once and reused for the following rows
fixture_cache = {}
for ix, row in current_season_rows.iterrows():
    cache_key = (row.team_name, row.gameweek, row.season)
    if cache_key not in fixture_cache:
        team, gameweek, season = cache_key
        games = fixtures_melt[(fixtures_melt.value==team) & (fixtures_melt.event==gameweek) & (fixtures_melt.season==season)]
        if games.shape[0]!=1:
            # zero or several games for the team in this gameweek: not supported yet
            fixture_cache[cache_key] = None
        else:
            # find the opponent of the team's single game
            home_game = games.variable.values[0]=='home_team'
            game_id = games.id.values[0]
            if home_game:
                opponent_team = fixtures_fpl.loc[(fixtures_fpl.home_team==team) & (fixtures_fpl.event==gameweek), 'away_team'].values[0]
            else:
                opponent_team = fixtures_fpl.loc[(fixtures_fpl.away_team==team) & (fixtures_fpl.event==gameweek), 'home_team'].values[0]
            # the opponent's row of the very same fixture (matched on the fixture id)
            opponent_games = fixtures_melt[(fixtures_melt.value==opponent_team) & (fixtures_melt.event==gameweek) & (fixtures_melt.season==season) & (fixtures_melt.id==game_id)]
            fixture_cache[cache_key] = (
                games[col_names].values.flatten(),
                opponent_games[col_names].values.flatten(),
                np.array([1]) if home_game else np.array([0]),
            )

    cached = fixture_cache[cache_key]
    if cached is None:
        # no usable single game: fill the row with nans and count it
        team_data.append( np.array([np.nan]*nr_cols) )
        opponent_data.append( np.array([np.nan]*nr_cols) )
        home_indicator.append( np.array([np.nan]) )
        count_non_one_games += 1
    else:
        team_values, opponent_values, home_flag = cached
        team_data.append( team_values )
        opponent_data.append( opponent_values )
        home_indicator.append( home_flag )

# the collected lists are in the order of the season rows, so they take over that index
new_col_names = ['team_'+col for col in col_names]
team_data_df = pd.DataFrame(team_data, columns=new_col_names, index=current_season_rows.index)
new_oppo_col_names = ['opponent_'+col for col in col_names]
opponent_data_df = pd.DataFrame(opponent_data, columns=new_oppo_col_names, index=current_season_rows.index)
home_indicator_df = pd.DataFrame(home_indicator, columns=['home'], index=current_season_rows.index)

fpl_df.loc[current_season_mask, new_col_names] = team_data_df
fpl_df.loc[current_season_mask, new_oppo_col_names] = opponent_data_df
fpl_df.loc[current_season_mask, 'home'] = home_indicator_df

display(fpl_df.head())
display(fpl_df.tail())
display(fpl_df.shape)
print(f'Number of non-one-games: {count_non_one_games}')


### FPL gameweek stats

In [None]:
# calculate gameweek stats by looking at differences in cumulative stats

diff_columns = ['assists', 'bps', 'creativity', 'goals_scored', 'goals_conceded', 'own_goals', 'penalties_saved',
                'red_cards', 'saves', 'threat', 'yellow_cards']

# a player is identified by first and second name within a season (the same keys as used
# for gameweek_minutes); web_name is not unique, so grouping by it would take differences
# across different players that share a web name
player_season_keys = ['first_name', 'second_name', 'season']
# the first row of each group has no previous value: fall back to the cumulative stat there,
# but only when at most 90 cumulative minutes were played (same rule as my_fill_na), so the
# gap at the start of season 22-23 is not filled
at_most_one_match = fpl_df['minutes'] <= 90

for col in diff_columns:
    gameweek_values = fpl_df.groupby(player_season_keys)[col].diff()
    use_cumulative = gameweek_values.isna() & at_most_one_match
    fpl_df[f'gameweek_{col}'] = gameweek_values.mask(use_cumulative, fpl_df[col])

In [None]:
# sanity check
# compare cumulative goals with the derived per-gameweek goals for one example player
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['goals_scored', 'gameweek_goals_scored']]

### FPL expected points

In [None]:
# expected points of every row, computed row by row with the bonus point classifier
fpl_df['gameweek_xPoints'] = fpl_df.apply(calculate_xPoints, axis=1, args=(clf,))

In [None]:
# proportion of nans (mean of the boolean null mask = null count / row count)
fpl_df['gameweek_xPoints'].isnull().mean()

In [None]:
# distribution of the expected points over all rows
fig = px.histogram(fpl_df, x='gameweek_xPoints', nbins=40)
fig.show()

In [None]:
# sanity check
# expected points of one example player
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['gameweek_xPoints']]

In [None]:
# (rows, columns) of the season 23-24 rows for which expected points could not be calculated
fpl_df.loc[(fpl_df.season=='23-24') & (fpl_df.gameweek_xPoints.isnull())].shape

### FPL moving averages

In [None]:
# calculate moving averages based on gameweek stats

ewm_columns = ['gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points', 'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
               'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA', 'gameweek_minutes', 'gameweek_xPoints']

for i in rolling_windows:
    new_columns = [col+f'_ewm_{i}' for col in ewm_columns]
    # identify a player by first and second name (as for the expanding stats): web_name is
    # not unique, so grouping by it would average over different players sharing a web name.
    # The grouped result is ordered by player; with two group keys the original row label
    # ends up in 'level_2' after reset_index, and sorting on it restores the row order.
    fpl_df[new_columns] = (
        fpl_df
        .groupby(['first_name', 'second_name'])[ewm_columns]
        .ewm(alpha=1/i)
        .mean()
        .reset_index()
        .sort_values(by='level_2')[ewm_columns]
        .values
    )
    #fpl_df[new_columns] = fpl_df.groupby('web_name')[ewm_columns].rolling(i, min_periods=1, closed='left').mean().reset_index().sort_values(by='level_1')[ewm_columns].values

display(fpl_df.head())
display(fpl_df.shape)

In [None]:
# sanity check
# per-gameweek goals next to their moving averages for one example player
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['gameweek_goals_scored', 'gameweek_goals_scored_ewm_5', 'gameweek_goals_scored_ewm_10', 
                                           'gameweek_goals_scored_ewm_20', 'gameweek_goals_scored_ewm_40']]

# FPL expanding stats

In [None]:
# gameweek stats for which a running total per player is calculated
expanding_columns = ['gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points', 'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves', 
               'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA', 'gameweek_minutes', 'gameweek_xPoints']
expanding_col_names = [col+'_expanding' for col in expanding_columns]

# running sum per player (first and second name); season is not part of the group keys,
# so the totals keep accumulating across seasons.
# The grouped result is ordered by player; after reset_index 'level_2' holds the original
# row label, and sorting on it restores the frame's row order before the values are assigned.
fpl_df[expanding_col_names] = (
    fpl_df
    .groupby(['first_name', 'second_name'])[expanding_columns]
    .expanding()
    .sum()
    .reset_index()
    .sort_values('level_2')[expanding_columns]
    .values
)

display(fpl_df.head())
display(fpl_df.shape)

In [None]:
# sanity check
# gameweek stats next to their running totals for one example player
fpl_df.loc[fpl_df.web_name=='Aubameyang', expanding_columns + expanding_col_names + ['season']]

# FPL per 90 stats

In [None]:
per_90_columns = [col+'_per90' for col in expanding_col_names]

# running total divided by the running minutes total, scaled to 90 minutes
for per_90_col, expanding_col in zip(per_90_columns, expanding_col_names):
    fpl_df[per_90_col] = fpl_df[expanding_col] / fpl_df['gameweek_minutes_expanding'] * 90

In [None]:
# sanity check
# per-90 stats next to the running totals they are derived from, for one example player
fpl_df.loc[fpl_df.web_name=='Aubameyang', per_90_columns + expanding_col_names + ['season']]

# Add xG overperformance

In [None]:
# ratio of goals scored so far to expected goals so far (above 1 = scored more than expected)
fpl_df['xG_overperformance'] = fpl_df['gameweek_goals_scored_expanding'] / fpl_df['gameweek_xG_expanding']
# fix if division with zero: goals / 0 gives inf, but 0 / 0 gives nan, so the zero
# denominator is checked as well and both cases are set to the neutral ratio 1
zero_xg = fpl_df['gameweek_xG_expanding'] == 0
fpl_df.loc[zero_xg | np.isinf(fpl_df['xG_overperformance']), 'xG_overperformance'] = 1

In [None]:
# sanity check
# goals, xG, their running totals and the resulting ratio for one example player
fpl_df.loc[fpl_df.web_name=='Son', ['gameweek_goals_scored', 'gameweek_xG', 
                'gameweek_goals_scored_expanding', 'gameweek_xG_expanding', 'gameweek_minutes_expanding',
                'xG_overperformance', 'season']]

In [None]:
# check the name columns on the 'Son' rows (first_name / second_name were overwritten earlier for these rows)
fpl_df.loc[fpl_df.web_name=='Son', ['first_name', 'second_name', 'web_name', 'season']]

# Save data

In [None]:
# write the processed fpl data to csv (the index is written as the first column)
# NOTE(review): this writes to ../data/, whereas the input was read from
# ../data/modeling/ - confirm the differing folder is intended
filepath = Path('../data/fpl_df.csv')
fpl_df.to_csv(filepath)