In [None]:
import pandas as pd
import numpy as np

from pathlib import Path
import os
import datetime as dt

# notebook display limits: show up to 500 columns and up to 100 rows of a frame
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

# Functions

In [None]:
def fpl_data_processing(df, columns):
    """Pick each row's expected-stat values for its own gameweek and keep `columns`.

    For every row, the value in `gameweek` selects which of the per-week
    columns `xG_week<N>`, `xA_week<N>` and `xGA_week<N>` is copied into the new
    columns `gameweek_xG`, `gameweek_xA` and `gameweek_xGA`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `gameweek` and, for every gameweek value present, the
        three matching `x*_week<N>` columns (numeric).
    columns : list of str
        Columns of the returned frame; may include the three `gameweek_x*`
        columns created here.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to `columns`. The input frame is left untouched.

    Raises
    ------
    KeyError
        If a per-week column for a gameweek present in `df`, or a requested
        column, does not exist.
    """
    # Read the gameweek from the column itself, not from an iterrows() row:
    # row-wise access upcasts ints to float on all-numeric frames, which would
    # turn the lookup key into e.g. 'xG_week1.0'.
    gameweeks = df['gameweek'].to_numpy()

    picked = {}
    for stat in ('xG', 'xA', 'xGA'):
        values = np.full(len(df), np.nan)
        for gameweek in pd.unique(gameweeks):
            in_week = gameweeks == gameweek
            values[in_week] = df[f'{stat}_week{gameweek}'].to_numpy()[in_week]
        picked[f'gameweek_{stat}'] = values

    # assign() builds a new frame, so the caller's frame gains no columns
    return df.assign(**picked)[columns].copy()

# Fetch data

In [None]:
# season labels, in the order the per-season files are loaded
seasons = ['20-21', '21-22', '22-23']

# columns kept for every season; the order here is the column order of the combined frame
columns = [
    'assists', 'bonus', 'bps', 'clean_sheets',
    'corners_and_indirect_freekicks_order',
    'creativity', 'creativity_rank', 'creativity_rank_type',
    'direct_freekicks_order', 'dreamteam_count', 'element_type',
    'event_points', 'first_name',
    'goals_conceded', 'goals_scored',
    'ict_index', 'ict_index_rank', 'ict_index_rank_type',
    'influence', 'influence_rank', 'influence_rank_type',
    'minutes', 'now_cost', 'own_goals',
    'penalties_missed', 'penalties_order', 'penalties_saved',
    'points_per_game', 'red_cards', 'saves', 'second_name',
    'selected_by_percent',
    'threat', 'threat_rank', 'threat_rank_type',
    'total_points', 'web_name', 'yellow_cards',
    'team_name', 'gameweek', 'season',
    'gameweek_xG', 'gameweek_xA', 'gameweek_xGA',
]

In [None]:
def _read_gameweek(filepath, gameweek, season):
    """Read one weekly FPL csv and tag every row with its gameweek number and season label."""
    weekly = pd.read_csv(filepath, index_col=0)
    weekly['gameweek'] = gameweek
    weekly['season'] = season
    return weekly


data = []

# season 20-21: gameweeks 1-38, per-week xG columns resolved by fpl_data_processing
for i in range(38):
    filepath = Path(f'../../../season20_21/data/fpl/data_week{i+1}.csv')
    gameweek_data = _read_gameweek(filepath, i+1, '20-21')
    data.append(fpl_data_processing(gameweek_data, columns))

# season 21-22: same file layout as 20-21
for i in range(38):
    filepath = Path(f'../../../season21_22/data/fpl/data_week{i+1}.csv')
    gameweek_data = _read_gameweek(filepath, i+1, '21-22')
    data.append(fpl_data_processing(gameweek_data, columns))

# season 22-23: only gameweeks 18-38 are read, and the files use different column names.
# The renamed gameweek_x* columns hold season sum totals here; they are turned
# into per-gameweek values later in the processing section.
renames_22_23 = {'team':'team_name', 'expected_goals':'gameweek_xG',
                 'expected_assists':'gameweek_xA', 'expected_goals_conceded':'gameweek_xGA'}
for i in range(17,38):
    filepath = Path(f'../../../season22_23/data/fpl_data/fpl_week_{i+1}.csv')
    gameweek_data = _read_gameweek(filepath, i+1, '22-23').rename(columns=renames_22_23)
    gameweek_data = gameweek_data[columns]
    data.append(gameweek_data)

# stack all weekly frames under a fresh 0..n-1 index
fpl_df = pd.concat(data, ignore_index=True)

display(fpl_df.head(), fpl_df.tail(), fpl_df.shape)

In [None]:
# FPL fixtures give the gameweek (event) each match belongs to.
# Team names in these files have been preprocessed to be compatible.

filepaths = [
    Path('../../../season20_21/data/fpl/fixtures.csv'),
    Path('../../../season21_22/data/fpl/fixtures.csv'),
    Path('../../../season22_23/data/fpl_data/fixtures.csv')
]

# one frame per season, kickoff_time parsed to datetime
fixtures = [
    pd.read_csv(filepath, index_col=0).assign(kickoff_time=lambda raw: pd.to_datetime(raw.kickoff_time))
    for filepath in filepaths
]

# filepaths and seasons are listed in the same order
for season_label, season_fixtures in zip(seasons, fixtures):
    season_fixtures['season'] = season_label
    display(season_fixtures.head())
    display(season_fixtures.shape)

In [None]:
# FBref fixtures supply the match-level xG values

fixtures_fbref = []
filepaths = [
    Path('../../../season20_21/data/fbref/fixtures20_21.csv'),
    Path('../../../season21_22/data/fbref/fixtures21_22.csv'),
    Path('../../../season22_23/data/fbref_data/fixtures22_23.csv')
]

for filepath in filepaths:
    df = pd.read_csv(filepath, index_col=0)
    df['Date'] = pd.to_datetime(df['Date'])
    # Drop rows in which every column is null. The explicit copy makes the
    # stored frame independent of the unfiltered one, so the in-place column
    # assignments done on it later are unambiguous and raise no
    # SettingWithCopyWarning.
    df = df.dropna(how='all').copy()
    fixtures_fbref.append(df)

for season_fbref in fixtures_fbref:
    display(season_fbref.head())
    display(season_fbref.shape)

# Data processing

### Fix season 22-23 xg data

In [None]:
# Season 22-23 xG/xA/xGA were loaded as running season totals; convert them to
# per-gameweek values by differencing consecutive rows of the same web_name.
# NOTE(review): this assumes web_name identifies one player within the season
# and that rows are in gameweek order -- confirm.
# NOTE(review): running this cell twice differences the values twice.
xg_columns = ['gameweek_xG', 'gameweek_xA', 'gameweek_xGA']
is_2223 = fpl_df.season == '22-23'

for xg_col in xg_columns:
    fpl_df.loc[is_2223, xg_col] = fpl_df[is_2223].groupby('web_name')[xg_col].diff()

# the first row of each player has no predecessor (diff is NaN); any missing value becomes 0
fpl_df[xg_columns] = fpl_df[xg_columns].fillna(0)

In [None]:
# negative values are floored at zero so the xG-type columns are never negative
for xg_col in ['gameweek_xG', 'gameweek_xA', 'gameweek_xGA']:
    fpl_df[xg_col] = fpl_df[xg_col].clip(lower=0)

### Process FPL data

In [None]:
# Minutes played in each gameweek: difference of the cumulative `minutes`
# within a (web_name, season) group. The first row of a group has no
# predecessor and keeps its cumulative value.
cumulative_minutes = fpl_df['minutes']
minutes_step = fpl_df.groupby(['web_name', 'season'])['minutes'].diff()
fpl_df['gameweek_minutes'] = minutes_step.fillna(cumulative_minutes)

print('Number of rows with zero minutes played in a gameweek:')
display(len(fpl_df[fpl_df.gameweek_minutes==0]))
print('Number of rows with over 90 minutes played in a gameweek:')
display(len(fpl_df[fpl_df.gameweek_minutes>90]))

In [None]:
# Keep only rows with more than 0 and at most 90 minutes in the gameweek.
# drop=True discards the old row labels instead of turning them into a
# spurious `index` column that would be carried into the exported csv; the
# fresh 0..n-1 index is what the later positional joins rely on.
played_one_match = (fpl_df.gameweek_minutes>0) & (fpl_df.gameweek_minutes<=90)
fpl_df = fpl_df[played_one_match].reset_index(drop=True)
display(fpl_df.head())
display(fpl_df.tail())
display(fpl_df.shape)

In [None]:
# share of missing values per column
fpl_df.isna().mean()

In [None]:
# spot-check: all remaining rows of one player
fpl_df[fpl_df.web_name=='Aubameyang']

### Add xG data to FPL fixtures data

In [None]:
# Map FBref team names to FPL team names, season by season.
# The mapping pairs the alphabetically sorted FBref names with the
# alphabetically sorted FPL names.
# NOTE(review): this is only right if both naming schemes sort into the same
# team order -- check the displayed dictionaries.

for season_fbref, season_fpl in zip(fixtures_fbref, fixtures):
    fbref_names = np.sort(season_fbref.Home.unique())
    fpl_names = np.sort(season_fpl.home_team.unique())

    # zip() would silently truncate and leave some teams unmapped
    if len(fbref_names) != len(fpl_names):
        raise ValueError(
            f'Cannot pair team names: {len(fbref_names)} FBref teams vs {len(fpl_names)} FPL teams'
        )

    team_name_dict = dict(zip(fbref_names, fpl_names))
    display(team_name_dict)

    # direct dict lookup: an unmapped team raises KeyError instead of becoming NaN
    season_fbref['Home'] = season_fbref['Home'].apply(lambda name: team_name_dict[name])
    season_fbref['Away'] = season_fbref['Away'].apply(lambda name: team_name_dict[name])
    display(season_fbref.head())

In [None]:
# Copy the FBref xG values onto the FPL fixtures, matching on (home team, away team)

for season_fixtures, season_fbref in zip(fixtures, fixtures_fbref):

    home_xg, away_xg = [], []
    for _, fixture in season_fixtures.iterrows():
        # the FBref row(s) of this exact pairing; the first match is used
        same_match = (season_fbref['Home']==fixture.home_team) & (season_fbref['Away']==fixture.away_team)
        home_xg.append( season_fbref.loc[same_match, 'xG_home'].values[0] )
        away_xg.append( season_fbref.loc[same_match, 'xG_away'].values[0] )

    season_fixtures['xg_home'] = home_xg
    season_fixtures['xg_away'] = away_xg

    display(season_fixtures.head())
    print('Nulls:')
    display(season_fixtures[['xg_home', 'xg_away']].isnull().sum())

### Calculate exponentially weighted moving averages for each team's xG data

In [None]:
# Reshape so every fixture yields two rows, one per team:
# `variable` says whether the team was home_team or away_team, `value` is the team name.

melt_id_vars = ['xg_home', 'xg_away', 'team_h_score', 'team_a_score', 'event', 'kickoff_time', 'id']

fixtures_melt_list = [
    season_fixtures
    .melt(id_vars=melt_id_vars, value_vars=['home_team', 'away_team'])
    .assign(season=season_label)
    for season_label, season_fixtures in zip(seasons, fixtures)
]

fixtures_melt = pd.concat(fixtures_melt_list, ignore_index=True)

display(fixtures_melt.head(), fixtures_melt.tail(), fixtures_melt.shape)


In [None]:
# xG is the xG of the row's own team, xGA the xG of its opponent in that match
is_home = fixtures_melt['variable'] == 'home_team'
fixtures_melt['xG'] = np.where(is_home, fixtures_melt['xg_home'], fixtures_melt['xg_away'])
fixtures_melt['xGA'] = np.where(is_home, fixtures_melt['xg_away'], fixtures_melt['xg_home'])

# chronological order with a fresh 0..n-1 index (the index is used below to restore row order)
fixtures_melt = fixtures_melt.sort_values(by='kickoff_time').reset_index(drop=True)

# exponentially weighted means per team, with alpha = 1/window
# NOTE(review): the average on a fixture's row includes that fixture's own xG/xGA;
# if these are meant as pre-match features they may need shifting by one match -- confirm.
rolling_windows = [5,10,20,40]

for window in rolling_windows:
    smoothed = fixtures_melt.groupby(by='value')[['xG', 'xGA']].ewm(alpha=1/window).mean()
    # result is indexed by (team, original row); drop the team level and
    # sort by original row to line it up with fixtures_melt again
    smoothed = smoothed.droplevel(0).sort_index()
    fixtures_melt[f'xG_ewm_{window}'] = smoothed['xG'].values
    fixtures_melt[f'xGA_ewm_{window}'] = smoothed['xGA'].values

fixtures_melt

In [None]:
# check what a given team's stats look like
# (`value` is the team-name column created when home_team/away_team were melted)
fixtures_melt[fixtures_melt.value=='Fulham']

### Add team data to FPL data

In [None]:
# all seasons' fixtures in one frame (one row per match) under a fresh 0..n-1 index
fixtures_df = pd.concat(fixtures, ignore_index=True)
fixtures_df

In [None]:
# columns to be fetched from team data
col_names = [f'xG_ewm_{i}' for i in rolling_windows]
col_names += [f'xGA_ewm_{i}' for i in rolling_windows]
nr_cols = len(col_names)


def _team_and_opponent_stats(team, gameweek, season):
    """Return (team stats, opponent stats) for the team's fixture in one gameweek.

    Both items are 1-d arrays ordered like `col_names`. Returns None when the
    team does not have exactly one fixture in that gameweek of that season.
    """
    games = fixtures_melt[(fixtures_melt.value==team) & (fixtures_melt.event==gameweek) & (fixtures_melt.season==season)]
    if games.shape[0]!=1:
        return None

    home_game = games.variable.values[0]=='home_team'
    game_id = games.id.values[0]
    # the opponent is the team on the other side of the same fixture
    own_side, other_side = ('home_team', 'away_team') if home_game else ('away_team', 'home_team')
    opponent_team = fixtures_df.loc[(fixtures_df[own_side]==team) & (fixtures_df.event==gameweek) & (fixtures_df.season==season), other_side].values[0]
    opponent_games = fixtures_melt[(fixtures_melt.value==opponent_team) & (fixtures_melt.event==gameweek) & (fixtures_melt.season==season) & (fixtures_melt.id==game_id)]

    return games[col_names].values.flatten(), opponent_games[col_names].values.flatten()


# Every player of a team shares the same (team, gameweek, season) lookup, so
# each distinct combination is resolved once and reused.
stats_cache = {}
missing_stats = np.full(nr_cols, np.nan)

team_data = []
opponent_data = []
count_non_one_games = 0
for key in zip(fpl_df.team_name, fpl_df.gameweek, fpl_df.season):
    if key not in stats_cache:
        stats_cache[key] = _team_and_opponent_stats(*key)
    stats = stats_cache[key]

    if stats is None:
        # zero or several fixtures in the gameweek: stats cannot be attributed
        team_data.append( missing_stats )
        opponent_data.append( missing_stats )
        count_non_one_games += 1
    else:
        team_stats, opponent_stats = stats
        team_data.append( team_stats )
        opponent_data.append( opponent_stats )

# The new frames carry fpl_df's own index so the join aligns row by row
# whatever that index is.
new_col_names = ['team_'+col for col in col_names]
team_data_df = pd.DataFrame(team_data, columns=new_col_names, index=fpl_df.index)
new_oppo_col_names = ['opponent_'+col for col in col_names]
opponent_data_df = pd.DataFrame(opponent_data, columns=new_oppo_col_names, index=fpl_df.index)

fpl_df = fpl_df.join([team_data_df, opponent_data_df])

display(fpl_df.head())
display(fpl_df.tail())
display(fpl_df.shape)
print(f'Number of non-one-games: {count_non_one_games}')


In [None]:
# number of missing values per column
fpl_df.isna().sum()

In [None]:
# Remove rows that have no opponent stats (the rows given NaN placeholders
# because the team did not have exactly one game in the gameweek), then
# renumber the rows 0..n-1.
fpl_df = fpl_df.dropna(subset=['opponent_xGA_ewm_5']).reset_index(drop=True)

In [None]:
# share of missing values per column
fpl_df.isna().mean()

### FPL moving averages

In [None]:
# Per-gameweek stats derived from the cumulative season columns: difference to
# the previous row of the same (web_name, season); the first row of each group
# keeps the cumulative value.
# NOTE(review): rows have already been removed from fpl_df by this point
# (gameweeks with 0 or more than 90 minutes, gameweeks without exactly one
# fixture). Whatever a player accumulated in a removed gameweek is therefore
# folded into the next remaining row. The first remaining row of a
# player-season receives the full cumulative total, which matters for 22-23,
# where loading starts at gameweek 18. Consider computing these differences
# before any rows are filtered -- confirm.

diff_columns = ['assists', 'bps', 'creativity', 'goals_scored', 'goals_conceded', 'saves', 'threat']

for stat in diff_columns:
    cumulative = fpl_df[stat]
    step = fpl_df.groupby(['web_name', 'season'])[stat].diff()
    fpl_df[f'gameweek_{stat}'] = step.fillna(cumulative)

In [None]:
# sanity check: cumulative goals next to the derived per-gameweek goals for one player
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['goals_scored', 'gameweek_goals_scored']]

In [None]:
# Exponentially weighted means of the per-gameweek stats, per web_name
# (groups are not split by season) and per window, with alpha = 1/window.
# NOTE(review): each average includes the row's own gameweek value -- confirm
# this is intended if the columns are used as pre-match features.

ewm_columns = ['gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points',
               'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves',
               'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA', 'gameweek_minutes']

for window in rolling_windows:
    new_columns = [f'{col}_ewm_{window}' for col in ewm_columns]
    smoothed = fpl_df.groupby('web_name')[ewm_columns].ewm(alpha=1/window).mean()
    # back to fpl_df's row order: drop the web_name level, sort by original row label
    smoothed = smoothed.droplevel(0).sort_index()
    fpl_df[new_columns] = smoothed[ewm_columns].values

display(fpl_df.head(), fpl_df.shape)

In [None]:
# sanity check: one player's per-gameweek goals next to their exponentially
# weighted averages for each of the four windows
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['gameweek_goals_scored', 'gameweek_goals_scored_ewm_5', 'gameweek_goals_scored_ewm_10', 
                                           'gameweek_goals_scored_ewm_20', 'gameweek_goals_scored_ewm_40']]

In [None]:
# write the modelling table to csv (to_csv also writes the row index by default)
filepath = Path('../../data/modelling/fpl_df.csv')
fpl_df.to_csv(filepath)

### 