In [1]:
shift_param = 1

In [2]:
import pandas as pd
import numpy as np

from pathlib import Path
import os
import datetime as dt
import pickle

from sklearn.linear_model import LogisticRegression
from scipy.stats import poisson

import matplotlib.pyplot as plt
import plotly.express as px

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [3]:
# model for estimating bonus points based on gameweek bps
model_path = Path(f"../../models/logistic_regression_for_bonus_points.pkl")
with open(model_path, "rb") as f:
    clf = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Functions

In [4]:
def fpl_data_processing(df, columns):

    xg_data = []
    xa_data = []
    xga_data = []
    for ix, row in df.iterrows():
        my_gameweek = row['gameweek']
        xg_data.append( row[f'xG_week{my_gameweek}'] )
        xa_data.append( row[f'xA_week{my_gameweek}'] )
        xga_data.append( row[f'xGA_week{my_gameweek}'] )

    df['gameweek_xG'] = xg_data
    df['gameweek_xA'] = xa_data
    df['gameweek_xGA'] = xga_data

    df_new = df[columns].copy()

    return df_new

In [5]:
def my_fill_na(x, gameweek_col, diff_col):
    '''Fill nan values for first items for grouped variables where diff is calculated. But also don't fill for season 22-23,
    where data is missing for a number of weeks at the beginning of the season.'''
    my_value = x[diff_col] if (np.isnan(x[gameweek_col])) & (x['minutes']<=90) else x[gameweek_col]
    return my_value
    

In [6]:
def calculate_xPoints(x,clf):
    """Expected points for a given gameweek given underlying stats for that gameweek."""

    clean_sheet_points = np.array([4,4,1,0])
    goal_points = np.array([6,6,5,4])

    # calculate expexted points
    points_played = np.array([1 if x['gameweek_minutes']>0 else 0])
    points_played_over_60 = np.array([1 if x['gameweek_minutes']>=60 else 0])
    points_xG = goal_points[x['element_type']-1] * x['gameweek_xG']
    points_xA = x['gameweek_xA'] * 3
    clean_sheet_probability = np.array(poisson.pmf(0,x['team_xGA']))
    points_clean_sheet = [clean_sheet_points[x['element_type']-1] * clean_sheet_probability if x['gameweek_minutes']>=60 else 0]
    points_saves = x['gameweek_saves'] // 3
    points_penalty_saves = x['gameweek_penalties_saved'] * 5 * 0.21 #points for save times approx. probability of penalty save
    #penalty_for_penalty_miss = x['Performance_PKatt'] * (-2*0.21) # this data only on fbref
    # estimate bonus points
    if not np.isnan(x['gameweek_bps']):
        y_pred_prob = clf.predict_proba(np.array(x['gameweek_bps']).reshape(-1, 1))
    else:
        # return nan if bonus points can't be estimated 
        return np.nan
    points_bonus = np.matmul(y_pred_prob, np.array([0,1,2,3]).reshape((4,1)))
    
    # penalty for possible points deductions based on goals conceded
    xGA = x['team_xGA']
    # calculate penalty
    xGA_conceded_penalty = -(poisson.pmf(2,xGA)+poisson.pmf(3,xGA))-(poisson.pmf(4,xGA)+poisson.pmf(5,xGA))-(poisson.pmf(6,xGA)+poisson.pmf(7,xGA))-(poisson.pmf(8,xGA)+poisson.pmf(9,xGA)-(poisson.pmf(10,xGA)+poisson.pmf(11,xGA)))
    # apply penalty only to GK and DEF
    if (x['element_type']==1) | (x['element_type']==2):
        xGA_conceded_penalty = xGA_conceded_penalty
    else:
        xGA_conceded_penalty = 0
    # scale penalty with playing time
    xGA_conceded_penalty = (x['gameweek_minutes'] / 90) * xGA_conceded_penalty

    penalty_for_cards = [-3 if x['gameweek_red_cards']==1 else -1 if x['gameweek_yellow_cards']==1 else 0]
    penalty_for_own_goal = -2 * x['gameweek_own_goals']

    # add up all point components
    total_points = float(points_played + points_played_over_60 + points_xG + points_xA + points_clean_sheet + points_saves +\
                    points_penalty_saves + points_bonus + xGA_conceded_penalty +\
                    penalty_for_cards + penalty_for_own_goal)
    
    return total_points

# Fetch data

In [7]:
# name all seasons for which data is fetched
seasons = ['20-21', '21-22', '22-23']

# which columns to get for each season
columns = ['assists', 'bonus', 'bps', 'clean_sheets', 'corners_and_indirect_freekicks_order', 'creativity', 'creativity_rank', 
                 'creativity_rank_type', 'direct_freekicks_order', 'dreamteam_count', 'element_type', 'event_points', 'first_name', 
                 'goals_conceded', 'goals_scored', 'ict_index', 'ict_index_rank', 'ict_index_rank_type', 'influence', 'influence_rank',
                 'influence_rank_type', 'minutes', 'now_cost', 'own_goals', 'penalties_missed', 'penalties_order', 'penalties_saved',
                 'points_per_game', 'red_cards', 'saves', 'second_name', 'selected_by_percent', 'threat', 'threat_rank', 'threat_rank_type',
                 'total_points', 'web_name', 'yellow_cards', 'team_name', 'gameweek', 'season', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA']

In [8]:
data = []

# all data for season 20-21
for i in range(38):
    filepath = Path(f'../../../season20_21/data/fpl/data_week{i+1}.csv')
    gameweek_data = pd.read_csv(filepath, index_col=0)
    gameweek_data['gameweek'] = i+1
    gameweek_data['season'] = '20-21'
    gameweek_data = fpl_data_processing(gameweek_data, columns)
    data.append(gameweek_data)

# all data for season 21-22
for i in range(38):
    filepath = Path(f'../../../season21_22/data/fpl/data_week{i+1}.csv')
    gameweek_data = pd.read_csv(filepath, index_col=0)
    gameweek_data['gameweek'] = i+1
    gameweek_data['season'] = '21-22'
    gameweek_data = fpl_data_processing(gameweek_data, columns)
    data.append(gameweek_data)

# all data for season 22-23
for i in range(17,38):
    filepath = Path(f'../../../season22_23/data/fpl_data/fpl_week_{i+1}.csv')
    gameweek_data = pd.read_csv(filepath, index_col=0)
    gameweek_data['gameweek'] = i+1
    gameweek_data['season'] = '22-23'
    # the gameweek variables are here season sum totals, will be fixed in processing
    gameweek_data = gameweek_data.rename(columns={'team':'team_name', 'expected_goals':'gameweek_xG', 
                                                  'expected_assists':'gameweek_xA', 'expected_goals_conceded':'gameweek_xGA'})
    gameweek_data = gameweek_data[columns]
    data.append(gameweek_data)

fpl_df = pd.concat(data, ignore_index=True)

display(fpl_df.head())
display(fpl_df.tail())
display(fpl_df.shape)

Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA
0,0,0,0,0,4.0,0.0,511,202,,0,3,0,Mesut,0,0,0.0,511,202,0.0,511,202,0,70,0,0,,0,0.0,0,0,Özil,1.1,0.0,507,199,0,Özil,0,Arsenal,1,20-21,,,
1,0,0,0,0,,0.0,348,149,,0,2,0,Sokratis,0,0,0.0,361,150,0.0,354,150,0,50,0,0,,0,0.0,0,0,Papastathopoulos,0.1,0.0,306,138,0,Sokratis,0,Arsenal,1,20-21,,,
2,0,0,0,0,,0.0,431,178,4.0,0,2,0,David,0,0,0.0,432,178,0.0,427,178,0,55,0,0,,0,0.0,0,0,Luiz Moreira Marinho,0.8,0.0,408,174,0,David Luiz,0,Arsenal,1,20-21,,,
3,0,0,19,1,,15.3,49,33,6.0,0,3,7,Pierre-Emerick,0,1,10.6,7,4,36.6,23,8,90,120,0,0,1.0,0,7.0,0,0,Aubameyang,46.3,54.0,8,4,7,Aubameyang,1,Arsenal,1,20-21,0.4,0.0,0.2
4,0,0,0,0,,0.0,342,147,,0,2,0,Cédric,0,0,0.0,356,148,0.0,349,148,0,49,0,0,,0,0.0,0,0,Soares,0.5,0.0,299,136,0,Cédric,0,Arsenal,1,20-21,,,


Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA
65335,0,6,122,5,,95.1,304,46,,0,4,0,Matheus,12,2,45.1,307,44,143.0,355,41,961,53,0,0,,0,2.3,0,0,Santos Carneiro Da Cunha,0.1,217.0,175,46,39,Cunha,1,Wolves,38,22-23,1.72,0.17,16.76
65336,1,0,196,5,,66.3,346,179,,0,3,2,Mario,28,0,31.8,360,162,216.4,298,124,1297,45,0,0,,0,1.8,1,0,Lemina,0.0,36.0,389,189,35,Lemina,2,Wolves,38,22-23,0.25,0.46,27.88
65337,0,3,57,3,2.0,184.7,193,121,4.0,0,3,0,Pablo,9,1,43.0,316,144,92.0,394,166,764,49,0,0,,0,2.1,0,0,Sarabia,0.1,157.0,212,124,27,Sarabia,3,Wolves,38,22-23,2.99,0.89,13.37
65338,0,0,45,0,,0.0,562,41,,0,1,0,Daniel,3,0,7.0,467,33,70.2,419,33,180,40,0,0,,0,3.0,0,11,Bentley,0.0,0.0,524,32,6,Bentley,0,Wolves,38,22-23,0.0,0.0,4.66
65339,0,2,84,2,,74.3,335,174,,0,3,2,João Victor,18,1,25.8,390,173,139.8,358,146,649,45,0,0,,0,1.9,0,0,Gomes da Silva,0.0,46.0,371,180,21,João Gomes,5,Wolves,38,22-23,0.3,0.11,14.35


(65340, 44)

In [9]:
# FPL fixtures needed to get correct gameweeks for each match
# team names have been preprocessed to be compatible

fixtures = []
filepaths = [
    Path(f'../../../season20_21/data/fpl/fixtures.csv'),
    Path(f'../../../season21_22/data/fpl/fixtures.csv'),
    Path(f'../../../season22_23/data/fpl_data/fixtures.csv')
]

for filepath in filepaths:
    df = pd.read_csv(filepath, index_col=0)
    df['kickoff_time'] = pd.to_datetime(df.kickoff_time)
    fixtures.append(df)
    
for i in range(len(filepaths)):
    fixtures[i]['season'] = seasons[i]
    display(fixtures[i].head())
    display(fixtures[i].shape)

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season
0,2128288,1,True,True,2,2020-09-12 11:30:00+00:00,90,False,True,1,3,8,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,2,58898,Arsenal,Fulham,20-21
1,2128287,1,True,True,1,2020-09-12 14:00:00+00:00,90,False,True,16,0,6,1,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,3,58897,Southampton,Crystal Palace,20-21
2,2128289,1,True,True,3,2020-09-12 16:30:00+00:00,90,False,True,10,3,11,4,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,3,58899,Leeds,Liverpool,20-21
3,2128293,1,True,True,6,2020-09-12 19:00:00+00:00,90,False,True,14,2,19,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,58903,Newcastle,West Ham,20-21
4,2128292,1,True,True,5,2020-09-13 13:00:00+00:00,90,False,True,9,3,18,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,58902,Leicester,West Brom,20-21


(380, 20)

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season
0,2210271,1,True,True,1,2021-08-13 19:00:00+00:00,90,False,True,1,0,3,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",4,2,66342,Arsenal,Brentford,21-22
1,2210276,1,True,True,6,2021-08-14 11:30:00+00:00,90,False,True,10,1,13,5,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,66347,Leeds,Man Utd,21-22
2,2210272,1,True,True,2,2021-08-14 14:00:00+00:00,90,False,True,4,2,5,1,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,66343,Brighton,Burnley,21-22
3,2210273,1,True,True,3,2021-08-14 14:00:00+00:00,90,False,True,7,0,6,3,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,4,66344,Crystal Palace,Chelsea,21-22
4,2210274,1,True,True,4,2021-08-14 14:00:00+00:00,90,False,True,16,1,8,3,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,66345,Southampton,Everton,21-22


(380, 20)

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,home_team,away_team,season
0,2292810,1,True,True,1,2022-08-05 19:00:00+00:00,90,False,True,1,2,7,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,74911,Crystal Palace,Arsenal,22-23
1,2292813,1,True,True,4,2022-08-06 11:30:00+00:00,90,False,True,12,2,9,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,74914,Fulham,Liverpool,22-23
2,2292811,1,True,True,2,2022-08-06 14:00:00+00:00,90,False,True,2,0,3,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",3,2,74912,Bournemouth,Aston Villa,22-23
3,2292814,1,True,True,5,2022-08-06 14:00:00+00:00,90,False,True,20,1,11,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,74915,Leeds,Wolves,22-23
4,2292816,1,True,True,7,2022-08-06 14:00:00+00:00,90,False,True,16,0,15,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,4,74917,Newcastle,Nott'm Forest,22-23


(380, 20)

In [10]:
# FBREF data needed for xg values

fixtures_fbref = []
filepaths = [
    Path(f'../../../season20_21/data/fbref/fixtures20_21.csv'),
    Path(f'../../../season21_22/data/fbref/fixtures21_22.csv'),
    Path(f'../../../season22_23/data/fbref_data/fixtures22_23.csv')
]

for filepath in filepaths:
    df = pd.read_csv(filepath, index_col=0)
    df['Date'] = pd.to_datetime(df['Date'])
    #drop null rows
    df = df[~df.isnull().all(axis=1)] 
    fixtures_fbref.append(df)

for i in range(len(filepaths)):
    display(fixtures_fbref[i].head())
    display(fixtures_fbref[i].shape)

Unnamed: 0,Wk,Day,Date,Time,Home,xG_home,Score,xG_away,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Sat,2020-09-12,12:30,Fulham,0.2,0–3,1.8,Arsenal,,Craven Cottage,Chris Kavanagh,Match Report,
1,1.0,Sat,2020-09-12,15:00,Crystal Palace,0.7,1–0,0.8,Southampton,,Selhurst Park,Jonathan Moss,Match Report,
2,1.0,Sat,2020-09-12,17:30,Liverpool,3.3,4–3,0.6,Leeds United,,Anfield,Michael Oliver,Match Report,
3,1.0,Sat,2020-09-12,20:00,West Ham,1.1,0–2,1.5,Newcastle Utd,,London Stadium,Stuart Attwell,Match Report,
4,1.0,Sun,2020-09-13,14:00,West Brom,0.5,0–3,2.2,Leicester City,,The Hawthorns,Anthony Taylor,Match Report,


(380, 14)

Unnamed: 0,Wk,Day,Date,Time,Home,xG_home,Score,xG_away,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2021-08-13,20:00 (22:00),Brentford,1.3,2–0,1.4,Arsenal,16479.0,Brentford Community Stadium,Michael Oliver,Match Report,
1,1.0,Sat,2021-08-14,12:30 (14:30),Manchester Utd,1.5,5–1,0.6,Leeds United,72732.0,Old Trafford,Paul Tierney,Match Report,
2,1.0,Sat,2021-08-14,15:00 (17:00),Leicester City,0.8,1–0,1.1,Wolves,31983.0,King Power Stadium,Craig Pawson,Match Report,
3,1.0,Sat,2021-08-14,15:00 (17:00),Burnley,1.0,1–2,1.3,Brighton,16910.0,Turf Moor,David Coote,Match Report,
4,1.0,Sat,2021-08-14,15:00 (17:00),Chelsea,0.9,3–0,0.3,Crystal Palace,38965.0,Stamford Bridge,Jonathan Moss,Match Report,


(380, 14)

Unnamed: 0,Wk,Day,Date,Time,Home,xG_home,Score,xG_away,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Match Report,
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,Match Report,
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,61732.0,Tottenham Hotspur Stadium,Andre Marriner,Match Report,
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,52245.0,St James' Park,Simon Hooper,Match Report,
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,Match Report,


(380, 14)

# Data processing

### Fix season 22-23 xg data

In [11]:
# fix season 22-23 xg data
fpl_df.loc[fpl_df.season=='22-23', 'gameweek_xG'] = fpl_df[fpl_df.season=='22-23'].groupby('web_name')['gameweek_xG'].diff()
fpl_df.loc[fpl_df.season=='22-23', 'gameweek_xA'] = fpl_df[fpl_df.season=='22-23'].groupby('web_name')['gameweek_xA'].diff()
fpl_df.loc[fpl_df.season=='22-23', 'gameweek_xGA'] = fpl_df[fpl_df.season=='22-23'].groupby('web_name')['gameweek_xGA'].diff()

fpl_df[['gameweek_xG', 'gameweek_xA', 'gameweek_xGA']] = fpl_df[['gameweek_xG', 'gameweek_xA', 'gameweek_xGA']].fillna(0)

In [12]:
# make sure xg data is always non-negative
fpl_df['gameweek_xG'] = fpl_df['gameweek_xG'].apply(lambda x: np.max((0,x)))
fpl_df['gameweek_xA'] = fpl_df['gameweek_xA'].apply(lambda x: np.max((0,x)))
fpl_df['gameweek_xGA'] = fpl_df['gameweek_xGA'].apply(lambda x: np.max((0,x)))

### Process FPL data

only use data for players that played, i.e., had positive minutes in a given gameweek

In [13]:
# find how many minutes a player played on a given gameweek
fpl_df['gameweek_minutes'] = fpl_df.groupby(['web_name', 'season'])['minutes'].diff()
# fill na caused at the start of each season by taking diff (but don't fill for season 22-23 where early season data is missing)
fpl_df['gameweek_minutes'] = fpl_df.apply(lambda x: my_fill_na(x, 'gameweek_minutes', 'minutes'), axis=1)
print('Number of rows with zero minutes played in a gameweek:')
display(fpl_df[fpl_df.gameweek_minutes==0].shape[0])
print('Number of rows with over 90 minutes played in a gameweek:')
display(fpl_df[fpl_df.gameweek_minutes>90].shape[0])

Number of rows with zero minutes played in a gameweek:


35674

Number of rows with over 90 minutes played in a gameweek:


3868

In [14]:
# drop rows with 0 minutes or more than 90 minutes
fpl_df = fpl_df[(fpl_df.gameweek_minutes>0) & (fpl_df.gameweek_minutes<=90)].reset_index(drop=True)
display(fpl_df.head())
display(fpl_df.tail())
display(fpl_df.shape)

Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes
0,0,0,19,1,,15.3,49,33,6.0,0,3,7,Pierre-Emerick,0,1,10.6,7,4,36.6,23,8,90,120,0,0,1.0,0,7.0,0,0,Aubameyang,46.3,54.0,8,4,7,Aubameyang,1,Arsenal,1,20-21,0.4,0.0,0.2,90.0
1,0,1,29,1,,12.7,62,8,5.0,0,4,7,Alexandre,0,1,9.9,14,4,38.6,20,4,86,85,0,0,3.0,0,7.0,0,0,Lacazette,4.9,48.0,11,7,7,Lacazette,0,Arsenal,1,20-21,0.5,0.2,0.2,86.0
2,0,1,29,1,,0.0,370,44,,0,1,7,Bernd,0,0,1.4,169,10,14.0,96,10,90,50,0,0,,0,7.0,0,2,Leno,9.1,0.0,333,44,7,Leno,0,Arsenal,1,20-21,0.0,0.0,0.2,90.0
3,0,0,11,1,,17.2,42,28,2.0,0,3,3,Granit,0,0,4.5,57,29,6.8,150,61,77,55,0,0,,0,3.0,0,0,Xhaka,0.9,21.0,43,20,3,Xhaka,0,Arsenal,1,20-21,0.2,0.1,0.2,77.0
4,0,0,23,1,,26.4,22,7,,0,2,5,Héctor,0,0,4.0,71,21,9.8,128,58,90,50,0,0,,0,5.0,0,0,Bellerín,4.9,4.0,96,24,5,Bellerín,1,Arsenal,1,20-21,0.0,0.2,0.2,90.0


Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes
22471,1,3,239,3,5.0,269.2,136,26,,0,2,0,Hugo,29,0,57.2,263,86,224.0,291,115,1307,38,0,0,,0,2.0,0,0,Bueno López,3.6,76.0,314,101,41,Bueno,1,Wolves,38,22-23,0.0,0.11,1.27,45.0
22472,0,0,17,0,,17.4,427,213,,0,3,1,Joseph,5,0,6.1,471,213,14.4,482,215,172,43,0,0,,0,0.8,0,0,Hodge,0.0,30.0,397,193,5,Hodge,1,Wolves,38,22-23,0.16,0.01,0.03,5.0
22473,1,5,338,11,6.0,406.6,83,66,,0,3,1,Matheus Luiz,38,1,109.3,113,74,371.4,195,80,2467,48,0,0,,0,2.4,1,0,Nunes,0.1,314.0,118,75,80,Matheus,2,Wolves,38,22-23,0.04,0.15,1.27,45.0
22474,1,0,196,5,,66.3,346,179,,0,3,2,Mario,28,0,31.8,360,162,216.4,298,124,1297,45,0,0,,0,1.8,1,0,Lemina,0.0,36.0,389,189,35,Lemina,2,Wolves,38,22-23,0.0,0.0,2.39,90.0
22475,0,2,84,2,,74.3,335,174,,0,3,2,João Victor,18,1,25.8,390,173,139.8,358,146,649,45,0,0,,0,1.9,0,0,Gomes da Silva,0.0,46.0,371,180,21,João Gomes,5,Wolves,38,22-23,0.0,0.01,2.37,84.0


(22476, 45)

In [15]:
fpl_df.isnull().sum() / fpl_df.shape[0]

assists                                 0.000000
bonus                                   0.000000
bps                                     0.000000
clean_sheets                            0.000000
corners_and_indirect_freekicks_order    0.770867
creativity                              0.000000
creativity_rank                         0.000000
creativity_rank_type                    0.000000
direct_freekicks_order                  0.793825
dreamteam_count                         0.000000
element_type                            0.000000
event_points                            0.000000
first_name                              0.000000
goals_conceded                          0.000000
goals_scored                            0.000000
ict_index                               0.000000
ict_index_rank                          0.000000
ict_index_rank_type                     0.000000
influence                               0.000000
influence_rank                          0.000000
influence_rank_type 

In [16]:
fpl_df[fpl_df.web_name=='Aubameyang']

Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes
0,0,0,19,1,,15.3,49,33,6.0,0,3,7,Pierre-Emerick,0,1,10.6,7,4,36.6,23,8,90,120,0,0,1.0,0,7.0,0,0,Aubameyang,46.3,54.0,8,4,7,Aubameyang,1,Arsenal,1,20-21,0.4,0.0,0.2,90.0
222,1,0,39,1,,51.7,19,15,6.0,0,3,5,Pierre-Emerick,1,1,18.5,13,9,64.6,31,15,180,119,0,0,1.0,0,6.0,0,0,Aubameyang,32.9,69.0,16,6,12,Aubameyang,1,Arsenal,2,20-21,0.1,0.5,1.9,90.0
482,1,0,44,1,,52.6,39,27,6.0,0,3,2,Pierre-Emerick,4,1,19.4,35,19,68.8,59,25,270,119,0,0,1.0,0,4.7,0,0,Aubameyang,29.2,73.0,29,14,14,Aubameyang,1,Arsenal,3,20-21,0.0,0.0,3.0,90.0
741,1,0,54,1,,57.2,60,44,6.0,0,3,2,Pierre-Emerick,5,1,23.8,41,22,78.2,69,26,360,119,0,0,1.0,0,4.0,0,0,Aubameyang,24.3,103.0,30,13,16,Aubameyang,1,Arsenal,4,20-21,0.0,0.0,0.2,90.0
995,1,0,61,1,,87.1,45,34,6.0,0,3,2,Pierre-Emerick,6,1,27.9,43,23,86.8,87,30,450,118,0,0,1.0,0,3.6,0,0,Aubameyang,17.3,105.0,38,19,18,Aubameyang,1,Arsenal,5,20-21,0.0,0.5,1.4,90.0
1248,1,0,73,1,,103.1,41,31,6.0,0,3,2,Pierre-Emerick,7,1,31.4,50,27,93.4,101,37,540,117,0,0,1.0,0,3.3,0,0,Aubameyang,15.2,117.0,41,22,20,Aubameyang,1,Arsenal,6,20-21,0.3,0.1,1.2,90.0
1502,1,1,104,2,,119.5,39,30,6.0,0,3,8,Pierre-Emerick,7,2,39.0,39,22,127.4,71,26,626,117,0,0,1.0,0,4.0,0,0,Aubameyang,13.8,143.0,41,22,28,Aubameyang,2,Arsenal,7,20-21,0.9,0.1,0.3,86.0
1757,1,1,111,2,,147.8,32,25,6.0,0,3,2,Pierre-Emerick,10,2,43.2,39,21,133.4,79,27,716,117,0,0,1.0,0,3.8,0,0,Aubameyang,13.3,151.0,44,25,30,Aubameyang,2,Arsenal,8,20-21,0.0,0.2,1.8,90.0
2005,1,1,115,3,,162.2,33,25,,0,3,3,Pierre-Emerick,10,2,49.1,41,23,139.0,96,33,806,116,0,0,1.0,0,3.7,0,0,Aubameyang,11.9,190.0,38,21,33,Aubameyang,2,Arsenal,9,20-21,0.3,0.0,2.5,90.0
2259,1,1,117,3,,163.5,37,27,,0,3,2,Pierre-Emerick,12,2,55.4,38,23,139.6,117,42,896,116,0,0,1.0,0,3.5,0,0,Aubameyang,10.4,251.0,31,15,35,Aubameyang,2,Arsenal,10,20-21,0.4,0.0,1.4,90.0


In [17]:
fpl_df.team_name.nunique()

25

### Add xG data to FPL fixtures data

In [18]:
# map fixtures team names to fpl team names

for i in range(len(fixtures)):
    # map fbref team names to fpl team names
    team_name_dict = dict(zip(np.sort(fixtures[i].home_team.unique()), np.sort(fpl_df.loc[fpl_df.season==seasons[i], 'team_name'].unique())))
    display(team_name_dict)

    fixtures[i]['home_team'] = fixtures[i]['home_team'].apply(lambda x: team_name_dict[x])
    fixtures[i]['away_team'] = fixtures[i]['away_team'].apply(lambda x: team_name_dict[x])
    display(fixtures[i].head())

{'Arsenal': 'Arsenal',
 'Aston Villa': 'Aston Villa',
 'Brighton': 'Brighton',
 'Burnley': 'Burnley',
 'Chelsea': 'Chelsea',
 'Crystal Palace': 'Crystal Palace',
 'Everton': 'Everton',
 'Fulham': 'Fulham',
 'Leeds': 'Leeds United',
 'Leicester': 'Leicester City',
 'Liverpool': 'Liverpool',
 'Man City': 'Manchester City',
 'Man Utd': 'Manchester Utd',
 'Newcastle': 'Newcastle Utd',
 'Sheffield Utd': 'Sheffield Utd',
 'Southampton': 'Southampton',
 'Spurs': 'Tottenham',
 'West Brom': 'West Brom',
 'West Ham': 'West Ham',
 'Wolves': 'Wolves'}

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season
0,2128288,1,True,True,2,2020-09-12 11:30:00+00:00,90,False,True,1,3,8,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,2,58898,Arsenal,Fulham,20-21
1,2128287,1,True,True,1,2020-09-12 14:00:00+00:00,90,False,True,16,0,6,1,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,3,58897,Southampton,Crystal Palace,20-21
2,2128289,1,True,True,3,2020-09-12 16:30:00+00:00,90,False,True,10,3,11,4,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,3,58899,Leeds United,Liverpool,20-21
3,2128293,1,True,True,6,2020-09-12 19:00:00+00:00,90,False,True,14,2,19,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,58903,Newcastle Utd,West Ham,20-21
4,2128292,1,True,True,5,2020-09-13 13:00:00+00:00,90,False,True,9,3,18,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,58902,Leicester City,West Brom,20-21


{'Arsenal': 'Arsenal',
 'Aston Villa': 'Aston Villa',
 'Brentford': 'Brentford',
 'Brighton': 'Brighton',
 'Burnley': 'Burnley',
 'Chelsea': 'Chelsea',
 'Crystal Palace': 'Crystal Palace',
 'Everton': 'Everton',
 'Leeds': 'Leeds United',
 'Leicester': 'Leicester City',
 'Liverpool': 'Liverpool',
 'Man City': 'Manchester City',
 'Man Utd': 'Manchester Utd',
 'Newcastle': 'Newcastle Utd',
 'Norwich': 'Norwich City',
 'Southampton': 'Southampton',
 'Spurs': 'Tottenham',
 'Watford': 'Watford',
 'West Ham': 'West Ham',
 'Wolves': 'Wolves'}

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season
0,2210271,1,True,True,1,2021-08-13 19:00:00+00:00,90,False,True,1,0,3,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",4,2,66342,Arsenal,Brentford,21-22
1,2210276,1,True,True,6,2021-08-14 11:30:00+00:00,90,False,True,10,1,13,5,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,66347,Leeds United,Manchester Utd,21-22
2,2210272,1,True,True,2,2021-08-14 14:00:00+00:00,90,False,True,4,2,5,1,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,66343,Brighton,Burnley,21-22
3,2210273,1,True,True,3,2021-08-14 14:00:00+00:00,90,False,True,7,0,6,3,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,4,66344,Crystal Palace,Chelsea,21-22
4,2210274,1,True,True,4,2021-08-14 14:00:00+00:00,90,False,True,16,1,8,3,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,66345,Southampton,Everton,21-22


{'Arsenal': 'Arsenal',
 'Aston Villa': 'Aston Villa',
 'Bournemouth': 'Bournemouth',
 'Brentford': 'Brentford',
 'Brighton': 'Brighton',
 'Chelsea': 'Chelsea',
 'Crystal Palace': 'Crystal Palace',
 'Everton': 'Everton',
 'Fulham': 'Fulham',
 'Leeds': 'Leeds United',
 'Leicester': 'Leicester City',
 'Liverpool': 'Liverpool',
 'Man City': 'Manchester City',
 'Man Utd': 'Manchester Utd',
 'Newcastle': 'Newcastle Utd',
 "Nott'm Forest": 'Nottingham Forest',
 'Southampton': 'Southampton',
 'Spurs': 'Tottenham',
 'West Ham': 'West Ham',
 'Wolves': 'Wolves'}

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,home_team,away_team,season
0,2292810,1,True,True,1,2022-08-05 19:00:00+00:00,90,False,True,1,2,7,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,74911,Crystal Palace,Arsenal,22-23
1,2292813,1,True,True,4,2022-08-06 11:30:00+00:00,90,False,True,12,2,9,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,74914,Fulham,Liverpool,22-23
2,2292811,1,True,True,2,2022-08-06 14:00:00+00:00,90,False,True,2,0,3,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",3,2,74912,Bournemouth,Aston Villa,22-23
3,2292814,1,True,True,5,2022-08-06 14:00:00+00:00,90,False,True,20,1,11,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,74915,Leeds United,Wolves,22-23
4,2292816,1,True,True,7,2022-08-06 14:00:00+00:00,90,False,True,16,0,15,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,4,74917,Newcastle Utd,Nottingham Forest,22-23


In [19]:
# map fbref team names to fixtures team names

for i in range(len(fixtures_fbref)):
    # map fbref team names to fpl team names
    team_name_dict = dict(zip(np.sort(fixtures_fbref[i].Home.unique()), np.sort(fixtures[i].home_team.unique())))
    display(team_name_dict)

    fixtures_fbref[i]['Home'] = fixtures_fbref[i]['Home'].apply(lambda x: team_name_dict[x])
    fixtures_fbref[i]['Away'] = fixtures_fbref[i]['Away'].apply(lambda x: team_name_dict[x])
    display(fixtures_fbref[i].head())

{'Arsenal': 'Arsenal',
 'Aston Villa': 'Aston Villa',
 'Brighton': 'Brighton',
 'Burnley': 'Burnley',
 'Chelsea': 'Chelsea',
 'Crystal Palace': 'Crystal Palace',
 'Everton': 'Everton',
 'Fulham': 'Fulham',
 'Leeds United': 'Leeds United',
 'Leicester City': 'Leicester City',
 'Liverpool': 'Liverpool',
 'Manchester City': 'Manchester City',
 'Manchester Utd': 'Manchester Utd',
 'Newcastle Utd': 'Newcastle Utd',
 'Sheffield Utd': 'Sheffield Utd',
 'Southampton': 'Southampton',
 'Tottenham': 'Tottenham',
 'West Brom': 'West Brom',
 'West Ham': 'West Ham',
 'Wolves': 'Wolves'}

Unnamed: 0,Wk,Day,Date,Time,Home,xG_home,Score,xG_away,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Sat,2020-09-12,12:30,Fulham,0.2,0–3,1.8,Arsenal,,Craven Cottage,Chris Kavanagh,Match Report,
1,1.0,Sat,2020-09-12,15:00,Crystal Palace,0.7,1–0,0.8,Southampton,,Selhurst Park,Jonathan Moss,Match Report,
2,1.0,Sat,2020-09-12,17:30,Liverpool,3.3,4–3,0.6,Leeds United,,Anfield,Michael Oliver,Match Report,
3,1.0,Sat,2020-09-12,20:00,West Ham,1.1,0–2,1.5,Newcastle Utd,,London Stadium,Stuart Attwell,Match Report,
4,1.0,Sun,2020-09-13,14:00,West Brom,0.5,0–3,2.2,Leicester City,,The Hawthorns,Anthony Taylor,Match Report,


{'Arsenal': 'Arsenal',
 'Aston Villa': 'Aston Villa',
 'Brentford': 'Brentford',
 'Brighton': 'Brighton',
 'Burnley': 'Burnley',
 'Chelsea': 'Chelsea',
 'Crystal Palace': 'Crystal Palace',
 'Everton': 'Everton',
 'Leeds United': 'Leeds United',
 'Leicester City': 'Leicester City',
 'Liverpool': 'Liverpool',
 'Manchester City': 'Manchester City',
 'Manchester Utd': 'Manchester Utd',
 'Newcastle Utd': 'Newcastle Utd',
 'Norwich City': 'Norwich City',
 'Southampton': 'Southampton',
 'Tottenham': 'Tottenham',
 'Watford': 'Watford',
 'West Ham': 'West Ham',
 'Wolves': 'Wolves'}

Unnamed: 0,Wk,Day,Date,Time,Home,xG_home,Score,xG_away,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2021-08-13,20:00 (22:00),Brentford,1.3,2–0,1.4,Arsenal,16479.0,Brentford Community Stadium,Michael Oliver,Match Report,
1,1.0,Sat,2021-08-14,12:30 (14:30),Manchester Utd,1.5,5–1,0.6,Leeds United,72732.0,Old Trafford,Paul Tierney,Match Report,
2,1.0,Sat,2021-08-14,15:00 (17:00),Leicester City,0.8,1–0,1.1,Wolves,31983.0,King Power Stadium,Craig Pawson,Match Report,
3,1.0,Sat,2021-08-14,15:00 (17:00),Burnley,1.0,1–2,1.3,Brighton,16910.0,Turf Moor,David Coote,Match Report,
4,1.0,Sat,2021-08-14,15:00 (17:00),Chelsea,0.9,3–0,0.3,Crystal Palace,38965.0,Stamford Bridge,Jonathan Moss,Match Report,


{'Arsenal': 'Arsenal',
 'Aston Villa': 'Aston Villa',
 'Bournemouth': 'Bournemouth',
 'Brentford': 'Brentford',
 'Brighton': 'Brighton',
 'Chelsea': 'Chelsea',
 'Crystal Palace': 'Crystal Palace',
 'Everton': 'Everton',
 'Fulham': 'Fulham',
 'Leeds United': 'Leeds United',
 'Leicester City': 'Leicester City',
 'Liverpool': 'Liverpool',
 'Manchester City': 'Manchester City',
 'Manchester Utd': 'Manchester Utd',
 'Newcastle Utd': 'Newcastle Utd',
 "Nott'ham Forest": 'Nottingham Forest',
 'Southampton': 'Southampton',
 'Tottenham': 'Tottenham',
 'West Ham': 'West Ham',
 'Wolves': 'Wolves'}

Unnamed: 0,Wk,Day,Date,Time,Home,xG_home,Score,xG_away,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Match Report,
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,Match Report,
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,61732.0,Tottenham Hotspur Stadium,Andre Marriner,Match Report,
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nottingham Forest,52245.0,St James' Park,Simon Hooper,Match Report,
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,Match Report,


In [20]:
# get xg data to fpl fixtures df

for i in range(len(fixtures)):

    home_xg = []
    away_xg = []
    for ix, row in fixtures[i].iterrows():
        home_team = row.home_team
        away_team = row.away_team
        home_team_xg = fixtures_fbref[i].loc[(fixtures_fbref[i]['Home']==home_team) & (fixtures_fbref[i]['Away']==away_team), 'xG_home'].values[0]
        away_team_xg = fixtures_fbref[i].loc[(fixtures_fbref[i]['Home']==home_team) & (fixtures_fbref[i]['Away']==away_team), 'xG_away'].values[0]
        home_xg.append( home_team_xg )
        away_xg.append( away_team_xg )

    fixtures[i]['xg_home'] = home_xg
    fixtures[i]['xg_away'] = away_xg

    display(fixtures[i].head())
    print('Nulls:')
    display(fixtures[i][['xg_home', 'xg_away']].isnull().sum())

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season,xg_home,xg_away
0,2128288,1,True,True,2,2020-09-12 11:30:00+00:00,90,False,True,1,3,8,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,2,58898,Arsenal,Fulham,20-21,0.2,1.8
1,2128287,1,True,True,1,2020-09-12 14:00:00+00:00,90,False,True,16,0,6,1,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,3,58897,Southampton,Crystal Palace,20-21,0.7,0.8
2,2128289,1,True,True,3,2020-09-12 16:30:00+00:00,90,False,True,10,3,11,4,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,3,58899,Leeds United,Liverpool,20-21,3.3,0.6
3,2128293,1,True,True,6,2020-09-12 19:00:00+00:00,90,False,True,14,2,19,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,58903,Newcastle Utd,West Ham,20-21,1.1,1.5
4,2128292,1,True,True,5,2020-09-13 13:00:00+00:00,90,False,True,9,3,18,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,58902,Leicester City,West Brom,20-21,0.5,2.2


Nulls:


xg_home    0
xg_away    0
dtype: int64

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season,xg_home,xg_away
0,2210271,1,True,True,1,2021-08-13 19:00:00+00:00,90,False,True,1,0,3,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",4,2,66342,Arsenal,Brentford,21-22,1.3,1.4
1,2210276,1,True,True,6,2021-08-14 11:30:00+00:00,90,False,True,10,1,13,5,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,66347,Leeds United,Manchester Utd,21-22,1.5,0.6
2,2210272,1,True,True,2,2021-08-14 14:00:00+00:00,90,False,True,4,2,5,1,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,66343,Brighton,Burnley,21-22,1.0,1.3
3,2210273,1,True,True,3,2021-08-14 14:00:00+00:00,90,False,True,7,0,6,3,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,4,66344,Crystal Palace,Chelsea,21-22,0.9,0.3
4,2210274,1,True,True,4,2021-08-14 14:00:00+00:00,90,False,True,16,1,8,3,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,66345,Southampton,Everton,21-22,2.4,0.7


Nulls:


xg_home    0
xg_away    0
dtype: int64

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,home_team,away_team,season,xg_home,xg_away
0,2292810,1,True,True,1,2022-08-05 19:00:00+00:00,90,False,True,1,2,7,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,74911,Crystal Palace,Arsenal,22-23,1.2,1.0
1,2292813,1,True,True,4,2022-08-06 11:30:00+00:00,90,False,True,12,2,9,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,74914,Fulham,Liverpool,22-23,1.2,1.2
2,2292811,1,True,True,2,2022-08-06 14:00:00+00:00,90,False,True,2,0,3,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",3,2,74912,Bournemouth,Aston Villa,22-23,0.6,0.7
3,2292814,1,True,True,5,2022-08-06 14:00:00+00:00,90,False,True,20,1,11,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,2,74915,Leeds United,Wolves,22-23,0.8,1.3
4,2292816,1,True,True,7,2022-08-06 14:00:00+00:00,90,False,True,16,0,15,2,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,4,74917,Newcastle Utd,Nottingham Forest,22-23,1.7,0.3


Nulls:


xg_home    0
xg_away    0
dtype: int64

### Calculate exponentially weighted moving averages for each teams' xG data

In [21]:
# get each team's data on one row

fixtures_melt_list = []
for i in range(len(fixtures)):

    df = fixtures[i].melt(id_vars=['xg_home', 'xg_away', 'team_h_score', 'team_a_score', 'event', 'kickoff_time', 'id'], value_vars=['home_team', 'away_team'])
    df['season'] = seasons[i]
    fixtures_melt_list.append( df )

fixtures_melt = pd.concat(fixtures_melt_list, ignore_index=True)

display(fixtures_melt.head())
display(fixtures_melt.tail())
display(fixtures_melt.shape)


Unnamed: 0,xg_home,xg_away,team_h_score,team_a_score,event,kickoff_time,id,variable,value,season
0,0.2,1.8,0,3,1,2020-09-12 11:30:00+00:00,2,home_team,Fulham,20-21
1,0.7,0.8,1,0,1,2020-09-12 14:00:00+00:00,1,home_team,Crystal Palace,20-21
2,3.3,0.6,4,3,1,2020-09-12 16:30:00+00:00,3,home_team,Liverpool,20-21
3,1.1,1.5,0,2,1,2020-09-12 19:00:00+00:00,6,home_team,West Ham,20-21
4,0.5,2.2,0,3,1,2020-09-13 13:00:00+00:00,5,home_team,West Brom,20-21


Unnamed: 0,xg_home,xg_away,team_h_score,team_a_score,event,kickoff_time,id,variable,value,season
2275,1.0,0.5,1,0,38,2023-05-28 15:30:00+00:00,376,away_team,Bournemouth,22-23
2276,1.5,2.2,1,4,38,2023-05-28 15:30:00+00:00,377,away_team,Tottenham,22-23
2277,1.4,1.4,2,1,38,2023-05-28 15:30:00+00:00,378,away_team,West Ham,22-23
2278,2.9,1.8,2,1,38,2023-05-28 15:30:00+00:00,379,away_team,Fulham,22-23
2279,1.8,4.0,4,4,38,2023-05-28 15:30:00+00:00,380,away_team,Liverpool,22-23


(2280, 10)

In [22]:
# get team's xG (home xG if at home, away xG if at an away game)
fixtures_melt['xG'] = fixtures_melt.apply(lambda x: x['xg_home'] if x['variable']=='home_team' else x['xg_away'], axis=1)
fixtures_melt['xGA'] = fixtures_melt.apply(lambda x: x['xg_away'] if x['variable']=='home_team' else x['xg_home'], axis=1)

# sort by date
fixtures_melt = fixtures_melt.sort_values(by='kickoff_time').reset_index(drop=True)

# calculate rolling averages
rolling_windows = [5,10,20,40]

for i in rolling_windows:
    fixtures_melt[f'xG_ewm_{i}'] = (fixtures_melt[['value','xG']].groupby(by='value').ewm(alpha=1/i).mean()
                                    .reset_index().sort_values(by='level_1')['xG'].values)
    fixtures_melt[f'xGA_ewm_{i}'] = (fixtures_melt[['value','xGA']].groupby(by='value').ewm(alpha=1/i).mean()
                                    .reset_index().sort_values(by='level_1')['xGA'].values)
    
fixtures_melt

Unnamed: 0,xg_home,xg_away,team_h_score,team_a_score,event,kickoff_time,id,variable,value,season,xG,xGA,xG_ewm_5,xGA_ewm_5,xG_ewm_10,xGA_ewm_10,xG_ewm_20,xGA_ewm_20,xG_ewm_40,xGA_ewm_40
0,0.2,1.8,0,3,1,2020-09-12 11:30:00+00:00,2,home_team,Fulham,20-21,0.2,1.8,0.200000,1.800000,0.200000,1.800000,0.200000,1.800000,0.200000,1.800000
1,0.2,1.8,0,3,1,2020-09-12 11:30:00+00:00,2,away_team,Arsenal,20-21,1.8,0.2,1.800000,0.200000,1.800000,0.200000,1.800000,0.200000,1.800000,0.200000
2,0.7,0.8,1,0,1,2020-09-12 14:00:00+00:00,1,home_team,Crystal Palace,20-21,0.7,0.8,0.700000,0.800000,0.700000,0.800000,0.700000,0.800000,0.700000,0.800000
3,0.7,0.8,1,0,1,2020-09-12 14:00:00+00:00,1,away_team,Southampton,20-21,0.8,0.7,0.800000,0.700000,0.800000,0.700000,0.800000,0.700000,0.800000,0.700000
4,3.3,0.6,4,3,1,2020-09-12 16:30:00+00:00,3,home_team,Liverpool,20-21,3.3,0.6,3.300000,0.600000,3.300000,0.600000,3.300000,0.600000,3.300000,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2275,1.3,1.6,1,0,38,2023-05-28 15:30:00+00:00,373,home_team,Brentford,22-23,1.3,1.6,1.583303,1.372774,1.565994,1.353038,1.513093,1.337407,1.455032,1.326443
2276,2.8,1.4,2,1,38,2023-05-28 15:30:00+00:00,372,home_team,Aston Villa,22-23,2.8,1.4,1.585402,1.129728,1.461127,1.240309,1.371636,1.321846,1.312629,1.347846
2277,2.8,0.5,5,0,38,2023-05-28 15:30:00+00:00,371,home_team,Arsenal,22-23,2.8,0.5,1.679808,1.173587,1.785120,1.238845,1.817679,1.197628,1.766396,1.168456
2278,1.8,4.0,4,4,38,2023-05-28 15:30:00+00:00,380,home_team,Southampton,22-23,1.8,4.0,1.169897,2.449979,1.077406,2.081046,1.047800,1.815814,1.069510,1.666132


In [27]:
# shift team xg data by one so that the target game result is not included
cols_to_shift = [col for col in fixtures_melt if 'ewm' in col]
fixtures_melt[cols_to_shift] = fixtures_melt.groupby('value')[cols_to_shift].shift(1)

In [28]:
# check what a given team's stats look like
fixtures_melt[fixtures_melt.value=='Fulham']

Unnamed: 0,xg_home,xg_away,team_h_score,team_a_score,event,kickoff_time,id,variable,value,season,xG,xGA,xG_ewm_5,xGA_ewm_5,xG_ewm_10,xGA_ewm_10,xG_ewm_20,xGA_ewm_20,xG_ewm_40,xGA_ewm_40
0,0.2,1.8,0,3,1,2020-09-12 11:30:00+00:00,2,home_team,Fulham,20-21,0.2,1.8,,,,,,,,
18,1.7,1.8,4,3,2,2020-09-19 14:00:00+00:00,13,away_team,Fulham,20-21,1.8,1.7,0.2,1.8,0.2,1.8,0.2,1.8,0.2,1.8
52,0.7,1.6,0,3,3,2020-09-28 16:45:00+00:00,22,home_team,Fulham,20-21,0.7,1.6,1.088889,1.744444,1.042105,1.747368,1.020513,1.748718,1.010127,1.749367
70,1.2,0.7,1,0,4,2020-10-04 13:00:00+00:00,38,away_team,Fulham,20-21,0.7,1.2,0.929508,1.685246,0.915867,1.692989,0.908151,1.696582,0.904123,1.698312
84,1.5,1.7,1,1,5,2020-10-18 11:00:00+00:00,46,away_team,Fulham,20-21,1.7,1.5,0.851762,1.520867,0.853097,1.549637,0.852044,1.562728,0.851138,1.568964
100,1.1,2.6,1,2,6,2020-10-24 14:00:00+00:00,53,home_team,Fulham,20-21,1.1,2.6,1.104093,1.51466,1.059906,1.537516,1.039463,1.548863,1.029614,1.554464
132,1.5,0.5,2,0,7,2020-11-02 17:30:00+00:00,61,home_team,Fulham,20-21,1.5,0.5,1.102984,1.808847,1.068463,1.764271,1.050889,1.74726,1.0421,1.739933
147,1.3,1.6,1,0,8,2020-11-07 20:00:00+00:00,78,away_team,Fulham,20-21,1.6,1.3,1.203458,1.477613,1.15118,1.521936,1.125328,1.540529,1.112586,1.549066
164,2.0,1.6,2,3,9,2020-11-22 12:00:00+00:00,81,home_team,Fulham,20-21,2.0,1.6,1.298754,1.434929,1.229985,1.482968,1.195842,1.504798,1.179046,1.515105
193,0.9,1.8,1,2,10,2020-11-30 17:30:00+00:00,94,away_team,Fulham,20-21,1.8,0.9,1.460746,1.473061,1.355685,1.502073,1.304586,1.517672,1.279769,1.525521


In [29]:
# save fixtures_melt
filepath = Path('../../data/modeling/team_data.csv')
fpl_df.to_csv(filepath)

### Add team data to FPL data

In [30]:
fixtures_df = pd.concat(fixtures, ignore_index=True)
fixtures_df

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id,away_team,home_team,season,xg_home,xg_away
0,2128288,1,True,True,2,2020-09-12 11:30:00+00:00,90,False,True,1,3,8,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,2,58898,Arsenal,Fulham,20-21,0.2,1.8
1,2128287,1,True,True,1,2020-09-12 14:00:00+00:00,90,False,True,16,0,6,1,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,3,58897,Southampton,Crystal Palace,20-21,0.7,0.8
2,2128289,1,True,True,3,2020-09-12 16:30:00+00:00,90,False,True,10,3,11,4,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,3,58899,Leeds United,Liverpool,20-21,3.3,0.6
3,2128293,1,True,True,6,2020-09-12 19:00:00+00:00,90,False,True,14,2,19,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,58903,Newcastle Utd,West Ham,20-21,1.1,1.5
4,2128292,1,True,True,5,2020-09-13 13:00:00+00:00,90,False,True,9,3,18,0,"[{'identifier': 'goals_scored', 'a': [{'value'...",4,2,58902,Leicester City,West Brom,20-21,0.5,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,2293185,38,True,True,376,2023-05-28 15:30:00+00:00,90,False,True,3,0,8,1,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,2,75286,Bournemouth,Everton,22-23,1.0,0.5
1136,2293186,38,True,True,377,2023-05-28 15:30:00+00:00,90,False,True,18,4,11,1,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,2,75287,Tottenham,Leeds United,22-23,1.5,2.2
1137,2293187,38,True,True,378,2023-05-28 15:30:00+00:00,90,False,True,19,1,10,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,3,75288,West Ham,Leicester City,22-23,1.4,1.4
1138,2293188,38,True,True,379,2023-05-28 15:30:00+00:00,90,False,True,9,1,14,2,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,75289,Fulham,Manchester Utd,22-23,2.9,1.8


In [31]:
# columns to be fetched from team data
col_names = ['xG', 'xGA']
col_names += [f'xG_ewm_{i}' for i in rolling_windows]
col_names += [f'xGA_ewm_{i}' for i in rolling_windows]
nr_cols = len(col_names) 
team_data = []
opponent_data = []
home_indicator = []
count_non_one_games = 0
for ix, row in fpl_df.iterrows():
    gameweek = row.gameweek
    team = row.team_name
    season = row.season
    games = fixtures_melt[(fixtures_melt.value==team) & (fixtures_melt.event==gameweek) & (fixtures_melt.season==season)]
    if games.shape[0]!=1:
        team_data.append( np.array([np.nan]*nr_cols) )
        opponent_data.append( np.array([np.nan]*nr_cols) )
        home_indicator.append( np.array([np.nan]) )
        count_non_one_games += 1
    elif games.shape[0]==1:
        # add team data
        team_data.append( games[col_names].values.flatten() )
        # find opponent data
        home_game = games.variable.values[0]=='home_team'
        game_id = games.id.values[0]
        if home_game:
            home_indicator.append( np.array([1]) )
            opponent_team = fixtures_df.loc[(fixtures_df.home_team==team) & (fixtures_df.event==gameweek) & (fixtures_df.season==season), 'away_team'].values[0]
        else:
            home_indicator.append( np.array([0]) )
            opponent_team = fixtures_df.loc[(fixtures_df.away_team==team) & (fixtures_df.event==gameweek) & (fixtures_df.season==season), 'home_team'].values[0]
        opponent_games = fixtures_melt[(fixtures_melt.value==opponent_team) & (fixtures_melt.event==gameweek) & (fixtures_melt.season==season) & (fixtures_melt.id==game_id)]
        # add opponent data
        opponent_data.append( opponent_games[col_names].values.flatten() )
    else:
        print(f'Check number of games for ix {ix}!')
     

new_col_names = ['team_'+col for col in col_names]
team_data_df = pd.DataFrame(team_data, columns=new_col_names)
new_oppo_col_names = ['opponent_'+col for col in col_names]
opponent_data_df = pd.DataFrame(opponent_data, columns=new_oppo_col_names)
home_indicator_df = pd.DataFrame(home_indicator, columns=['home'])

fpl_df = fpl_df.join([team_data_df, opponent_data_df, home_indicator_df])

display(fpl_df.head())
display(fpl_df.tail())
display(fpl_df.shape)
print(f'Number of non-one-games: {count_non_one_games}')


Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes,team_xG,team_xGA,team_xG_ewm_5,team_xG_ewm_10,team_xG_ewm_20,team_xG_ewm_40,team_xGA_ewm_5,team_xGA_ewm_10,team_xGA_ewm_20,team_xGA_ewm_40,opponent_xG,opponent_xGA,opponent_xG_ewm_5,opponent_xG_ewm_10,opponent_xG_ewm_20,opponent_xG_ewm_40,opponent_xGA_ewm_5,opponent_xGA_ewm_10,opponent_xGA_ewm_20,opponent_xGA_ewm_40,home
0,0,0,19,1,,15.3,49,33,6.0,0,3,7,Pierre-Emerick,0,1,10.6,7,4,36.6,23,8,90,120,0,0,1.0,0,7.0,0,0,Aubameyang,46.3,54.0,8,4,7,Aubameyang,1,Arsenal,1,20-21,0.4,0.0,0.2,90.0,1.8,0.2,,,,,,,,,0.2,1.8,,,,,,,,,0.0
1,0,1,29,1,,12.7,62,8,5.0,0,4,7,Alexandre,0,1,9.9,14,4,38.6,20,4,86,85,0,0,3.0,0,7.0,0,0,Lacazette,4.9,48.0,11,7,7,Lacazette,0,Arsenal,1,20-21,0.5,0.2,0.2,86.0,1.8,0.2,,,,,,,,,0.2,1.8,,,,,,,,,0.0
2,0,1,29,1,,0.0,370,44,,0,1,7,Bernd,0,0,1.4,169,10,14.0,96,10,90,50,0,0,,0,7.0,0,2,Leno,9.1,0.0,333,44,7,Leno,0,Arsenal,1,20-21,0.0,0.0,0.2,90.0,1.8,0.2,,,,,,,,,0.2,1.8,,,,,,,,,0.0
3,0,0,11,1,,17.2,42,28,2.0,0,3,3,Granit,0,0,4.5,57,29,6.8,150,61,77,55,0,0,,0,3.0,0,0,Xhaka,0.9,21.0,43,20,3,Xhaka,0,Arsenal,1,20-21,0.2,0.1,0.2,77.0,1.8,0.2,,,,,,,,,0.2,1.8,,,,,,,,,0.0
4,0,0,23,1,,26.4,22,7,,0,2,5,Héctor,0,0,4.0,71,21,9.8,128,58,90,50,0,0,,0,5.0,0,0,Bellerín,4.9,4.0,96,24,5,Bellerín,1,Arsenal,1,20-21,0.0,0.2,0.2,90.0,1.8,0.2,,,,,,,,,0.2,1.8,,,,,,,,,0.0


Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes,team_xG,team_xGA,team_xG_ewm_5,team_xG_ewm_10,team_xG_ewm_20,team_xG_ewm_40,team_xGA_ewm_5,team_xGA_ewm_10,team_xGA_ewm_20,team_xGA_ewm_40,opponent_xG,opponent_xGA,opponent_xG_ewm_5,opponent_xG_ewm_10,opponent_xG_ewm_20,opponent_xG_ewm_40,opponent_xGA_ewm_5,opponent_xGA_ewm_10,opponent_xGA_ewm_20,opponent_xGA_ewm_40,home
22471,1,3,239,3,5.0,269.2,136,26,,0,2,0,Hugo,29,0,57.2,263,86,224.0,291,115,1307,38,0,0,,0,2.0,0,0,Bueno López,3.6,76.0,314,101,41,Bueno,1,Wolves,38,22-23,0.0,0.11,1.27,45.0,0.5,2.8,0.982624,0.985665,0.980812,0.978357,2.023731,1.819609,1.693806,1.600505,2.8,0.5,1.39976,1.672355,1.76582,1.738285,1.341984,1.320939,1.234457,1.186636,0.0
22472,0,0,17,0,,17.4,427,213,,0,3,1,Joseph,5,0,6.1,471,213,14.4,482,215,172,43,0,0,,0,0.8,0,0,Hodge,0.0,30.0,397,193,5,Hodge,1,Wolves,38,22-23,0.16,0.01,0.03,5.0,0.5,2.8,0.982624,0.985665,0.980812,0.978357,2.023731,1.819609,1.693806,1.600505,2.8,0.5,1.39976,1.672355,1.76582,1.738285,1.341984,1.320939,1.234457,1.186636,0.0
22473,1,5,338,11,6.0,406.6,83,66,,0,3,1,Matheus Luiz,38,1,109.3,113,74,371.4,195,80,2467,48,0,0,,0,2.4,1,0,Nunes,0.1,314.0,118,75,80,Matheus,2,Wolves,38,22-23,0.04,0.15,1.27,45.0,0.5,2.8,0.982624,0.985665,0.980812,0.978357,2.023731,1.819609,1.693806,1.600505,2.8,0.5,1.39976,1.672355,1.76582,1.738285,1.341984,1.320939,1.234457,1.186636,0.0
22474,1,0,196,5,,66.3,346,179,,0,3,2,Mario,28,0,31.8,360,162,216.4,298,124,1297,45,0,0,,0,1.8,1,0,Lemina,0.0,36.0,389,189,35,Lemina,2,Wolves,38,22-23,0.0,0.0,2.39,90.0,0.5,2.8,0.982624,0.985665,0.980812,0.978357,2.023731,1.819609,1.693806,1.600505,2.8,0.5,1.39976,1.672355,1.76582,1.738285,1.341984,1.320939,1.234457,1.186636,0.0
22475,0,2,84,2,,74.3,335,174,,0,3,2,João Victor,18,1,25.8,390,173,139.8,358,146,649,45,0,0,,0,1.9,0,0,Gomes da Silva,0.0,46.0,371,180,21,João Gomes,5,Wolves,38,22-23,0.0,0.01,2.37,84.0,0.5,2.8,0.982624,0.985665,0.980812,0.978357,2.023731,1.819609,1.693806,1.600505,2.8,0.5,1.39976,1.672355,1.76582,1.738285,1.341984,1.320939,1.234457,1.186636,0.0


(22476, 66)

Number of non-one-games: 940


In [32]:
# should be close to 50%
fpl_df.home.sum() / fpl_df.shape[0]

0.4804680548140238

In [33]:
fpl_df.isnull().sum() 

assists                                     0
bonus                                       0
bps                                         0
clean_sheets                                0
corners_and_indirect_freekicks_order    17326
creativity                                  0
creativity_rank                             0
creativity_rank_type                        0
direct_freekicks_order                  17842
dreamteam_count                             0
element_type                                0
event_points                                0
first_name                                  0
goals_conceded                              0
goals_scored                                0
ict_index                                   0
ict_index_rank                              0
ict_index_rank_type                         0
influence                                   0
influence_rank                              0
influence_rank_type                         0
minutes                           

In [34]:
# drop player data from gameweeks where they had more than one game and team xg data could not be correctly attributed
fpl_df = fpl_df[fpl_df.opponent_xGA_ewm_5.notnull()].reset_index(drop=True)

In [35]:
fpl_df.isnull().sum() / fpl_df.shape[0]

assists                                 0.000000
bonus                                   0.000000
bps                                     0.000000
clean_sheets                            0.000000
corners_and_indirect_freekicks_order    0.769166
creativity                              0.000000
creativity_rank                         0.000000
creativity_rank_type                    0.000000
direct_freekicks_order                  0.791217
dreamteam_count                         0.000000
element_type                            0.000000
event_points                            0.000000
first_name                              0.000000
goals_conceded                          0.000000
goals_scored                            0.000000
ict_index                               0.000000
ict_index_rank                          0.000000
ict_index_rank_type                     0.000000
influence                               0.000000
influence_rank                          0.000000
influence_rank_type 

### FPL gameweek stats

In [36]:
# calculate gameweek stats by looking at differences in cumulative stats

diff_columns = ['assists', 'bps', 'creativity', 'goals_scored', 'goals_conceded', 'own_goals', 'penalties_saved', 
                'red_cards', 'saves', 'threat', 'yellow_cards']

for col in diff_columns:
    fpl_df[f'gameweek_{col}'] = fpl_df.groupby(['web_name', 'season'])[col].diff()#.fillna(fpl_df[col])
    fpl_df[f'gameweek_{col}'] = fpl_df.apply(lambda x: my_fill_na(x, f'gameweek_{col}', col), axis=1)

In [37]:
# sanity check
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['goals_scored', 'gameweek_goals_scored']]

Unnamed: 0,goals_scored,gameweek_goals_scored
1,1,
211,1,0.0
470,1,0.0
724,1,0.0
977,1,0.0
1231,2,1.0
1486,2,0.0
1734,2,0.0
1988,2,0.0
2238,2,0.0


### FPL expected points

In [38]:
fpl_df['gameweek_xPoints'] = fpl_df.apply(lambda x: calculate_xPoints(x,clf), axis=1)

In [39]:
# proportion of nans
fpl_df['gameweek_xPoints'].isnull().sum() / fpl_df.shape[0]

0.027517316119304528

In [40]:
fig = px.histogram(fpl_df, x='gameweek_xPoints', nbins=40)
fig.show()

In [41]:
# sanity check
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['gameweek_xPoints']]

Unnamed: 0,gameweek_xPoints
1,
211,2.041019
470,2.820277
724,3.723653
977,4.104416
1231,8.16875
1486,2.783207
1734,3.582265
1988,4.246686
2238,3.606897


### FPL moving averages

In [42]:
# calculate moving averages based on gameweek stats

ewm_columns = ['gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points', 'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves', 
               'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA', 'gameweek_minutes', 'gameweek_xPoints']

for i in rolling_windows:
    new_columns = [col+f'_ewm_{i}' for col in ewm_columns]
    fpl_df[new_columns] = fpl_df.groupby('web_name')[ewm_columns].ewm(alpha=1/i).mean().reset_index().sort_values(by='level_1')[ewm_columns].values
    #fpl_df[new_columns] = fpl_df.groupby('web_name')[ewm_columns].rolling(i, min_periods=1, closed='left').mean().reset_index().sort_values(by='level_1')[ewm_columns].values

display(fpl_df.head())
display(fpl_df.shape)

Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes,team_xG,team_xGA,team_xG_ewm_5,team_xG_ewm_10,team_xG_ewm_20,team_xG_ewm_40,team_xGA_ewm_5,team_xGA_ewm_10,team_xGA_ewm_20,team_xGA_ewm_40,opponent_xG,opponent_xGA,opponent_xG_ewm_5,opponent_xG_ewm_10,opponent_xG_ewm_20,opponent_xG_ewm_40,opponent_xGA_ewm_5,opponent_xGA_ewm_10,opponent_xGA_ewm_20,opponent_xGA_ewm_40,home,gameweek_assists,gameweek_bps,gameweek_creativity,gameweek_goals_scored,gameweek_goals_conceded,gameweek_own_goals,gameweek_penalties_saved,gameweek_red_cards,gameweek_saves,gameweek_threat,gameweek_yellow_cards,gameweek_xPoints,gameweek_assists_ewm_5,gameweek_bps_ewm_5,gameweek_creativity_ewm_5,event_points_ewm_5,gameweek_goals_scored_ewm_5,gameweek_goals_conceded_ewm_5,gameweek_saves_ewm_5,gameweek_threat_ewm_5,gameweek_xG_ewm_5,gameweek_xA_ewm_5,gameweek_xGA_ewm_5,gameweek_minutes_ewm_5,gameweek_xPoints_ewm_5,gameweek_assists_ewm_10,gameweek_bps_ewm_10,gameweek_creativity_ewm_10,event_points_ewm_10,gameweek_goals_scored_ewm_10,gameweek_goals_conceded_ewm_10,gameweek_saves_ewm_10,gameweek_threat_ewm_10,gameweek_xG_ewm_10,gameweek_xA_ewm_10,gameweek_xGA_ewm_10,gameweek_minutes_ewm_10,gameweek_xPoints_ewm_10,gameweek_assists_ewm_20,gameweek_bps_ewm_20,gameweek_creativity_ewm_20,event_points_ewm_20,gameweek_goals_scored_ewm_20,gameweek_goals_conceded_ewm_20,gameweek_saves_ewm_20,gameweek_threat_ewm_20,gameweek_xG_ewm_20,gameweek_xA_ewm_20,gameweek_xGA_ewm_20,gameweek_minutes_ewm_20,gameweek_xPoints_ewm_20,gameweek_assists_ewm_40,gameweek_bps_ewm_40,gameweek_creativity_ewm_40,event_points_ewm_40,gameweek_goals_scored_ewm_40,gameweek_goals_conceded_ewm_40,gameweek_saves_ewm_40,gameweek_threat_ewm_40,gameweek_xG_ewm_40,gameweek_xA_ewm_40,gameweek_xGA_ewm_40,gameweek_minutes_ewm_40,gameweek_xPoints_ewm_40
0,0,0,3,0,,0.0,493,188,4.0,0,2,1,David,0,0,0.0,497,188,0.0,490,188,1,55,0,0,,0,1.0,0,0,Luiz Moreira Marinho,0.9,0.0,479,186,1,David Luiz,0,Arsenal,2,20-21,0.0,0.0,1.9,1.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836
1,1,0,39,1,,51.7,19,15,6.0,0,3,5,Pierre-Emerick,1,1,18.5,13,9,64.6,31,15,180,119,0,0,1.0,0,6.0,0,0,Aubameyang,32.9,69.0,16,6,12,Aubameyang,1,Arsenal,2,20-21,0.1,0.5,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,
2,0,2,58,1,,18.4,95,11,5.0,0,4,7,Alexandre,1,2,16.9,20,8,71.6,25,9,162,85,0,0,3.0,0,7.0,0,0,Lacazette,5.1,79.0,14,10,14,Lacazette,0,Arsenal,2,20-21,0.1,0.0,1.9,76.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,
3,0,1,47,1,,0.0,460,47,,0,1,2,Bernd,1,0,3.2,191,12,32.0,100,12,180,50,0,0,,0,4.5,0,4,Leno,9.0,0.0,435,47,9,Leno,0,Arsenal,2,20-21,0.0,0.0,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,
4,0,0,21,1,,23.9,74,49,2.0,0,3,2,Granit,1,0,6.8,88,44,23.2,133,47,167,55,0,0,,0,2.5,0,0,Xhaka,0.8,21.0,92,49,5,Xhaka,0,Arsenal,2,20-21,0.0,0.0,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,


(21223, 130)

In [43]:
# sanity check
fpl_df.loc[fpl_df.web_name=='Aubameyang', ['gameweek_goals_scored', 'gameweek_goals_scored_ewm_5', 'gameweek_goals_scored_ewm_10', 
                                           'gameweek_goals_scored_ewm_20', 'gameweek_goals_scored_ewm_40']]

Unnamed: 0,gameweek_goals_scored,gameweek_goals_scored_ewm_5,gameweek_goals_scored_ewm_10,gameweek_goals_scored_ewm_20,gameweek_goals_scored_ewm_40
1,,,,,
211,0.0,0.0,0.0,0.0,0.0
470,0.0,0.0,0.0,0.0,0.0
724,0.0,0.0,0.0,0.0,0.0
977,0.0,0.0,0.0,0.0,0.0
1231,1.0,0.297477,0.244194,0.221025,0.210253
1486,0.0,0.216844,0.192078,0.179307,0.172956
1734,0.0,0.161967,0.155261,0.149588,0.146332
1988,0.0,0.123043,0.128,0.127366,0.12638
2238,0.0,0.09462,0.107104,0.110143,0.110874


# FPL expanding stats

In [44]:
expanding_columns = ['gameweek_assists', 'gameweek_bps', 'gameweek_creativity', 'event_points', 'gameweek_goals_scored', 'gameweek_goals_conceded', 'gameweek_saves', 
               'gameweek_threat', 'gameweek_xG', 'gameweek_xA', 'gameweek_xGA', 'gameweek_minutes', 'gameweek_xPoints']
expanding_col_names = [col+'_expanding' for col in expanding_columns]

fpl_df[expanding_col_names] = (
    fpl_df
    .groupby(['web_name'])[expanding_columns]
    .expanding()
    .sum()
    .reset_index()
    .sort_values('level_1')[expanding_columns]
    .values
)

display(fpl_df.head())
display(fpl_df.shape)

Unnamed: 0,assists,bonus,bps,clean_sheets,corners_and_indirect_freekicks_order,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,dreamteam_count,element_type,event_points,first_name,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,influence,influence_rank,influence_rank_type,minutes,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,points_per_game,red_cards,saves,second_name,selected_by_percent,threat,threat_rank,threat_rank_type,total_points,web_name,yellow_cards,team_name,gameweek,season,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes,team_xG,team_xGA,team_xG_ewm_5,team_xG_ewm_10,team_xG_ewm_20,team_xG_ewm_40,team_xGA_ewm_5,team_xGA_ewm_10,team_xGA_ewm_20,team_xGA_ewm_40,opponent_xG,opponent_xGA,opponent_xG_ewm_5,opponent_xG_ewm_10,opponent_xG_ewm_20,opponent_xG_ewm_40,opponent_xGA_ewm_5,opponent_xGA_ewm_10,opponent_xGA_ewm_20,opponent_xGA_ewm_40,home,gameweek_assists,gameweek_bps,gameweek_creativity,gameweek_goals_scored,gameweek_goals_conceded,gameweek_own_goals,gameweek_penalties_saved,gameweek_red_cards,gameweek_saves,gameweek_threat,gameweek_yellow_cards,gameweek_xPoints,gameweek_assists_ewm_5,gameweek_bps_ewm_5,gameweek_creativity_ewm_5,event_points_ewm_5,gameweek_goals_scored_ewm_5,gameweek_goals_conceded_ewm_5,gameweek_saves_ewm_5,gameweek_threat_ewm_5,gameweek_xG_ewm_5,gameweek_xA_ewm_5,gameweek_xGA_ewm_5,gameweek_minutes_ewm_5,gameweek_xPoints_ewm_5,gameweek_assists_ewm_10,gameweek_bps_ewm_10,gameweek_creativity_ewm_10,event_points_ewm_10,gameweek_goals_scored_ewm_10,gameweek_goals_conceded_ewm_10,gameweek_saves_ewm_10,gameweek_threat_ewm_10,gameweek_xG_ewm_10,gameweek_xA_ewm_10,gameweek_xGA_ewm_10,gameweek_minutes_ewm_10,gameweek_xPoints_ewm_10,gameweek_assists_ewm_20,gameweek_bps_ewm_20,gameweek_creativity_ewm_20,event_points_ewm_20,gameweek_goals_scored_ewm_20,gameweek_goals_conceded_ewm_20,gameweek_saves_ewm_20,gameweek_threat_ewm_20,gameweek_xG_ewm_20,gameweek_xA_ewm_20,gameweek_xGA_ewm_20,gameweek_minutes_ewm_20,gameweek_xPoints_ewm_20,gameweek_assists_ewm_40,gameweek_bps_ewm_40,gameweek_creativity_ewm_40,event_points_ewm_40,gameweek_goals_scored_ewm_40,gameweek_goals_conceded_ewm_40,gameweek_saves_ewm_40,gameweek_threat_ewm_40,gameweek_xG_ewm_40,gameweek_xA_ewm_40,gameweek_xGA_ewm_40,gameweek_minutes_ewm_40,gameweek_xPoints_ewm_40,gameweek_assists_expanding,gameweek_bps_expanding,gameweek_creativity_expanding,event_points_expanding,gameweek_goals_scored_expanding,gameweek_goals_conceded_expanding,gameweek_saves_expanding,gameweek_threat_expanding,gameweek_xG_expanding,gameweek_xA_expanding,gameweek_xGA_expanding,gameweek_minutes_expanding,gameweek_xPoints_expanding
0,0,0,3,0,,0.0,493,188,4.0,0,2,1,David,0,0,0.0,497,188,0.0,490,188,1,55,0,0,,0,1.0,0,0,Luiz Moreira Marinho,0.9,0.0,479,186,1,David Luiz,0,Arsenal,2,20-21,0.0,0.0,1.9,1.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9,1.0,0.993836
1,1,0,39,1,,51.7,19,15,6.0,0,3,5,Pierre-Emerick,1,1,18.5,13,9,64.6,31,15,180,119,0,0,1.0,0,6.0,0,0,Aubameyang,32.9,69.0,16,6,12,Aubameyang,1,Arsenal,2,20-21,0.1,0.5,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,
2,0,2,58,1,,18.4,95,11,5.0,0,4,7,Alexandre,1,2,16.9,20,8,71.6,25,9,162,85,0,0,3.0,0,7.0,0,0,Lacazette,5.1,79.0,14,10,14,Lacazette,0,Arsenal,2,20-21,0.1,0.0,1.9,76.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,,,,,7.0,,,,,0.1,0.0,1.9,76.0,
3,0,1,47,1,,0.0,460,47,,0,1,2,Bernd,1,0,3.2,191,12,32.0,100,12,180,50,0,0,,0,4.5,0,4,Leno,9.0,0.0,435,47,9,Leno,0,Arsenal,2,20-21,0.0,0.0,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,
4,0,0,21,1,,23.9,74,49,2.0,0,3,2,Granit,1,0,6.8,88,44,23.2,133,47,167,55,0,0,,0,2.5,0,0,Xhaka,0.8,21.0,92,49,5,Xhaka,0,Arsenal,2,20-21,0.0,0.0,1.9,90.0,1.4,1.9,1.8,1.8,1.8,1.8,0.2,0.2,0.2,0.2,1.9,1.4,1.1,1.1,1.1,1.1,1.5,1.5,1.5,1.5,1.0,,,,,,,,,,,,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,,,,,2.0,,,,,0.0,0.0,1.9,90.0,


(21223, 143)

In [45]:
# sanity check
fpl_df.loc[fpl_df.web_name=='Aubameyang', expanding_columns + expanding_col_names + ['season']]

Unnamed: 0,gameweek_assists,gameweek_bps,gameweek_creativity,event_points,gameweek_goals_scored,gameweek_goals_conceded,gameweek_saves,gameweek_threat,gameweek_xG,gameweek_xA,gameweek_xGA,gameweek_minutes,gameweek_xPoints,gameweek_assists_expanding,gameweek_bps_expanding,gameweek_creativity_expanding,event_points_expanding,gameweek_goals_scored_expanding,gameweek_goals_conceded_expanding,gameweek_saves_expanding,gameweek_threat_expanding,gameweek_xG_expanding,gameweek_xA_expanding,gameweek_xGA_expanding,gameweek_minutes_expanding,gameweek_xPoints_expanding,season
1,,,,5,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,20-21
211,0.0,5.0,0.9,2,0.0,3.0,0.0,4.0,0.0,0.0,3.0,90.0,2.041019,0.0,5.0,0.9,7.0,0.0,3.0,0.0,4.0,0.1,0.5,4.9,180.0,2.041019,20-21
470,0.0,10.0,4.6,2,0.0,1.0,0.0,30.0,0.0,0.0,0.2,90.0,2.820277,0.0,15.0,5.5,9.0,0.0,4.0,0.0,34.0,0.1,0.5,5.1,270.0,4.861295,20-21
724,0.0,7.0,29.9,2,0.0,1.0,0.0,2.0,0.0,0.5,1.4,90.0,3.723653,0.0,22.0,35.4,11.0,0.0,5.0,0.0,36.0,0.1,1.0,6.5,360.0,8.584949,20-21
977,0.0,12.0,16.0,2,0.0,1.0,0.0,12.0,0.3,0.1,1.2,90.0,4.104416,0.0,34.0,51.4,13.0,0.0,6.0,0.0,48.0,0.4,1.1,7.7,450.0,12.689365,20-21
1231,0.0,31.0,16.4,8,1.0,0.0,0.0,26.0,0.9,0.1,0.3,86.0,8.16875,0.0,65.0,67.8,21.0,1.0,6.0,0.0,74.0,1.3,1.2,8.0,536.0,20.858115,20-21
1486,0.0,7.0,28.3,2,0.0,3.0,0.0,8.0,0.0,0.2,1.8,90.0,2.783207,0.0,72.0,96.1,23.0,1.0,9.0,0.0,82.0,1.3,1.4,9.8,626.0,23.641322,20-21
1734,0.0,4.0,14.4,3,0.0,0.0,0.0,39.0,0.3,0.0,2.5,90.0,3.582265,0.0,76.0,110.5,26.0,1.0,9.0,0.0,121.0,1.6,1.4,12.3,716.0,27.223587,20-21
1988,0.0,2.0,1.3,2,0.0,2.0,0.0,61.0,0.4,0.0,1.4,90.0,4.246686,0.0,78.0,111.8,28.0,1.0,11.0,0.0,182.0,2.0,1.4,13.7,806.0,31.470273,20-21
2238,0.0,6.0,7.3,2,0.0,2.0,0.0,26.0,0.2,0.0,0.5,90.0,3.606897,0.0,84.0,119.1,30.0,1.0,13.0,0.0,208.0,2.2,1.4,14.2,896.0,35.07717,20-21


# FPL per 90 stats

In [46]:
per_90_columns = [col+'_per90' for col in expanding_col_names]

for i in range(len(per_90_columns)):
    fpl_df[per_90_columns[i]] = fpl_df[expanding_col_names[i]] / fpl_df['gameweek_minutes_expanding'] * 90

In [47]:
# sanity check
fpl_df.loc[fpl_df.web_name=='Aubameyang', per_90_columns + expanding_col_names + ['season']]

Unnamed: 0,gameweek_assists_expanding_per90,gameweek_bps_expanding_per90,gameweek_creativity_expanding_per90,event_points_expanding_per90,gameweek_goals_scored_expanding_per90,gameweek_goals_conceded_expanding_per90,gameweek_saves_expanding_per90,gameweek_threat_expanding_per90,gameweek_xG_expanding_per90,gameweek_xA_expanding_per90,gameweek_xGA_expanding_per90,gameweek_minutes_expanding_per90,gameweek_xPoints_expanding_per90,gameweek_assists_expanding,gameweek_bps_expanding,gameweek_creativity_expanding,event_points_expanding,gameweek_goals_scored_expanding,gameweek_goals_conceded_expanding,gameweek_saves_expanding,gameweek_threat_expanding,gameweek_xG_expanding,gameweek_xA_expanding,gameweek_xGA_expanding,gameweek_minutes_expanding,gameweek_xPoints_expanding,season
1,,,,5.0,,,,,0.1,0.5,1.9,90.0,,,,,5.0,,,,,0.1,0.5,1.9,90.0,,20-21
211,0.0,2.5,0.45,3.5,0.0,1.5,0.0,2.0,0.05,0.25,2.45,90.0,1.020509,0.0,5.0,0.9,7.0,0.0,3.0,0.0,4.0,0.1,0.5,4.9,180.0,2.041019,20-21
470,0.0,5.0,1.833333,3.0,0.0,1.333333,0.0,11.333333,0.033333,0.166667,1.7,90.0,1.620432,0.0,15.0,5.5,9.0,0.0,4.0,0.0,34.0,0.1,0.5,5.1,270.0,4.861295,20-21
724,0.0,5.5,8.85,2.75,0.0,1.25,0.0,9.0,0.025,0.25,1.625,90.0,2.146237,0.0,22.0,35.4,11.0,0.0,5.0,0.0,36.0,0.1,1.0,6.5,360.0,8.584949,20-21
977,0.0,6.8,10.28,2.6,0.0,1.2,0.0,9.6,0.08,0.22,1.54,90.0,2.537873,0.0,34.0,51.4,13.0,0.0,6.0,0.0,48.0,0.4,1.1,7.7,450.0,12.689365,20-21
1231,0.0,10.914179,11.384328,3.526119,0.16791,1.007463,0.0,12.425373,0.218284,0.201493,1.343284,90.0,3.502295,0.0,65.0,67.8,21.0,1.0,6.0,0.0,74.0,1.3,1.2,8.0,536.0,20.858115,20-21
1486,0.0,10.351438,13.816294,3.306709,0.14377,1.29393,0.0,11.789137,0.186901,0.201278,1.408946,90.0,3.398912,0.0,72.0,96.1,23.0,1.0,9.0,0.0,82.0,1.3,1.4,9.8,626.0,23.641322,20-21
1734,0.0,9.553073,13.889665,3.268156,0.125698,1.131285,0.0,15.209497,0.201117,0.175978,1.546089,90.0,3.421959,0.0,76.0,110.5,26.0,1.0,9.0,0.0,121.0,1.6,1.4,12.3,716.0,27.223587,20-21
1988,0.0,8.709677,12.483871,3.126551,0.111663,1.228288,0.0,20.322581,0.223325,0.156328,1.529777,90.0,3.51405,0.0,78.0,111.8,28.0,1.0,11.0,0.0,182.0,2.0,1.4,13.7,806.0,31.470273,20-21
2238,0.0,8.4375,11.96317,3.013393,0.100446,1.305804,0.0,20.892857,0.220982,0.140625,1.426339,90.0,3.523376,0.0,84.0,119.1,30.0,1.0,13.0,0.0,208.0,2.2,1.4,14.2,896.0,35.07717,20-21


# Add xG overperfomance

In [48]:
fpl_df['xG_overperformance'] = fpl_df['gameweek_goals_scored_expanding'] / fpl_df['gameweek_xG_expanding']
# fix if division with zero
fpl_df.loc[np.isinf(fpl_df['xG_overperformance']), 'xG_overperformance'] = 1

In [49]:
# sanity check
fpl_df.loc[fpl_df.web_name=='Son', ['gameweek_goals_scored', 'gameweek_xG', 
                'gameweek_goals_scored_expanding', 'gameweek_xG_expanding', 'gameweek_minutes_expanding',
                'xG_overperformance', 'season']]

Unnamed: 0,gameweek_goals_scored,gameweek_xG,gameweek_goals_scored_expanding,gameweek_xG_expanding,gameweek_minutes_expanding,xG_overperformance,season
174,,1.3,,1.3,90.0,,20-21
421,0.0,0.1,0.0,1.4,135.0,0.0,20-21
677,2.0,0.5,2.0,1.9,207.0,1.052632,20-21
929,1.0,0.5,3.0,2.4,286.0,1.25,20-21
1185,1.0,0.3,4.0,2.7,376.0,1.481481,20-21
1440,0.0,0.2,4.0,2.9,460.0,1.37931,20-21
1685,0.0,0.5,4.0,3.4,550.0,1.176471,20-21
1941,1.0,0.4,5.0,3.8,640.0,1.315789,20-21
2191,0.0,0.0,5.0,3.8,730.0,1.315789,20-21
2417,1.0,0.0,6.0,3.8,817.0,1.578947,20-21


# Save data

In [50]:
filepath = Path('../../data/modeling/fpl_df.csv')
fpl_df.to_csv(filepath)

### 