# DATA PROCESSING AND CALCULATION OF xG-ADJUSTED FPL POINTS

## Setup and pre-processing

In [1]:
# Number of the latest FPL round (gameweek).
# It is spliced into the '..._week<N>.csv' file names that the cells below
# read from and write to, so update it before each run.
latest_gameweek = 17

In [2]:
# import basic libraries
import pandas as pd
import numpy as np
import json
import requests
from scipy.stats import poisson

# Allow more data columns to be shown than usual.
# The fully qualified option name is required: the bare 'max_columns' pattern
# also matches 'styler.render.max_columns' on recent pandas versions, which
# makes set_option raise an OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)

In [3]:
# Load the fbref player table for the current gameweek.
# The first line of the file is skipped (skiprows=1) so that the second line
# supplies the column names; the first column becomes the row index.
filepath = '../data/fbref/player_stats_week{}.csv'.format(latest_gameweek)
playerStats = pd.read_csv(
    filepath,
    index_col=0,
    skiprows=1,
)  # encoding='latin-1' was tried here previously and is left disabled
playerStats.head()

Unnamed: 0_level_0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,Gls,Ast,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK,G+A-PK,xG,npxG,xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA,Matches
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,Patrick van Aanholt\Patrick-van-Aanholt,nl NED,DF,Crystal Palace,28,1990,15,15,1289,2,0,1,1,0,0,0.14,0.0,0.14,0.07,0.07,1.3,0.6,0.4,0.09,0.03,0.12,0.04,0.07,Matches
2,Max Aarons\Max-Aarons,eng ENG,MF,Norwich City,19,2000,15,15,1350,0,0,0,0,4,0,0.0,0.0,0.0,0.0,0.0,0.3,0.3,2.5,0.02,0.17,0.19,0.02,0.19,Matches
3,Tammy Abraham\Tammy-Abraham,eng ENG,FW,Chelsea,21,1997,16,15,1233,11,3,0,0,2,0,0.8,0.22,1.02,0.8,1.02,9.0,9.0,2.2,0.66,0.16,0.82,0.66,0.82,Matches
4,Che Adams\Che-Adams,eng ENG,FW,Southampton,23,1996,12,6,488,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.5,1.5,0.2,0.27,0.04,0.31,0.27,0.31,Matches
5,Adrián\Adrian,es ESP,GK,Liverpool,32,1987,10,8,783,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [4]:
# Load the fbref team table for the current gameweek (file is read as latin-1).
filepath = '../data/fbref/team_stats_week{}.csv'.format(latest_gameweek)
teamStats = pd.read_csv(filepath, index_col=0, encoding='latin-1')

# Change team names to match the convention used in the FPL data.
# Only exact, whole-value matches are renamed; every other squad is untouched.
squad_renames = {
    'Brighton & Hove Albion': 'Brighton',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'Sheffield United': 'Sheffield Utd',
    'West Ham United': 'West Ham',
    'Tottenham Hotspur': 'Tottenham',
    'Wolverhampton Wanderers': 'Wolves',
}
teamStats['Squad'] = teamStats['Squad'].replace(squad_renames)

teamStats.head()

Unnamed: 0_level_0,Squad,MP,W,D,L,GF,GA,GDiff,Pts,xG,xGA,xGDiff,xGDiff/90,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,xGA per game,probability no goals allowed
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Liverpool,17,16,1,0,42,14,28,49,32.3,17.2,15.1,0.89,W W W W W,53043,Mohamed Salah Sadio ManÃ© - 9,AdriÃ¡n,,1.011765,0.363577
2,Leicester City,17,12,3,2,40,11,29,39,30.3,17.2,13.1,0.77,W W W W D,32027,Jamie Vardy - 16,Kasper Schmeichel,,1.011765,0.363577
3,Manchester City,17,11,2,4,47,19,28,35,44.0,16.9,27.1,1.59,W D W L W,54364,Raheem Sterling Sergio AgÃ¼ero - 9,Ederson,,0.994118,0.37005
4,Chelsea,17,9,2,6,31,25,6,29,32.4,18.6,13.8,0.81,L L W L L,40557,Tammy Abraham - 11,Kepa Arrizabalaga,,1.094118,0.334835
5,Tottenham,17,7,5,5,32,24,8,26,22.6,22.5,0.1,0.0,W W L W W,59459,Harry Kane - 9,Paulo Gazzaniga,,1.323529,0.266194


In [5]:
# Fetch FPL data online.
# A timeout keeps the notebook from hanging indefinitely on a stalled
# connection, and raise_for_status() turns an HTTP error page into an explicit
# exception instead of an opaque JSON decoding failure further down.
response = requests.get('https://fantasy.premierleague.com/api/bootstrap-static/', timeout=30)
response.raise_for_status()
data = response.json()

# One row per FPL player ('elements'), indexed by the FPL player id.
df = pd.DataFrame(data['elements']).set_index('id')

# fetch data locally
#df = pd.read_csv('../data/data_week' + str(latest_gameweek) + '.csv', index_col=0)#,encoding='latin-1')

df.head()

Unnamed: 0_level_0,assists,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,dreamteam_count,element_type,ep_next,ep_this,event_points,first_name,form,goals_conceded,goals_scored,ict_index,in_dreamteam,influence,minutes,news,news_added,now_cost,own_goals,penalties_missed,penalties_saved,photo,points_per_game,red_cards,saves,second_name,selected_by_percent,special,squad_number,status,team,team_code,threat,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
1,1,0,24,100.0,100.0,0,69140,0,0,-3,3,0.8,0,2,1.3,0.3,0,Shkodran,0.8,2,0,7.5,False,19.8,90,,2019-11-28T23:00:21.541666Z,52,0,0,0,69140.jpg,4.0,0,0,Mustafi,0.3,False,,a,1,3,54.0,4,7831,116,32097,285,0.2,0.8,Mustafi,0
2,0,0,26,0.0,75.0,0,98745,0,0,-1,1,22.7,0,2,0.0,-0.2,0,Héctor,0.2,6,0,6.9,False,30.8,262,Hamstring injury - Expected back 21 Dec,2019-12-09T20:00:21.228098Z,54,0,0,0,98745.jpg,0.3,0,0,Bellerín,0.3,False,,i,1,3,15.0,1,34609,34,33648,1577,0.0,0.2,Bellerín,2
3,2,1,196,75.0,100.0,1,111457,0,0,-3,3,149.6,0,2,1.6,1.1,0,Sead,1.6,17,0,37.8,False,174.4,928,Ankle injury - 75% chance of playing,2019-12-15T19:30:19.136195Z,52,0,0,0,111457.jpg,1.9,0,0,Kolasinac,0.6,False,,d,1,3,55.0,27,46926,335,109248,1839,0.3,5.2,Kolasinac,2
4,2,3,124,100.0,100.0,1,154043,0,0,-4,4,127.9,1,2,1.1,0.1,1,Ainsley,0.6,14,0,32.9,False,173.0,670,,2019-09-22T18:00:10.824841Z,46,0,0,0,154043.jpg,2.5,1,0,Maitland-Niles,2.6,False,,a,1,3,33.0,20,546474,920,567069,2824,0.1,4.3,Maitland-Niles,0
5,0,2,224,0.0,,2,39476,0,0,-1,1,30.7,0,2,0.0,-0.1,0,Sokratis,0.4,23,1,44.3,False,340.2,1350,Suspended until 26 Dec,2019-12-15T19:30:19.149412Z,49,0,0,0,39476.jpg,2.2,0,0,Papastathopoulos,1.4,False,,s,1,3,71.0,33,107358,290,144034,6092,0.1,6.7,Sokratis,5


## Probability to keep a clean sheet

Here, we estimate for each team the probability that it keeps a clean sheet (against average opposition). We first calculate each team's expected goals allowed (xGA) per game, $\lambda = \mathrm{xGA} / \mathrm{MP}$. We then assume that the number of goals a team concedes in a game follows a Poisson distribution with mean $\lambda$, so the desired probability is the probability of conceding zero goals, $P(0) = e^{-\lambda}$.

In [6]:
# Expected goals allowed per match played, used as the Poisson mean below.
xga_per_match = teamStats['xGA'].div(teamStats['MP'])
teamStats['xGA per game'] = xga_per_match

# Probability of conceding exactly zero goals under a Poisson model.
teamStats['probability no goals allowed'] = poisson.pmf(0, xga_per_match)
teamStats

Unnamed: 0_level_0,Squad,MP,W,D,L,GF,GA,GDiff,Pts,xG,xGA,xGDiff,xGDiff/90,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,xGA per game,probability no goals allowed
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Liverpool,17,16,1,0,42,14,28,49,32.3,17.2,15.1,0.89,W W W W W,53043,Mohamed Salah Sadio ManÃ© - 9,AdriÃ¡n,,1.011765,0.363577
2,Leicester City,17,12,3,2,40,11,29,39,30.3,17.2,13.1,0.77,W W W W D,32027,Jamie Vardy - 16,Kasper Schmeichel,,1.011765,0.363577
3,Manchester City,17,11,2,4,47,19,28,35,44.0,16.9,27.1,1.59,W D W L W,54364,Raheem Sterling Sergio AgÃ¼ero - 9,Ederson,,0.994118,0.37005
4,Chelsea,17,9,2,6,31,25,6,29,32.4,18.6,13.8,0.81,L L W L L,40557,Tammy Abraham - 11,Kepa Arrizabalaga,,1.094118,0.334835
5,Tottenham,17,7,5,5,32,24,8,26,22.6,22.5,0.1,0.0,W W L W W,59459,Harry Kane - 9,Paulo Gazzaniga,,1.323529,0.266194
6,Manchester Utd,17,6,7,4,26,20,6,25,27.7,17.3,10.5,0.62,D D W W D,72358,Marcus Rashford - 10,David de Gea,,1.017647,0.361444
7,Sheffield Utd,17,6,7,4,21,16,5,25,20.4,20.1,0.3,0.02,D D L W W,30863,Lys Mousset - 5,Dean Henderson,,1.182353,0.306557
8,Wolves,17,5,9,3,24,21,3,24,21.9,17.2,4.7,0.28,W D W D L,31237,RaÃºl JimÃ©nez - 6,Rui PatrÃ­cio,,1.011765,0.363577
9,Crystal Palace,17,6,5,6,15,19,-4,23,15.3,24.1,-8.7,-0.51,L W W D D,25013,Jordan Ayew - 4,Vicente Guaita,,1.417647,0.242283
10,Arsenal,17,5,7,5,24,27,-3,22,24.3,26.6,-2.2,-0.13,D D L W L,60269,Pierre-Emerick Aubameyang - 11,Bernd Leno,,1.564706,0.20915


## xG-adjusted points

Next, we determine for each player their 'adjusted points'. To do this, we first subtract for each player all the points they have accumulated through goals, assists and clean sheets. Then, we add points for each player based on their expected goals, assists and clean sheets. This gives a much improved estimate of each player's true point generating capability. 

In [7]:
def incorporate_xG(indicator, ix):
    """Initialise 'adjusted points' for one FPL player from goals vs. xG.

    Starting from the player's 'total_points', the points attributed to the
    difference between goals actually scored and expected goals (xG) are
    removed, using 6, 5 or 4 points per goal depending on 'element_type'.

    Parameters
    ----------
    indicator : boolean mask over the rows of the global `playerStats`;
        the xG of the first selected row is used.
    ix : index label of the player in the global `df`.

    The result is written to df.loc[ix, 'adjusted points']. Players whose
    'element_type' is none of <=2, 3 or 4 are left untouched.
    """
    expected_goals = playerStats.loc[indicator, 'xG'].values[0]
    position = df.loc[ix, 'element_type']

    # Points per goal by position code
    # (presumably 1/2 = goalkeeper/defender, 3 = midfielder, 4 = forward -- TODO confirm).
    if position <= 2:
        points_per_goal = 6
    elif position == 3:
        points_per_goal = 5
    elif position == 4:
        points_per_goal = 4
    else:
        return

    goals_above_expectation = df.loc[ix, 'goals_scored'] - expected_goals
    df.loc[ix, 'adjusted points'] = df.loc[ix, 'total_points'] - points_per_goal * goals_above_expectation

In [8]:
# always run 'team_xGA' AFTER 'incorporate_xG'
def team_xGA(indicator, ix):
    """Replace actual clean-sheet points with expected clean-sheet points.

    The player's team is looked up via `team_names` (the 'team' value in `df`
    is used as a 1-based position in that array), and the team's
    'probability no goals allowed' is read from `teamStats`. The expected
    number of clean sheets is 'games played' times that probability.

    Parameters
    ----------
    indicator : not used by this function; kept so that all adjustment
        functions share the same call signature.
    ix : index label of the player in the global `df`.

    Updates df.loc[ix, 'adjusted points'] in place, so that column must
    already hold a value. Players with 'element_type' 4 or other values above
    3 are left unchanged.
    """
    squad = team_names[df.loc[ix, 'team']-1]
    clean_sheet_probability = teamStats.loc[teamStats['Squad']==squad,
                                            'probability no goals allowed'].values[0]

    # Clean-sheet points per position code: 4 for codes <= 2, 1 for code 3.
    position = df.loc[ix, 'element_type']
    if position <= 2:
        points_per_clean_sheet = 4
    elif position == 3:
        points_per_clean_sheet = 1
    else:
        return

    clean_sheets_above_expectation = df.loc[ix, 'clean_sheets'] - \
                    df.loc[ix, 'games played']*clean_sheet_probability
    df.loc[ix, 'adjusted points'] = df.loc[ix, 'adjusted points'] - \
                    points_per_clean_sheet * clean_sheets_above_expectation

A player who makes a pass that leads directly to a shot is credited with the xG value of that shot as xA (expected assists), i.e. xA is a measure of 'goal assists'. In FPL, however, the definition of an assist is somewhat more relaxed: for example, a goal scored from the rebound of a parried shot awards an assist to the player who took the initial shot. For this reason, we calculate the total number of assists awarded in FPL and the total xA of all players, and use their ratio as an estimate of the proportion of FPL assists that xA covers. We then adjust that proportion of each player's assists based on their xA.

In [9]:
# always run 'xA' AFTER 'incorporate_xG'
def xA(indicator, ix):
    """Replace the xA-covered share of a player's assist points with expected assists.

    Parameters
    ----------
    indicator : boolean mask over the rows of the global `playerStats`;
        the xA of the first selected row is used.
    ix : index label of the player in the global `df`.

    Only the fraction `xA_proportion` of the player's FPL assists is compared
    against xA; the difference is valued at 3 points per assist and removed
    from df.loc[ix, 'adjusted points'], which must already hold a value.
    """
    expected_assists = playerStats.loc[indicator, 'xA'].values[0]
    covered_assists = xA_proportion*df.loc[ix, 'assists']
    df.loc[ix, 'adjusted points'] = df.loc[ix, 'adjusted points'] - 3 * (covered_assists - expected_assists)

In [10]:
# Alphabetically sorted squad names from the fbref player table.
# team_xGA indexes this array with the FPL 'team' value minus one.
team_names = np.sort(playerStats['Squad'].unique())

# Share of all FPL assists that is accounted for by total xA.
total_xA = playerStats['xA'].sum()
total_fpl_assists = df['assists'].sum()
xA_proportion = total_xA/total_fpl_assists

# Estimate appearances from total points and points per game
# (an alternative estimate would be df['minutes']/90.0).
df['points_per_game'] = df['points_per_game'].astype(float)
df['games played'] = df['total_points'].div(df['points_per_game'])
xA_proportion

0.7849056603773585

## Main loop for assigning adjusted points

Below is the main loop where we calculate adjusted points for each player. Calculation of the adjusted points itself is straightforward, but there is some work required to match players in two different data sets. Comparing player names in both data sets gives unique matches in many cases, but some special cases need to be covered through individual solutions.

In [11]:
def _apply_adjustments(indicator, ix):
    """Run all three adjustment steps for FPL player `ix` using the fbref row selected by `indicator`."""
    # Order matters: incorporate_xG initialises 'adjusted points',
    # the other two functions then update that value.
    incorporate_xG(indicator, ix)
    team_xGA(indicator, ix)
    xA(indicator, ix)


def _strip_separators(text):
    """Lower-case `text` and remove spaces and hyphens."""
    return text.lower().replace(' ', '').replace('-', '')


def _fold_umlauts(text):
    """Replace the umlauts ü, ö and ä by their plain vowels."""
    return text.replace('ü', 'u').replace('ö', 'o').replace('ä', 'a')


def _n_selected(indicator):
    """Number of fbref rows selected by a boolean mask."""
    return playerStats.loc[indicator].shape[0]


# fbref player names, normalised once instead of on every loop iteration.
fbref_names = playerStats['Player'].str.lower()\
                                   .str.replace(' ', '', regex=False)\
                                   .str.replace('-', '', regex=False)

# Players that cannot be matched by name are mapped by hand to their row label
# in playerStats (the 'Rk' index of this gameweek's fbref export).
# NOTE(review): these labels presumably shift when the export changes -- re-check for each new file.
unmatched_exceptions = {'rodrigo': 76, 'garcia': 380, 'chicharito': 335}
ambiguous_exceptions = {'son': 176}

for ix in df[df['minutes']>0].index:
    name = _fold_umlauts(_strip_separators(df.loc[ix, 'web_name']))
    # Names are matched literally (regex=False), so characters such as '.'
    # in a name are not interpreted as regular-expression syntax.
    indicator = fbref_names.str.contains(name, regex=False)
    n_matches = _n_selected(indicator)

    if n_matches==1:
        # unique match on the FPL web name
        _apply_adjustments(indicator, ix)

    elif n_matches==0:
        # no match on the web name: try the first name instead
        first_name = _fold_umlauts(_strip_separators(df.loc[ix, 'first_name']))
        first_name_indicator = fbref_names.str.contains(first_name, regex=False)
        if _n_selected(first_name_indicator)==1:
            _apply_adjustments(first_name_indicator, ix)
        elif name in unmatched_exceptions:
            exceptional_case_indicator = playerStats.index == unmatched_exceptions[name]
            _apply_adjustments(exceptional_case_indicator, ix)
        else:
            print(str(ix) + ': no player found.')

    else:
        # several matches on the web name: try first name + second name
        # (umlauts are deliberately left as they are here, as before)
        full_name = _strip_separators(df.loc[ix, 'first_name']) \
            + _strip_separators(df.loc[ix, 'second_name'])
        full_name_indicator = fbref_names.str.contains(full_name, regex=False)
        if _n_selected(full_name_indicator)==1:
            _apply_adjustments(full_name_indicator, ix)
        else:
            # still ambiguous: restrict the web-name matches to the player's own team
            team = team_names[df.loc[ix, 'team']-1]
            team_indicator = playerStats['Squad']==team
            if _n_selected(indicator & team_indicator)==1:
                _apply_adjustments(indicator & team_indicator, ix)
            elif name in ambiguous_exceptions:
                exceptional_case_indicator = playerStats.index == ambiguous_exceptions[name]
                _apply_adjustments(exceptional_case_indicator, ix)
            else:
                print(str(ix) + ': non-unique name.')

In [12]:
# Normalise the adjusted points by the estimated number of games played.
df['adjusted points per game'] = df['adjusted points'].div(df['games played'])

In [13]:
# Show the players ranked by 'adjusted points per game', highest first.
summary_columns = [
    'web_name',
    'games played',
    'total_points',
    'points_per_game',
    'adjusted points',
    'adjusted points per game',
]
df[summary_columns].sort_values(by='adjusted points per game', ascending=False)

Unnamed: 0_level_0,web_name,games played,total_points,points_per_game,adjusted points,adjusted points per game
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
191,Salah,14.054054,104,7.4,95.990860,6.830119
215,De Bruyne,16.111111,116,7.2,104.660027,6.496140
166,Vardy,17.051282,133,7.8,107.526415,6.306060
192,Mané,15.972222,115,7.2,100.124111,6.268640
281,McGovern,2.000000,11,5.5,12.419110,6.209555
214,Sterling,15.964912,91,5.7,98.553096,6.173106
233,Rashford,16.885246,103,6.1,103.626415,6.137098
167,Iheanacho,3.013699,22,7.3,18.190566,6.035961
460,Abraham,15.967742,99,6.2,90.535849,5.669922
210,Agüero,12.000000,72,6.0,67.735849,5.644654


In [14]:
# save data
filepath = '../data/data_week' + str(latest_gameweek) + '.csv'
df.to_csv(filepath)

# The team table is written back to the same file that the team-data import
# cell reads with encoding='latin-1'. Writing it with the default UTF-8
# encoding would make accented names (e.g. in 'Top Team Scorer') come back
# garbled, and more so with every further run, so the file is written with
# the encoding it is read with.
filepath = '../data/fbref/team_stats_week' + str(latest_gameweek) + '.csv'
teamStats.to_csv(filepath, encoding='latin-1')

Below we check how well the total xG matches the total scored goals.

In [15]:
# Total expected goals over all players in the fbref table.
total_xG = playerStats['xG'].sum()
total_xG

482.09999999999997

In [16]:
# Total goals actually scored according to the FPL data.
total_goals = df['goals_scored'].sum()
total_goals

471

In [17]:
# Ratio of total expected goals to goals actually scored (1.0 = exact agreement).
expected_goals_sum = playerStats['xG'].sum()
scored_goals_sum = df['goals_scored'].sum()
expected_goals_sum/scored_goals_sum

1.0235668789808916