# DATA PROCESSING AND CALCULATION OF xG-ADJUSTED FPL POINTS

## Setup and pre-processing

In [1]:
# import basic libraries
import pandas as pd
import numpy as np
import json
import requests
from scipy.stats import poisson

# allow more data columns to be shown than by default
# (use the fully qualified option key: option names are matched as patterns, and the
#  short form 'max_columns' is ambiguous in newer pandas versions, which raises OptionError)
pd.set_option('display.max_columns', 100)

In [2]:
# import player data (FBref, season 19-20)
filepath = '../../data/fbref/player_stats_season19_20.csv'
# skip the first line of the file (presumably FBref's grouped header row - verify against
# the CSV) and use the first remaining column as the index
read_options = dict(index_col=0, skiprows=1)
playerStats = pd.read_csv(filepath, **read_options)
playerStats.head(5)

Unnamed: 0_level_0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,Gls,Ast,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK,G+A-PK,xG,npxG,xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA,Matches
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,Patrick van Aanholt\Patrick-van-Aanholt,nl NED,DF,Crystal Palace,28.0,1990.0,29,29,2507,3,2,1,1,0,0,0.11,0.07,0.18,0.07,0.14,2.4,1.6,3.1,0.08,0.11,0.2,0.06,0.17,Matches
2,Max Aarons\Max-Aarons,eng ENG,DF,Norwich City,19.0,2000.0,36,36,3240,0,1,0,0,7,0,0.0,0.03,0.03,0.0,0.03,0.5,0.5,3.4,0.01,0.09,0.11,0.01,0.11,Matches
3,Tammy Abraham\Tammy-Abraham,eng ENG,FW,Chelsea,21.0,1997.0,34,25,2215,15,3,0,0,2,0,0.61,0.12,0.73,0.61,0.73,14.6,14.6,2.6,0.59,0.1,0.7,0.59,0.7,Matches
4,Che Adams\Che-Adams,eng ENG,FW,Southampton,23.0,1996.0,30,12,1111,4,2,0,0,0,0,0.32,0.16,0.49,0.32,0.49,4.2,4.2,1.3,0.34,0.1,0.44,0.34,0.44,Matches
5,Adrián\Adrian,es ESP,GK,Liverpool,32.0,1987.0,11,9,875,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [4]:
# import team data and pre-process
filepath = '../../data/fbref/team_stats_season19_20_original.csv'
teamStats = pd.read_csv(filepath, index_col=0)

# change team names to match convention used in the FPL data
# (one exact-match mapping instead of one assignment per team)
teamStats['Squad'] = teamStats['Squad'].replace({
    'Brighton & Hove Albion': 'Brighton',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'Sheffield United': 'Sheffield Utd',
    'West Ham United': 'West Ham',
    'Tottenham Hotspur': 'Tottenham',
    'Wolverhampton Wanderers': 'Wolves',
})

teamStats.head()

Unnamed: 0_level_0,Squad,MP,W,D,L,GF,GA,GDiff,Pts,xG,xGA,xGDiff,xGDiff/90,Attendance,Top Team Scorer,Goalkeeper,Notes
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Liverpool,38,32,3,3,85,33,52,99,70.7,40.1,30.6,0.81,41955,Mohamed Salah - 19,Alisson,→ UEFA Champions League via league finish
2,Manchester City,38,26,3,9,102,35,67,81,93.2,34.2,59.0,1.55,37097,Raheem Sterling - 20,Ederson,→ UEFA Champions League via league finish
3,Manchester Utd,38,18,12,8,66,36,30,66,59.8,37.4,22.4,0.59,57415,Marcus Rashford Anthony Martial - 17,David de Gea,→ UEFA Champions League via league finish
4,Chelsea,38,20,6,12,69,54,15,66,66.7,37.3,29.4,0.77,32023,Tammy Abraham - 15,Kepa Arrizabalaga,→ UEFA Champions League via league finish
5,Leicester City,38,18,8,12,67,41,26,62,60.7,44.8,16.0,0.42,25312,Jamie Vardy - 23,Kasper Schmeichel,→ UEFA Europa League via league finish


In [6]:
# fetch FPL data online
# NOTE(review): this presumably returns live data, so results depend on when the cell is run
# - confirm and consider caching a snapshot for reproducibility.
# A timeout prevents the cell from hanging forever, and raise_for_status() turns an HTTP
# error into an explicit exception instead of a confusing JSON/KeyError further down.
response = requests.get('https://fantasy.premierleague.com/api/bootstrap-static/', timeout=30)
response.raise_for_status()
data = response.json()
# one row per player ('elements'), indexed by the FPL player id
df = pd.DataFrame(data['elements']).set_index('id')

# fetch FPL data from season 19-20
filepath = '../../data/fpl/data_season19_20.csv'
df19_20 = pd.read_csv(filepath, index_col=0)

display(df.head())
display(df19_20.head())

Unnamed: 0_level_0,assists,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,creativity_rank,creativity_rank_type,dreamteam_count,element_type,ep_next,ep_this,event_points,first_name,form,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,in_dreamteam,influence,influence_rank,influence_rank_type,minutes,news,news_added,now_cost,own_goals,penalties_missed,penalties_saved,photo,points_per_game,red_cards,saves,second_name,selected_by_percent,special,squad_number,status,team,team_code,threat,threat_rank,threat_rank_type,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1,3,1,256,,,5,37605,0,0,0,0,582.9,35,29,0,3,3.4,,0,Mesut,0.0,20,1,99.8,135,72,False,223.6,256,111,1439,,,70,0,0,0,37605.jpg,2.9,0,0,Özil,2.0,False,,a,1,3,190.0,175,96,53,0,0,0,0,0.0,7.6,Özil,1
2,0,5,305,,,4,39476,0,0,0,0,36.8,297,106,0,2,3.1,,0,Sokratis,0.0,25,2,58.5,235,74,False,436.2,144,53,1696,,,50,0,0,0,39476.jpg,3.0,0,0,Papastathopoulos,0.5,False,,a,1,3,110.0,225,65,57,0,0,0,0,0.0,11.4,Sokratis,6
3,1,10,494,,,8,41270,0,0,0,0,106.7,223,61,0,2,3.5,,0,David,0.0,42,2,102.1,130,35,False,701.6,50,18,2809,,,55,0,0,0,41270.jpg,2.8,2,0,Luiz Moreira Marinho,2.7,False,,a,1,3,211.0,161,32,94,0,0,0,0,0.0,17.1,David Luiz,5
4,5,37,807,,,10,54694,0,0,0,0,479.6,54,42,0,3,5.0,,0,Pierre-Emerick,0.0,44,22,285.2,11,7,False,1006.0,8,4,3136,,,120,0,0,0,54694.jpg,5.7,1,0,Aubameyang,42.2,False,,a,1,3,1369.0,9,3,205,0,0,0,0,0.0,17.1,Aubameyang,3
5,1,3,286,,,4,58822,0,0,0,0,218.9,159,34,0,2,3.1,,0,Cédric,0.0,20,1,68.7,203,59,False,349.0,182,68,1553,,,50,0,0,0,58822.jpg,2.9,0,0,Soares,0.6,False,,a,1,3,118.0,224,64,61,0,0,0,0,0.0,12.2,Cédric,1


Unnamed: 0_level_0,assists,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,creativity_rank,creativity_rank_type,dreamteam_count,element_type,ep_next,ep_this,event_points,first_name,form,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,in_dreamteam,influence,influence_rank,influence_rank_type,minutes,news,news_added,now_cost,own_goals,penalties_missed,penalties_saved,photo,points_per_game,red_cards,saves,second_name,selected_by_percent,special,squad_number,status,team,team_code,threat,threat_rank,threat_rank_type,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards,games played,adjusted points,adjusted points per game,form 5,form 10,value,valuePoints metric,team_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
1,2,2,242,0.0,0.0,4,69140,0,0,-4,4,45.5,352,122,0,2,0.0,0.0,0,Shkodran,1.5,17,0,47.9,313,106,False,277.2,257,100,1205,Hamstring injury - Expected back 31 Oct,2020-07-18T22:30:20.553834Z,51,0,0,0,69140.jpg,2.9,0,0,Mustafi,0.4,False,,i,1,3,155.0,234,58,43,25007,0,47630,0,0.3,8.4,Mustafi,2,14.827586,40.80923,2.75225,2.27491,2.515892,0.539657,1.218717,Arsenal
2,0,4,204,100.0,100.0,4,98745,0,0,-3,3,76.9,301,95,0,2,2.3,1.2,0,Héctor,0.7,18,1,37.0,339,122,False,187.8,322,124,1156,,2019-12-09T20:00:21.228098Z,52,0,0,0,98745.jpg,2.9,0,0,Bellerín,1.0,False,,a,1,3,103.0,275,81,44,206616,0,159819,0,0.1,8.5,Bellerín,2,15.172414,39.502783,2.603593,2.971828,3.060159,0.500691,1.141751,Arsenal
3,2,1,331,100.0,100.0,4,111457,0,0,-3,3,182.5,215,52,0,2,3.3,2.5,1,Sead,2.0,26,0,53.0,293,96,False,269.6,264,104,1694,,2020-02-23T18:30:13.672943Z,52,0,0,0,111457.jpg,2.1,0,0,Kolasinac,0.5,False,,a,1,3,81.0,305,98,55,65194,0,134275,0,0.4,10.6,Kolasinac,4,26.190476,66.221271,2.528449,1.919909,2.241304,0.48624,1.108798,Arsenal
4,2,3,244,100.0,100.0,3,154043,0,0,-5,5,182.0,216,53,1,2,0.8,1.0,1,Ainsley,0.5,22,0,53.6,287,94,False,301.8,241,94,1382,,2019-09-22T18:00:10.824841Z,45,0,0,0,154043.jpg,2.0,1,0,Maitland-Niles,2.2,False,,a,1,3,58.0,337,119,41,610816,0,653555,0,0.1,9.1,Maitland-Niles,4,20.5,49.506975,2.414974,2.075312,1.862118,0.536661,1.13843,Arsenal
5,0,5,305,100.0,100.0,4,39476,0,0,-2,2,36.8,365,130,1,2,0.5,0.5,0,Sokratis,0.0,25,2,58.5,275,88,False,436.2,166,62,1696,,2020-06-18T18:00:15.974146Z,48,0,0,0,39476.jpg,3.0,0,0,Papastathopoulos,1.3,False,,a,1,3,110.0,266,76,57,182201,0,231413,0,0.0,11.9,Sokratis,6,19.0,53.610076,2.821583,,3.519601,0.58783,1.287871,Arsenal


In [7]:
# assign proper team names for each player
# position k (0-based) in this array holds the name used for FPL team id k+1
# NOTE(review): the list is hard-coded, so it is only valid for the season whose team ids
# it was written for - confirm when re-running against newer data
team_names = np.array([
    'Arsenal', 'Aston Villa', 'Brighton', 'Burnley', 'Chelsea',
    'Crystal Palace', 'Everton', 'Fulham', 'Leicester City', 'Leeds',
    'Liverpool', 'Manchester City', 'Manchester Utd', 'Newcastle Utd', 'Sheffield Utd',
    'Southampton', 'Tottenham', 'West Brom', 'West Ham', 'Wolves',
], dtype=object)
# team ids are 1-based, array positions 0-based
df['team_name'] = team_names[df['team'].to_numpy() - 1]
df.head()

Unnamed: 0_level_0,assists,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,creativity_rank,creativity_rank_type,dreamteam_count,element_type,ep_next,ep_this,event_points,first_name,form,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,in_dreamteam,influence,influence_rank,influence_rank_type,minutes,news,news_added,now_cost,own_goals,penalties_missed,penalties_saved,photo,points_per_game,red_cards,saves,second_name,selected_by_percent,special,squad_number,status,team,team_code,threat,threat_rank,threat_rank_type,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards,team_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1,3,1,256,,,5,37605,0,0,0,0,582.9,35,29,0,3,3.4,,0,Mesut,0.0,20,1,99.8,135,72,False,223.6,256,111,1439,,,70,0,0,0,37605.jpg,2.9,0,0,Özil,2.0,False,,a,1,3,190.0,175,96,53,0,0,0,0,0.0,7.6,Özil,1,Arsenal
2,0,5,305,,,4,39476,0,0,0,0,36.8,297,106,0,2,3.1,,0,Sokratis,0.0,25,2,58.5,235,74,False,436.2,144,53,1696,,,50,0,0,0,39476.jpg,3.0,0,0,Papastathopoulos,0.5,False,,a,1,3,110.0,225,65,57,0,0,0,0,0.0,11.4,Sokratis,6,Arsenal
3,1,10,494,,,8,41270,0,0,0,0,106.7,223,61,0,2,3.5,,0,David,0.0,42,2,102.1,130,35,False,701.6,50,18,2809,,,55,0,0,0,41270.jpg,2.8,2,0,Luiz Moreira Marinho,2.7,False,,a,1,3,211.0,161,32,94,0,0,0,0,0.0,17.1,David Luiz,5,Arsenal
4,5,37,807,,,10,54694,0,0,0,0,479.6,54,42,0,3,5.0,,0,Pierre-Emerick,0.0,44,22,285.2,11,7,False,1006.0,8,4,3136,,,120,0,0,0,54694.jpg,5.7,1,0,Aubameyang,42.2,False,,a,1,3,1369.0,9,3,205,0,0,0,0,0.0,17.1,Aubameyang,3,Arsenal
5,1,3,286,,,4,58822,0,0,0,0,218.9,159,34,0,2,3.1,,0,Cédric,0.0,20,1,68.7,203,59,False,349.0,182,68,1553,,,50,0,0,0,58822.jpg,2.9,0,0,Soares,0.6,False,,a,1,3,118.0,224,64,61,0,0,0,0,0.0,12.2,Cédric,1,Arsenal


## Probability to keep a clean sheet

Here, we estimate for each team the probability that the team keeps a clean sheet (against average opposition). We do this by first calculating the expected goals allowed per game for each team. Then, we assume that conceding goals follows a Poisson distribution, from which we then get the desired probability.

In [8]:
# season totals -> per-match averages
matches_played = teamStats['MP']
teamStats['xG per game'] = teamStats['xG'].div(matches_played)
teamStats['xGA per game'] = teamStats['xGA'].div(matches_played)
# Poisson probability of conceding exactly 0 goals when the mean is 'xGA per game'
teamStats['probability no goals allowed'] = poisson.pmf(0, teamStats['xGA per game'])
teamStats

Unnamed: 0_level_0,Squad,MP,W,D,L,GF,GA,GDiff,Pts,xG,xGA,xGDiff,xGDiff/90,Attendance,Top Team Scorer,Goalkeeper,Notes,xG per game,xGA per game,probability no goals allowed
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Liverpool,38,32,3,3,85,33,52,99,70.7,40.1,30.6,0.81,41955,Mohamed Salah - 19,Alisson,→ UEFA Champions League via league finish,1.860526,1.055263,0.348101
2,Manchester City,38,26,3,9,102,35,67,81,93.2,34.2,59.0,1.55,37097,Raheem Sterling - 20,Ederson,→ UEFA Champions League via league finish,2.452632,0.9,0.40657
3,Manchester Utd,38,18,12,8,66,36,30,66,59.8,37.4,22.4,0.59,57415,Marcus Rashford Anthony Martial - 17,David de Gea,→ UEFA Champions League via league finish,1.573684,0.984211,0.373734
4,Chelsea,38,20,6,12,69,54,15,66,66.7,37.3,29.4,0.77,32023,Tammy Abraham - 15,Kepa Arrizabalaga,→ UEFA Champions League via league finish,1.755263,0.981579,0.374719
5,Leicester City,38,18,8,12,67,41,26,62,60.7,44.8,16.0,0.42,25312,Jamie Vardy - 23,Kasper Schmeichel,→ UEFA Europa League via league finish,1.597368,1.178947,0.307602
6,Tottenham,38,16,11,11,61,47,14,59,45.5,51.9,-6.3,-0.17,43757,Harry Kane - 18,Hugo Lloris,→ UEFA Europa League via league finish 1,1.197368,1.365789,0.255179
7,Wolves,38,15,14,9,51,40,11,59,47.0,34.4,12.5,0.33,24758,Raúl Jiménez - 17,Rui Patrício,,1.236842,0.905263,0.404435
8,Arsenal,38,14,14,10,56,48,8,56,49.1,56.1,-7.1,-0.19,47589,Pierre-Emerick Aubameyang - 22,Bernd Leno,→ UEFA Europa League via cup win 2,1.292105,1.476316,0.228478
9,Sheffield Utd,38,14,12,12,39,39,0,54,41.5,47.8,-6.3,-0.17,24370,Oliver McBurnie Lys Mousset - 6,Dean Henderson,,1.092105,1.257895,0.284252
10,Burnley,38,15,9,14,43,50,-7,54,43.7,47.8,-4.1,-0.11,15995,Chris Wood - 14,Nick Pope,,1.15,1.257895,0.284252


## xG-adjusted points

Next, we determine for each player their 'adjusted points'. To do this, we first subtract for each player all the points they have accumulated through goals, assists and clean sheets. Then, we add points for each player based on their expected goals, assists and clean sheets. This gives a much improved estimate of each player's true point generating capability. 

In [9]:
# get players this year who can be identified from last year with first and last name
# each row of name_matching: [index now, position they played last year, number_of_matched_names]
name_matching = []
for ix in df[df['minutes']>0].index:
    first_name = df.loc[ix,'first_name']
    last_name = df.loc[ix,'second_name']
    # all rows of last season's data with exactly the same first and second name
    same_name = df19_20[(df19_20['first_name']==first_name) & (df19_20['second_name']==last_name)]
    number_of_matches = same_name.shape[0]
    if number_of_matches == 0:
        # no record under this name last season: skip the player instead of failing
        # with an IndexError when reading the position of a non-existent row
        continue
    position19_20 = same_name['element_type'].values[0]
    name_matching.append([ix, position19_20, number_of_matches])

# reshape keeps the array two-dimensional even when no player could be matched
name_matching = np.array(name_matching).reshape(-1, 3)
# keep only players whose name is unique in last season's data
mask = name_matching[:,2] == 1
player_ix_list = name_matching[mask][:,0]
display(player_ix_list)
display('Total: ' + str(player_ix_list.shape[0]))

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
       478, 501,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        68,  69,  70,  71,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  95,  96,  97,
        98, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
       112, 113, 115, 116, 118, 119, 120, 121, 122, 123, 124, 232, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 146, 147, 486, 148, 149, 150, 152,
       153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166,
       168, 169, 170, 216, 217, 218, 219, 221, 222, 224, 225, 226, 228,
       229, 230, 231, 233, 234, 235, 236, 238, 239, 240, 241, 24

'Total: 382'

In [10]:
def points_for_goal(position):
    """Return the points awarded per goal for a position code.

    `position` is compared against the codes used in the 'element_type' column
    (presumably 1 = goalkeeper, 2 = defender, 3 = midfielder, 4 = forward - confirm
    against the FPL data).

    Returns 6 for codes below 3, 5 for code 3, 4 for code 4 and None for anything else.
    """
    if position < 3:
        return 6
    if position == 3:
        return 5
    return 4 if position == 4 else None

In [11]:
def points_for_clean_sheet(position):
    """Return the points awarded per clean sheet for a position code.

    `position` is compared against the codes used in the 'element_type' column
    (presumably 1 = goalkeeper, 2 = defender, 3 = midfielder, 4 = forward - confirm
    against the FPL data).

    Returns 4 for codes below 3, 1 for code 3, 0 for code 4 and None for anything else.
    """
    if position < 3:
        return 4
    if position == 3:
        return 1
    return 0 if position == 4 else None

In [12]:
# this now takes into account if player played a different position last year than now
def incorporate_xG(indicator, ix):
    """Replace a player's points from actual goals with points from expected goals.

    indicator: boolean selector into `playerStats`; the first selected row is used.
    ix:        index of the player in `df` (must also appear in column 0 of `name_matching`).

    Writes the columns 'adjusted points', 'xG' and 'xG_points' of `df` for row `ix`.
    Actual goals are valued with last season's position, expected goals with the
    current position.
    """
    stats_row = playerStats.loc[indicator]
    xG = stats_row['xG'].iloc[0]
    penalty_attempts = stats_row['PKatt'].iloc[0]

    # position codes: last season's from name_matching, the current one from df
    position19_20 = name_matching[name_matching[:, 0] == ix, 1][0]
    position_now = df.loc[ix, 'element_type']

    goal_value_then = points_for_goal(position19_20)
    goal_value_now = points_for_goal(position_now)
    # 0.24 is the probability to miss a penalty, which incurs -2 points
    penalty_miss_cost = 2 * 0.24 * penalty_attempts

    df.loc[ix, 'adjusted points'] = df.loc[ix, 'total_points'] \
        - goal_value_then * df.loc[ix, 'goals_scored'] \
        + goal_value_now * xG - penalty_miss_cost

    df.loc[ix, 'xG'] = xG
    df.loc[ix, 'xG_points'] = goal_value_now * xG - penalty_miss_cost

In [13]:
# always run 'team_xGA' AFTER 'incorporate_xG'
def team_xGA(indicator, ix):
    """Replace a player's points from actual clean sheets with expected clean-sheet points.

    indicator: unused; accepted so that all adjustment steps share the same signature.
    ix:        index of the player in `df` (must also appear in column 0 of `name_matching`).

    Reads the 'adjusted points' value already stored for `ix` (hence the required call
    order), subtracts the points for actual clean sheets valued with last season's
    position, and adds `games played` x clean-sheet probability of the player's team
    valued with the current position. Also writes the 'clean_sheet_points' column.

    Raises IndexError if the player's team has no row in `teamStats`.
    """
    team = team_names[df.loc[ix, 'team']-1]
    team_probabilities = teamStats.loc[teamStats['Squad']==team, 'probability no goals allowed'].values
    if team_probabilities.size == 0:
        # e.g. a team that is missing from the team statistics table; fail with a
        # message that names the team instead of a bare "index 0 is out of bounds"
        raise IndexError("no row for squad '{}' in teamStats; cannot estimate clean sheets "
                         "for player {}".format(team, ix))
    probability_cleanSheet = team_probabilities[0]

    mask = name_matching[:,0] == ix
    position19_20 = name_matching[mask][0,1]
    position_now = df.loc[ix, 'element_type']

    # expected clean-sheet points, computed once and reused for both columns
    expected_clean_sheet_points = points_for_clean_sheet(position_now) * \
                                  df.loc[ix, 'games played'] * probability_cleanSheet

    df.loc[ix, 'adjusted points'] =  df.loc[ix, 'adjusted points'] - \
                        points_for_clean_sheet(position19_20)*df.loc[ix, 'clean_sheets'] + \
                        expected_clean_sheet_points
    df.loc[ix, 'clean_sheet_points'] = expected_clean_sheet_points

When a player's pass leads directly to a shot, the xG value of that shot is credited to the passer as xA (expected assists), i.e. xA is a measure of 'goal assists'. In FPL, however, the definition of an assist is somewhat more relaxed; e.g. a goal scored from the rebound of a parried shot awards an assist to the player who took the initial shot. For this reason, we calculate the total number of assists awarded in FPL and the total sum of xA over all players, and from these estimate the proportion of FPL assists that xA covers. Then, for each player, we replace that proportion of their actual assists with their xA.

In [14]:
# always run 'xA' AFTER 'incorporate_xG'
def xA(indicator, ix):
    """Swap the xA-covered share of a player's assist points for expected-assist points.

    indicator: boolean selector into `playerStats`; the first selected row is used.
    ix:        index of the player in `df`.

    Reads the 'adjusted points' value already stored for `ix` (hence the required call
    order) and writes the columns 'adjusted points', 'xA' and 'xA_points' for that row.
    The factor 3 is the number of points applied per assist here.
    """
    expected_assists = playerStats.loc[indicator, 'xA'].iloc[0]
    points_so_far = df.loc[ix, 'adjusted points']
    # remove the share `xA_proportion` of actual assists, add expected assists instead
    df.loc[ix, 'adjusted points'] = points_so_far - 3 * (xA_proportion * df.loc[ix, 'assists'] - expected_assists)
    df.loc[ix, 'xA'] = expected_assists
    df.loc[ix, 'xA_points'] = 3 * expected_assists

In [15]:
def calculateAdjustedPoints(indicator, index):
    """Run all adjustment steps for one player.

    indicator: boolean selector into `playerStats` identifying the player's row.
    index:     index of the player in `df`.

    The order matters: `incorporate_xG` runs first, then `team_xGA`, then `xA`.
    """
    for adjustment_step in (incorporate_xG, team_xGA, xA):
        adjustment_step(indicator, index)

In [16]:
xA_proportion = 0.7629300776914539 # from last season

# make sure 'points_per_game' is numeric before dividing by it
df['points_per_game'] = df['points_per_game'].astype('float64')
# games played is derived as total points / points per game
# (takes into account playing less than 90 minutes per game)
df['games played'] = df['total_points'].div(df['points_per_game'])

## Main loop for assigning adjusted points

Below is the main loop where we calculate adjusted points for each player. Calculation of the adjusted points itself is straightforward, but there is some work required to match players in two different data sets. Comparing player names in both data sets gives unique matches in many cases, but some special cases need to be covered through individual solutions.

In [17]:
# FBref player names, normalised once up front (lower-case, spaces and hyphens removed).
# playerStats is not modified inside the loop, so there is no need to redo this per player.
fbref_names = playerStats['Player'].str.lower().str.replace(' ', '').str.replace('-', '')

for ix in player_ix_list:
    # player name in FPL data, normalised the same way (plus ü/ö/ä -> u/o/a)
    name = (df.loc[ix, 'web_name'].lower().replace(' ', '').replace('-', '')
            .replace('ü', 'u').replace('ö', 'o').replace('ä', 'a'))
    # find FBREF data indexes where player name contains 'name'
    # (str.contains interprets 'name' as a regular expression by default)
    indicator = fbref_names.str.contains(name)
    number_of_candidates = playerStats.loc[indicator].shape[0]

    # if unique match is found, we can calculate 'adjusted points'
    if number_of_candidates == 1:
        calculateAdjustedPoints(indicator, ix)

    # if no match is found
    elif number_of_candidates == 0:
        # try matching based on first name, helps in some cases
        first_name = (df.loc[ix, 'first_name'].lower().replace(' ', '').replace('-', '')
                      .replace('ü', 'u').replace('ö', 'o').replace('ä', 'a'))
        first_name_indicator = fbref_names.str.contains(first_name)
        # if unique match is found, we can calculate 'adjusted points'
        if playerStats.loc[first_name_indicator].shape[0] == 1:
            calculateAdjustedPoints(first_name_indicator, ix)
        # here we deal with some special cases individually
        elif name == 'rodrigo':
            # names containing 'rodri' but not 'rodriguez'
            exceptional_case_indicator = fbref_names.str.contains('rodri') & \
                                         ~fbref_names.str.contains('rodriguez')
            calculateAdjustedPoints(exceptional_case_indicator, ix)
        elif name == 'elliott':
            exceptional_case_indicator = fbref_names.str.contains('elliot')
            calculateAdjustedPoints(exceptional_case_indicator, ix)
        elif name == 'mattylongstaff':
            # matched against the lower-cased name with spaces kept
            exceptional_case_indicator = playerStats['Player'].str.lower().str.contains('matthew longstaff')
            calculateAdjustedPoints(exceptional_case_indicator, ix)
        else:
            print(str(ix) + ': no player found.')

    # if more than one match was found in the original comparison
    else:
        # try matching full name (no umlaut replacement is applied here)
        full_name = df.loc[ix, 'first_name'].lower().replace(' ', '').replace('-', '') \
            + df.loc[ix, 'second_name'].lower().replace(' ', '').replace('-', '')
        full_name_indicator = fbref_names.str.contains(full_name)
        # if unique match is found, we can calculate 'adjusted points'
        if playerStats.loc[full_name_indicator].shape[0] == 1:
            calculateAdjustedPoints(full_name_indicator, ix)
        # here we try to match players based both on their name and team
        else:
            team = team_names[df.loc[ix, 'team']-1]
            team_indicator = playerStats['Squad']==team
            if playerStats.loc[indicator & team_indicator].shape[0] == 1:
                calculateAdjustedPoints(indicator & team_indicator, ix)
            # special case
            elif name == 'son':
                exceptional_case_indicator = fbref_names.str.contains('heung')
                calculateAdjustedPoints(exceptional_case_indicator, ix)
            else:
                print(str(ix) + ': non-unique name.')

In [18]:
# adjusted points divided by the derived number of games played
df['adjusted points per game'] = df['adjusted points'].div(df['games played'])

In [19]:
# give a sorted list showing the players with highest 'adjusted points per game'
columns_to_show = ['web_name', 'games played', 'total_points', 'points_per_game',
                   'adjusted points', 'adjusted points per game']
df.loc[:, columns_to_show].sort_values('adjusted points per game', ascending=False)

Unnamed: 0_level_0,web_name,games played,total_points,points_per_game,adjusted points,adjusted points per game
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
254,Salah,33.768116,233,6.9,226.226806,6.699420
302,Fernandes,13.928571,117,8.4,90.275261,6.481301
306,Rashford,31.052632,177,5.7,200.255107,6.448893
272,De Bruyne,34.861111,251,7.2,221.771295,6.361567
276,Sterling,32.903226,204,6.2,197.484712,6.001986
251,Mané,35.079365,221,6.3,196.823253,5.610799
224,Vardy,35.000000,210,6.0,194.978468,5.570813
4,Aubameyang,35.964912,205,5.7,200.313236,5.569685
259,Alexander-Arnold,38.181818,210,5.5,195.332634,5.115855
240,Thomas,3.000000,15,5.0,15.102438,5.034146


In [21]:
# save data: player table with adjusted points, and team table with the added per-game columns
outputs = (
    (df, '../../data/fpl/data_week0.csv'),
    (teamStats, '../../data/fbref/team_stats_season19_20.csv'),
)
for frame, filepath in outputs:
    frame.to_csv(filepath)