# DATA PROCESSING AND CALCULATION OF xG-ADJUSTED FPL POINTS

## Setup and pre-processing

In [1]:
# Number of the latest FPL round (gameweek). The cells below use it to build the names of
# the input files and of the per-week result columns.
latest_gameweek = 38

In [2]:
# import basic libraries
import pandas as pd
import numpy as np
import json
import requests
from scipy.stats import poisson

# allow more data columns to be shown than by default; the fully qualified option name is
# used because the short pattern 'max_columns' can match more than one pandas option, in
# which case set_option raises an OptionError
pd.set_option('display.max_columns', 100)

In [3]:
# load the FBref player statistics saved for the latest gameweek; the first line of the
# file is skipped and the first remaining column is used as the index
filepath = f'../data/fbref/player_stats_week{latest_gameweek}.csv'
playerStats = pd.read_csv(filepath, skiprows=1, index_col=0)
playerStats.head()

Unnamed: 0_level_0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1,Matches
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1,Patrick van Aanholt\Patrick-van-Aanholt,nl NED,DF,Crystal Palace,29.0,1990.0,22,20,1777,19.7,0,1,0,0,0,2,0,0.0,0.05,0.05,0.0,0.05,1.2,1.2,0.8,2.0,0.06,0.04,0.1,0.06,0.1,Matches
2,Tammy Abraham\Tammy-Abraham,eng ENG,FW,Chelsea,22.0,1997.0,22,12,1040,11.6,6,1,6,0,0,0,0,0.52,0.09,0.61,0.52,0.61,6.4,6.4,0.9,7.3,0.56,0.07,0.63,0.56,0.63,Matches
3,Che Adams\Che-Adams,sco SCO,FW,Southampton,24.0,1996.0,36,30,2667,29.6,9,5,9,0,0,1,0,0.3,0.17,0.47,0.3,0.47,8.8,8.8,5.8,14.5,0.3,0.19,0.49,0.3,0.49,Matches
4,Tosin Adarabioyo\Tosin-Adarabioyo,eng ENG,DF,Fulham,22.0,1997.0,33,33,2953,32.8,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,1.1,1.1,0.4,1.5,0.03,0.01,0.04,0.03,0.04,Matches
5,Adrián\Adrian,es ESP,GK,Liverpool,33.0,1987.0,3,3,270,3.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [4]:
# load the FBref team table saved for the latest gameweek, indexed by team name ('Squad')
filepath = f'../data/fbref/team_stats_week{latest_gameweek}.csv'
teamStats = pd.read_csv(filepath, index_col='Squad')
teamStats.head()

Unnamed: 0_level_0,Rk,MP,W,D,L,GF,GA,GD,Pts,xG,xGA,xGD,xGD/90,Attendance,Top Team Scorer,Goalkeeper,Notes
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Manchester City,1,38,27,5,6,83,32,51,86,73.3,31.4,42.0,1.1,526,İlkay Gündoğan - 13,Ederson,→ UEFA Champions League via league finish
Manchester Utd,2,38,21,11,6,73,44,29,74,60.2,42.2,18.0,0.47,526,Bruno Fernandes - 18,David de Gea,→ UEFA Champions League via league finish
Liverpool,3,38,20,9,9,68,42,26,69,72.6,45.3,27.3,0.72,837,Mohamed Salah - 22,Alisson,→ UEFA Champions League via league finish
Chelsea,4,38,19,10,9,58,36,22,67,64.0,32.8,31.2,0.82,526,Jorginho - 7,Edouard Mendy,→ UEFA Champions League via league finish
Leicester City,5,38,20,6,12,68,50,18,66,56.0,47.7,8.3,0.22,421,Jamie Vardy - 15,Kasper Schmeichel,→ UEFA Europa League via cup win


In [5]:
# fetch FPL data online
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() reports an
# HTTP error right here instead of letting it surface later as a JSON or KeyError.
response = requests.get('https://fantasy.premierleague.com/api/bootstrap-static/', timeout=30)
response.raise_for_status()
data = response.json()
# one row per player ('elements'), indexed by the FPL player id
df = pd.DataFrame(data['elements']).set_index('id')
# this is used later
df_online=1

# fetch data locally
#df = pd.read_csv('../data/fpl/data_week' + str(latest_gameweek) + '.csv', index_col=0)
#df_online=0

df.head()

Unnamed: 0_level_0,assists,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,corners_and_indirect_freekicks_order,corners_and_indirect_freekicks_text,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,direct_freekicks_text,dreamteam_count,element_type,ep_next,ep_this,event_points,first_name,form,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,in_dreamteam,influence,influence_rank,influence_rank_type,minutes,news,news_added,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,penalties_text,photo,points_per_game,red_cards,saves,second_name,selected_by_percent,special,squad_number,status,team,team_code,threat,threat_rank,threat_rank_type,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
1,0,0,0,0.0,0.0,0,37605,,,0,0,-3,3,0.0,713,295,,,0,3,0.0,0.0,0,Mesut,0.0,0,0,0.0,713,295,False,0.0,713,295,0,Not included in Arsenal's 25-man Premier Leagu...,2020-10-20T22:30:18.118477Z,67,0,0,,0,,37605.jpg,0.0,0,0,Özil,0.5,False,,u,1,3,0.0,713,295,0,3441,0,54937,0,0.0,0.0,Özil,0
2,0,0,0,0.0,0.0,0,39476,,,0,0,-2,2,0.0,674,241,,,0,2,0.0,0.0,0,Sokratis,0.0,0,0,0.0,685,240,False,0.0,682,240,0,Left the club by mutual consent on 20/1,2020-10-21T10:30:18.546407Z,48,0,0,,0,,39476.jpg,0.0,0,0,Papastathopoulos,0.1,False,,u,1,3,0.0,667,238,0,10266,0,19142,0,0.0,0.0,Sokratis,0
3,0,0,230,100.0,50.0,3,41270,,,0,0,-1,1,46.7,346,119,4.0,,0,2,0.3,0.2,0,David,0.0,20,1,43.9,310,105,False,250.6,264,101,1396,,2021-05-02T16:00:26.134258Z,54,0,0,,0,,41270.jpg,2.0,1,0,Luiz Moreira Marinho,0.8,False,,a,1,3,144.0,244,70,41,77302,0,131045,0,0.0,7.6,David Luiz,1
4,3,11,358,100.0,100.0,10,54694,,,0,0,-7,7,361.5,101,71,,,3,3,4.5,3.0,3,Pierre-Emerick,2.5,26,10,167.4,52,31,False,493.8,136,49,2330,,2021-04-12T08:30:26.681398Z,113,1,0,1.0,0,,54694.jpg,4.5,0,0,Aubameyang,7.2,False,,a,1,3,823.0,34,18,131,1240132,0,3870817,0,0.2,11.6,Aubameyang,2
5,1,3,125,100.0,100.0,2,58822,,,0,0,-4,4,114.8,249,65,5.0,,0,2,0.5,0.5,0,Cédric,0.0,11,0,29.2,356,125,False,110.8,366,140,744,,2020-09-23T09:00:14.881983Z,46,0,0,,0,,58822.jpg,2.8,0,0,Soares,0.3,False,,a,1,3,66.0,330,110,28,41602,0,66906,0,0.0,6.1,Cédric,1


In [6]:
# assign proper team names for each player
# FPL refers to teams by a 1-based number; the names come from the FBref table, sorted
team_names = np.sort(teamStats.index)
# for some reason the fpl team numbers are not alphabetical with leeds and leicester,
# so the two entries are swapped when they sit at positions 8 and 9
if list(team_names[[8, 9]]) == ['Leeds United', 'Leicester City']:
    team_names[[8, 9]] = team_names[[9, 8]]
df['team_name'] = team_names[df['team'] - 1]

## Probability to keep a clean sheet

Here, we estimate for each team the probability that the team keeps a clean sheet (against average opposition). We do this by first calculating the expected goals allowed per game for each team. Then, we assume that conceding goals follows a Poisson distribution, from which we then get the desired probability.

In [7]:
# expected goals for and against, averaged over the matches played
teamStats['xG per game'] = teamStats['xG'].div(teamStats['MP'])
teamStats['xGA per game'] = teamStats['xGA'].div(teamStats['MP'])
# probability of conceding zero goals when goals conceded are Poisson distributed with
# mean 'xGA per game'
teamStats['probability no goals allowed'] = poisson.pmf(k=0, mu=teamStats['xGA per game'])
teamStats

Unnamed: 0_level_0,Rk,MP,W,D,L,GF,GA,GD,Pts,xG,xGA,xGD,xGD/90,Attendance,Top Team Scorer,Goalkeeper,Notes,xG per game,xGA per game,probability no goals allowed
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Manchester City,1,38,27,5,6,83,32,51,86,73.3,31.4,42.0,1.1,526,İlkay Gündoğan - 13,Ederson,→ UEFA Champions League via league finish,1.928947,0.826316,0.437659
Manchester Utd,2,38,21,11,6,73,44,29,74,60.2,42.2,18.0,0.47,526,Bruno Fernandes - 18,David de Gea,→ UEFA Champions League via league finish,1.584211,1.110526,0.329386
Liverpool,3,38,20,9,9,68,42,26,69,72.6,45.3,27.3,0.72,837,Mohamed Salah - 22,Alisson,→ UEFA Champions League via league finish,1.910526,1.192105,0.303581
Chelsea,4,38,19,10,9,58,36,22,67,64.0,32.8,31.2,0.82,526,Jorginho - 7,Edouard Mendy,→ UEFA Champions League via league finish,1.684211,0.863158,0.421828
Leicester City,5,38,20,6,12,68,50,18,66,56.0,47.7,8.3,0.22,421,Jamie Vardy - 15,Kasper Schmeichel,→ UEFA Europa League via cup win,1.473684,1.255263,0.285001
West Ham,6,38,19,8,11,62,47,15,65,53.9,48.3,5.6,0.15,632,Tomáš Souček Michail Antonio - 10,Łukasz Fabiański,→ UEFA Europa League via league finish,1.418421,1.271053,0.280536
Tottenham,7,38,18,8,12,68,45,23,62,54.6,49.5,5.0,0.13,632,Harry Kane - 23,Hugo Lloris,→ UEFA Europa Conference League via league fin...,1.436842,1.302632,0.271816
Arsenal,8,38,18,7,13,55,39,16,61,53.5,44.3,9.2,0.24,632,Alexandre Lacazette - 13,Bernd Leno,,1.407895,1.165789,0.311677
Leeds United,9,38,18,5,15,62,54,8,59,57.5,62.9,-5.4,-0.14,421,Patrick Bamford - 17,Illan Meslier,,1.513158,1.655263,0.191042
Everton,10,38,17,8,13,47,48,-1,59,47.2,51.2,-4.1,-0.11,368,Dominic Calvert-Lewin - 16,Jordan Pickford,,1.242105,1.347368,0.259923


## xG-adjusted points

Next, we determine for each player their 'adjusted points'. To do this, we first subtract for each player all the points they have accumulated through goals, assists and clean sheets. Then, we add points for each player based on their expected goals, assists and clean sheets. This gives a much improved estimate of each player's true point generating capability. 

A player who makes the pass that directly leads to a shot is credited with the xG value of that shot as xA (expected assists), i.e. xA is a measure of 'goal assists'. In FPL, however, the definition of an assist is somewhat more relaxed: for example, a goal scored from the rebound of a parried shot awards an assist to the player who took the initial shot. For this reason, we compare the total sum of xA over all players with the total number of assists awarded in FPL, which gives an estimate of the proportion of FPL assists that xA covers. Only that proportion of each player's assists is then replaced by their xA.

In [8]:
# auxiliary information and variables
# points_per_game is converted to float so that it can be used as a divisor
df['points_per_game'] = df['points_per_game'].astype(float)
# number of games is recovered from the season total and the per-game average
df['games played'] = df['total_points'].div(df['points_per_game'])
# ratio of the summed FBref xA to the summed FPL assists
xA_proportion = playerStats['xA'].sum() / df['assists'].sum()
xA_proportion

0.7429834254143646

In [9]:
# FPL data as saved for the previous gameweek; the weekly figures below are computed as
# differences against it
# NOTE(review): for gameweek 1 this reads a 'data_week0.csv' file - confirm that it exists
df_previous_week = pd.read_csv(f'../data/fpl/data_week{latest_gameweek - 1}.csv', index_col=0)
if latest_gameweek == 1:
    # nothing has been played before the first gameweek: no minutes and no xG/xA history
    df_previous_week = df_previous_week.assign(minutes=0, xG=np.nan, xA=np.nan)
df_previous_week.head()

Unnamed: 0_level_0,adjusted points,adjusted points per game,assists,assists_week1,assists_week10,assists_week11,assists_week12,assists_week13,assists_week14,assists_week15,assists_week16,assists_week17,assists_week18,assists_week19,assists_week2,assists_week20,assists_week21,assists_week22,assists_week23,assists_week24,assists_week25,assists_week26,assists_week27,assists_week28,assists_week29,assists_week3,assists_week30,assists_week31,assists_week32,assists_week33,assists_week34,assists_week35,assists_week36,assists_week37,assists_week4,assists_week5,assists_week6,assists_week7,assists_week8,assists_week9,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheet_points,clean_sheets,cleansheet_week1,cleansheet_week10,cleansheet_week11,cleansheet_week12,...,xG_week33,xG_week34,xG_week35,xG_week36,xG_week37,xG_week4,xG_week5,xG_week6,xG_week7,xG_week8,xG_week9,xPoints,xPoints week 1,xPoints week 10,xPoints week 11,xPoints week 12,xPoints week 13,xPoints week 14,xPoints week 15,xPoints week 16,xPoints week 17,xPoints week 18,xPoints week 19,xPoints week 2,xPoints week 20,xPoints week 21,xPoints week 22,xPoints week 23,xPoints week 24,xPoints week 25,xPoints week 26,xPoints week 27,xPoints week 28,xPoints week 29,xPoints week 3,xPoints week 30,xPoints week 31,xPoints week 32,xPoints week 33,xPoints week 34,xPoints week 35,xPoints week 36,xPoints week 37,xPoints week 4,xPoints week 5,xPoints week 6,xPoints week 7,xPoints week 8,xPoints week 9,yellow_cards
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
1,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0.0,0.0,,0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0.0,0.0,,0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3,54.566378,2.661775,0,,0.0,,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,,,0.0,0.0,0.0,,,,0,230,50.0,0.0,24.966378,3,,0.0,0.0,0.0,...,,0.1,,,,0.0,0.1,0.0,,,,,,0.0,,,1.0,3.397316,,,,2.471518,7.363273,1.0,3.204777,3.586388,-3.0,,4.395247,1.0,5.026123,2.807586,3.986341,1.401035,1.199148,,,,,1.6,,,,5.274923,3.586388,1.0,,,,1
4,129.617381,4.658125,3,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,351,100.0,100.0,8.472154,9,1.0,0.0,0.0,0.0,...,,1.2,0.0,,0.2,0.0,0.0,0.3,0.9,0.0,0.3,3.332871,3.818731,4.246597,3.606531,1.70657,7.132871,,,5.449329,6.67032,2.867879,11.740818,5.61799,,,1.5,1.5,15.348812,2.122456,1.5,7.201897,,2.100259,2.049787,3.010803,,,,10.5184,4.62884,,3.332871,2.818731,3.746597,4.101194,7.540818,2.765299,3.282085,2
5,33.84713,3.384713,1,,,,,0.0,,,,,,1.0,,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,,,0.0,,,,,,,,,,,,,,3,125,100.0,100.0,12.178721,2,,,,,...,,,,,,,,,,,,,,,,,1.0,,,,,,10.658585,,3.804777,2.586388,1.986388,3.090127,4.095247,,4.426123,,3.986341,,,0.0,,,,,,,,,,,,,,1


In [10]:
# team table of the previous gameweek; for the first gameweek a placeholder frame with a
# NaN 'xGA' column (one row per team name) is used instead
if latest_gameweek > 1:
    filepath = f'../data/fbref/team_stats_week{latest_gameweek - 1}.csv'
    teamStats_previous_week = pd.read_csv(filepath, index_col='Squad')
    display(teamStats_previous_week.head())
elif latest_gameweek == 1:
    teamStats_previous_week = pd.DataFrame(data=np.nan, index=team_names, columns=['xGA'])
    display(teamStats_previous_week)

Unnamed: 0_level_0,Rk,MP,W,D,L,GF,GA,GD,Pts,xG,xGA,xGD,xGD/90,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes,xG per game,xGA per game,probability no goals allowed
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Manchester City,1,37,26,5,6,78,32,46,83,70.7,30.1,40.5,1.1,W W L W L,,İlkay Gündoğan - 13,Ederson,,1.910811,0.813514,0.443298
Manchester Utd,2,37,20,11,6,71,43,28,71,58.5,41.4,17.1,0.46,D W L L D,,Bruno Fernandes - 18,David de Gea,,1.581081,1.118919,0.326633
Chelsea,3,37,19,10,8,57,34,23,67,61.6,31.8,29.8,0.8,W W W L W,111.0,Jorginho - 7,Edouard Mendy,,1.664865,0.859459,0.423391
Liverpool,4,37,19,9,9,66,42,24,66,70.8,44.6,26.2,0.71,D W W W W,333.0,Mohamed Salah - 22,Alisson,,1.913514,1.205405,0.299571
Leicester City,5,37,20,6,11,66,46,20,66,54.2,45.7,8.4,0.23,W D L W L,,Jamie Vardy - 13,Kasper Schmeichel,,1.464865,1.235135,0.290795


In [11]:
def points_for_goal(position):
    """Return the points awarded for a goal to a player of the given position code.

    `position` is the value of the FPL 'element_type' column (presumably 1-2 are
    goalkeepers/defenders, 3 midfielders and 4 forwards - confirm against the FPL API).
    Codes below 3 give 6 points, 3 gives 5 and 4 gives 4; any other code returns None.
    """
    if position < 3:
        return 6
    for element_type, points in ((3, 5), (4, 4)):
        if position == element_type:
            return points
    return None

def points_for_clean_sheet(position):
    """Return the points awarded for a clean sheet to a player of the given position code.

    `position` is the value of the FPL 'element_type' column (presumably 1-2 are
    goalkeepers/defenders, 3 midfielders and 4 forwards - confirm against the FPL API).
    Codes below 3 give 4 points, 3 gives 1 and 4 gives 0; any other code returns None.
    """
    if position < 3:
        return 4
    for element_type, points in ((3, 1), (4, 0)):
        if position == element_type:
            return points
    return None

In [12]:
# names of the per-week columns written for the latest gameweek
xPoints_column = f'xPoints week {latest_gameweek}'
xG_column = f'xG_week{latest_gameweek}'
goals_column = f'goals_week{latest_gameweek}'
xGA_column = f'xGA_week{latest_gameweek}'
cleansheet_column = f'cleansheet_week{latest_gameweek}'
xA_column = f'xA_week{latest_gameweek}'
assists_column = f'assists_week{latest_gameweek}'

In [13]:
def incorporate_xG(indicator, ix):
    """Replace one player's goal points by points based on expected goals (xG).

    Works on the module-level frames: reads playerStats and df_previous_week and writes
    the results into df in place.

    indicator -- row selector for playerStats (callers pass a boolean mask); the first
                 selected row supplies 'xG' and 'PKatt'
    ix        -- index label of the player in df

    Season columns written to df.loc[ix]: 'adjusted points', 'xG', 'xG_points'.
    If the player's minutes are higher than in df_previous_week, the weekly columns
    xG_column, goals_column and xPoints_column are written as well. A KeyError raised in
    that weekly part (e.g. the player is missing from df_previous_week) is reported with
    print instead of being propagated.
    """
    season_xG = playerStats.loc[indicator, 'xG'].iloc[0]
    penalty_attempts = playerStats.loc[indicator, 'PKatt'].iloc[0]
    goal_points = points_for_goal(df.loc[ix, 'element_type'])

    # 0.24 is the probability to miss a penalty, which incurs -2 points
    expected_penalty_loss = 2 * 0.24 * penalty_attempts

    # season totals: take out the points for actual goals, put in the points for xG
    df.loc[ix, 'adjusted points'] = df.loc[ix, 'total_points'] \
                                    - goal_points * (df.loc[ix, 'goals_scored'] - season_xG) \
                                    - expected_penalty_loss
    df.loc[ix, 'xG'] = season_xG
    df.loc[ix, 'xG_points'] = goal_points * season_xG - expected_penalty_loss

    try:
        # weekly figures are only produced for players whose minutes increased
        if not (df.loc[ix, 'minutes'] > df_previous_week.loc[ix, 'minutes']):
            return

        if np.isnan(df_previous_week.loc[ix, 'xG']):
            # no xG recorded for the previous week: the season values are this week's values
            df.loc[ix, xG_column] = season_xG
            df.loc[ix, goals_column] = df.loc[ix, 'goals_scored']
        else:
            # this week's values are the change since the previous week
            df.loc[ix, xG_column] = df.loc[ix, 'xG'] - df_previous_week.loc[ix, 'xG']
            df.loc[ix, goals_column] = df.loc[ix, 'goals_scored'] - df_previous_week.loc[ix, 'goals_scored']

        df.loc[ix, xPoints_column] = df.loc[ix, 'event_points'] \
                                     - goal_points * (df.loc[ix, goals_column] - df.loc[ix, xG_column])
    except KeyError:
        print(str(ix) + ' is a new index. (xG)')

In [14]:
# always run 'team_xGA' AFTER 'incorporate_xG'
def team_xGA(indicator, ix):
    """Replace one player's clean-sheet points by points based on the team's xGA.

    Requires df.loc[ix, 'adjusted points'] to be set already (see the comment above).
    Works on the module-level frames: reads teamStats, teamStats_previous_week and
    df_previous_week and writes the results into df in place.

    indicator -- not used; kept so that all adjustment steps share one signature
    ix        -- index label of the player in df

    Season columns written to df.loc[ix]: 'adjusted points', 'clean_sheet_points'.
    Weekly columns written: xGA_column and cleansheet_column; xPoints_column is adjusted
    only if the player's minutes grew by at least 60 since the previous week. A KeyError
    raised in the weekly part is reported with print instead of being propagated.
    """
    team = team_names[df.loc[ix, 'team'] - 1]
    cleansheet_probability = teamStats.loc[team, 'probability no goals allowed']
    cleansheet_points = points_for_clean_sheet(df.loc[ix, 'element_type'])

    # season totals: take out actual clean sheets, put in the expected number of them
    df.loc[ix, 'adjusted points'] = df.loc[ix, 'adjusted points'] - cleansheet_points * \
        (df.loc[ix, 'clean_sheets'] - cleansheet_probability * df.loc[ix, 'games played'])
    df.loc[ix, 'clean_sheet_points'] = cleansheet_points * \
        df.loc[ix, 'games played'] * cleansheet_probability

    try:
        previous_team_xGA = teamStats_previous_week.loc[team, 'xGA']
        if np.isnan(previous_team_xGA):
            # no team xGA for the previous week: the season values are this week's values
            df.loc[ix, xGA_column] = teamStats.loc[team, 'xGA']
            df.loc[ix, cleansheet_column] = df.loc[ix, 'clean_sheets']
        else:
            # this week's values are the change since the previous week
            df.loc[ix, xGA_column] = teamStats.loc[team, 'xGA'] - previous_team_xGA
            df.loc[ix, cleansheet_column] = df.loc[ix, 'clean_sheets'] - df_previous_week.loc[ix, 'clean_sheets']

        minutes_this_week = df.loc[ix, 'minutes'] - df_previous_week.loc[ix, 'minutes']
        if minutes_this_week >= 60:
            # probability of zero goals conceded given this week's team xGA
            probability_cleansheet_thisweek = poisson.pmf(0, df.loc[ix, xGA_column])
            df.loc[ix, xPoints_column] = df.loc[ix, xPoints_column] - cleansheet_points * \
                (df.loc[ix, cleansheet_column] - probability_cleansheet_thisweek)

    except KeyError:
        print(str(ix) + ' is a new index. (xGA)')

In [15]:
# always run 'xA' AFTER 'incorporate_xG'
def xA(indicator, ix):
    """Replace part of one player's assist points by points based on expected assists.

    Requires df.loc[ix, 'adjusted points'] to be set already (see the comment above).
    Only the share xA_proportion of the player's assists is exchanged for xA; each assist
    is weighted with the factor 3. Works on the module-level frames: reads playerStats
    and df_previous_week and writes the results into df in place.

    indicator -- row selector for playerStats (callers pass a boolean mask); the first
                 selected row supplies 'xA'
    ix        -- index label of the player in df

    Season columns written to df.loc[ix]: 'adjusted points', 'xA', 'xA_points'.
    If the player's minutes are higher than in df_previous_week, the weekly columns
    xA_column and assists_column are written and xPoints_column is adjusted. A KeyError
    raised in that weekly part is reported with print instead of being propagated.
    """
    season_xA = playerStats.loc[indicator, 'xA'].iloc[0]

    df.loc[ix, 'adjusted points'] = df.loc[ix, 'adjusted points'] \
                                    - 3 * (xA_proportion * df.loc[ix, 'assists'] - season_xA)
    df.loc[ix, 'xA'] = season_xA
    df.loc[ix, 'xA_points'] = 3 * season_xA

    try:
        # weekly figures are only produced for players whose minutes increased
        if not (df.loc[ix, 'minutes'] > df_previous_week.loc[ix, 'minutes']):
            return

        if np.isnan(df_previous_week.loc[ix, 'xA']):
            # no xA recorded for the previous week: the season values are this week's values
            df.loc[ix, xA_column] = season_xA
            df.loc[ix, assists_column] = df.loc[ix, 'assists']
        else:
            # this week's values are the change since the previous week
            df.loc[ix, xA_column] = df.loc[ix, 'xA'] - df_previous_week.loc[ix, 'xA']
            df.loc[ix, assists_column] = df.loc[ix, 'assists'] - df_previous_week.loc[ix, 'assists']

        df.loc[ix, xPoints_column] = df.loc[ix, xPoints_column] - \
            3 * (xA_proportion * df.loc[ix, assists_column] - df.loc[ix, xA_column])
    except KeyError:
        print(str(ix) + ' is a new index. (xA)')

In [16]:
def calculateAdjustedPoints(indicator, index):
    """Run all adjustment steps for one player.

    indicator -- row selector for the player's row in playerStats
    index     -- index label of the player in df

    The order matters: incorporate_xG creates the 'adjusted points' value that team_xGA
    and xA then modify.
    """
    for adjustment_step in (incorporate_xG, team_xGA, xA):
        adjustment_step(indicator, index)

## Main loop for assigning adjusted points

Below is the main loop where we calculate adjusted points for each player. Calculation of the adjusted points itself is straightforward, but there is some work required to match players in two different data sets. Comparing player names in both data sets gives unique matches in many cases, but some special cases need to be covered through individual solutions.

In [17]:
# FBref player names, lower-cased and with spaces and hyphens removed. playerStats is not
# modified inside the loop, so this is computed once instead of once per player.
fbref_names = playerStats['Player'].str.lower().str.replace(' ', '').str.replace('-', '')


def _normalise_name(text, fold_umlauts=True):
    """Lower-case `text` and drop spaces and hyphens; optionally map ü/ö/ä to u/o/a."""
    text = text.lower().replace(' ', '').replace('-', '')
    if fold_umlauts:
        text = text.replace('ü', 'u').replace('ö', 'o').replace('ä', 'a')
    return text


def _fbref_contains(fragment):
    """Boolean mask over playerStats: the normalised FBref name contains `fragment`.

    The fragment is matched literally (regex=False), so characters such as '.' or '(' in
    a player name cannot act as regular-expression syntax; missing names never match.
    """
    return fbref_names.str.contains(fragment, regex=False, na=False)


def _no_match_special_case(name):
    """Hand-written matcher for an FPL name that matches no FBref name, or None."""
    if name == 'rodrigo':
        return _fbref_contains('rodri') & ~_fbref_contains('rodriguez')
    if name == 'mattylongstaff':
        # the only matcher that looks at the lower-cased name with its spaces kept
        return playerStats['Player'].str.lower().str.contains('matthew longstaff', regex=False, na=False)
    fragment = {'elliott': 'elliot', "n'lundulu": 'lundulu', 'carlosvinicius': 'carlos'}.get(name)
    return None if fragment is None else _fbref_contains(fragment)


def _adjust_special_case(indicator, ix, failure_message):
    """Apply a hand-written matcher, or report the player if it selects no FBref row.

    If the matcher selects several rows, calculateAdjustedPoints uses the first of them.
    """
    if indicator is not None and indicator.any():
        calculateAdjustedPoints(indicator, ix)
    else:
        print(str(ix) + failure_message)


for ix in df[df['minutes'] > 0].index:
    # player name in FPL data
    name = _normalise_name(df.loc[ix, 'web_name'])
    # FBREF rows whose player name contains 'name'
    indicator = _fbref_contains(name)
    match_count = indicator.sum()

    # if unique match is found, we can calculate 'adjusted points'
    if match_count == 1:
        calculateAdjustedPoints(indicator, ix)

    # if no match is found
    elif match_count == 0:
        # try matching based on first name, helps in some cases
        first_name_indicator = _fbref_contains(_normalise_name(df.loc[ix, 'first_name']))
        if first_name_indicator.sum() == 1:
            calculateAdjustedPoints(first_name_indicator, ix)
        # here we deal with some special cases individually
        else:
            _adjust_special_case(_no_match_special_case(name), ix, ': no player found.')

    # if more than one match was found in the original comparison
    else:
        # try matching full name (umlauts are kept as they are here)
        full_name = _normalise_name(df.loc[ix, 'first_name'], fold_umlauts=False) \
            + _normalise_name(df.loc[ix, 'second_name'], fold_umlauts=False)
        full_name_indicator = _fbref_contains(full_name)
        if full_name_indicator.sum() == 1:
            calculateAdjustedPoints(full_name_indicator, ix)
        # here we try to match players based both on their name and team
        else:
            team_indicator = playerStats['Squad'] == team_names[df.loc[ix, 'team'] - 1]
            name_and_team_indicator = indicator & team_indicator
            if name_and_team_indicator.sum() == 1:
                calculateAdjustedPoints(name_and_team_indicator, ix)
            # special case
            elif name == 'son':
                _adjust_special_case(_fbref_contains('heung'), ix, ': non-unique name.')
            else:
                print(str(ix) + ': non-unique name.')

708 is a new index. (xG)
708 is a new index. (xGA)
708 is a new index. (xA)
712 is a new index. (xG)
712 is a new index. (xGA)
712 is a new index. (xA)
713 is a new index. (xG)
713 is a new index. (xGA)
713 is a new index. (xA)


In [18]:
df['adjusted points per game'] = df['adjusted points'] / df['games played']

# Data fetched online has no per-week columns for the earlier gameweeks, so they are
# taken over from the previous week's file. Columns that df already contains are skipped,
# which lets this cell be re-run without a "columns overlap" error from join.
if latest_gameweek > 1 and df_online == 1:
    column_prefixes = ('xPoints week ', 'xG_week', 'goals_week', 'xGA_week',
                       'cleansheet_week', 'xA_week', 'assists_week')
    column_list = [prefix + str(week)
                   for week in range(1, latest_gameweek)
                   for prefix in column_prefixes]
    missing_columns = [column for column in column_list if column not in df.columns]
    if missing_columns:
        df = df.join(df_previous_week[missing_columns])

df.head()

Unnamed: 0_level_0,assists,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,corners_and_indirect_freekicks_order,corners_and_indirect_freekicks_text,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,creativity,creativity_rank,creativity_rank_type,direct_freekicks_order,direct_freekicks_text,dreamteam_count,element_type,ep_next,ep_this,event_points,first_name,form,goals_conceded,goals_scored,ict_index,ict_index_rank,ict_index_rank_type,in_dreamteam,influence,influence_rank,influence_rank_type,minutes,news,news_added,now_cost,own_goals,penalties_missed,penalties_order,penalties_saved,penalties_text,photo,points_per_game,red_cards,saves,second_name,selected_by_percent,special,...,assists_week30,xPoints week 31,xG_week31,goals_week31,xGA_week31,cleansheet_week31,xA_week31,assists_week31,xPoints week 32,xG_week32,goals_week32,xGA_week32,cleansheet_week32,xA_week32,assists_week32,xPoints week 33,xG_week33,goals_week33,xGA_week33,cleansheet_week33,xA_week33,assists_week33,xPoints week 34,xG_week34,goals_week34,xGA_week34,cleansheet_week34,xA_week34,assists_week34,xPoints week 35,xG_week35,goals_week35,xGA_week35,cleansheet_week35,xA_week35,assists_week35,xPoints week 36,xG_week36,goals_week36,xGA_week36,cleansheet_week36,xA_week36,assists_week36,xPoints week 37,xG_week37,goals_week37,xGA_week37,cleansheet_week37,xA_week37,assists_week37
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
1,0,0,0,0.0,0.0,0,37605,,,0,0,-3,3,0.0,713,295,,,0,3,0.0,0.0,0,Mesut,0.0,0,0,0.0,713,295,False,0.0,713,295,0,Not included in Arsenal's 25-man Premier Leagu...,2020-10-20T22:30:18.118477Z,67,0,0,,0,,37605.jpg,0.0,0,0,Özil,0.5,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0,0,0,0.0,0.0,0,39476,,,0,0,-2,2,0.0,674,241,,,0,2,0.0,0.0,0,Sokratis,0.0,0,0,0.0,685,240,False,0.0,682,240,0,Left the club by mutual consent on 20/1,2020-10-21T10:30:18.546407Z,48,0,0,,0,,39476.jpg,0.0,0,0,Papastathopoulos,0.1,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0,0,230,100.0,50.0,3,41270,,,0,0,-1,1,46.7,346,119,4.0,,0,2,0.3,0.2,0,David,0.0,20,1,43.9,310,105,False,250.6,264,101,1396,,2021-05-02T16:00:26.134258Z,54,0,0,,0,,41270.jpg,2.0,1,0,Luiz Moreira Marinho,0.8,False,...,,,,,0.4,0.0,,,,,,0.9,0.0,,,,,,0.8,0.0,,,1.6,0.1,0.0,0.3,0.0,0.0,0.0,,,,3.3,0.0,,,,,,0.0,0.0,,,,,,1.1,0.0,,
4,3,11,358,100.0,100.0,10,54694,,,0,0,-7,7,361.5,101,71,,,3,3,4.5,3.0,3,Pierre-Emerick,2.5,26,10,167.4,52,31,False,493.8,136,49,2330,,2021-04-12T08:30:26.681398Z,113,1,0,1.0,0,,54694.jpg,4.5,0,0,Aubameyang,7.2,False,...,0.0,,,,0.4,0.0,,,,,,0.9,0.0,,,,,,0.8,0.0,,,10.5184,1.2,1.0,0.3,1.0,0.0,1.0,4.62884,0.0,0.0,3.3,1.0,0.6,1.0,,,,0.0,0.0,,,3.332871,0.2,0.0,1.1,0.0,0.0,0.0
5,1,3,125,100.0,100.0,2,58822,,,0,0,-4,4,114.8,249,65,5.0,,0,2,0.5,0.5,0,Cédric,0.0,11,0,29.2,356,125,False,110.8,366,140,744,,2020-09-23T09:00:14.881983Z,46,0,0,,0,,58822.jpg,2.8,0,0,Soares,0.3,False,...,0.0,,,,0.4,0.0,,,,,,0.9,0.0,,,,,,0.8,0.0,,,,,,0.3,0.0,,,,,,3.3,0.0,,,,,,0.0,0.0,,,,,,1.1,0.0,,


In [19]:
# rank the players from highest to lowest 'adjusted points per game',
# showing only the name, games played and the raw/adjusted points columns
(
    df.loc[:, ['web_name', 'games played', 'total_points', 'points_per_game',
               'adjusted points', 'adjusted points per game']]
      .sort_values(by='adjusted points per game', ascending=False)
)

Unnamed: 0_level_0,web_name,games played,total_points,points_per_game,adjusted points,adjusted points per game
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
608,Carson,1.000000,7,7.0,8.750635,8.750635
272,De Bruyne,25.178571,141,5.6,160.932219,6.391634
388,Kane,35.072464,242,6.9,221.074696,6.303369
302,Fernandes,36.969697,244,6.6,227.171980,6.144816
254,Salah,37.258065,231,6.2,228.457156,6.131751
224,Vardy,34.000000,187,5.5,184.974696,5.440432
251,Mané,35.200000,176,5.0,188.267615,5.348512
390,Son,36.774194,228,6.2,193.197344,5.253612
276,Sterling,30.800000,154,5.0,158.339337,5.140888
271,Gündogan,28.035714,157,5.6,140.894274,5.025528


In [20]:
# write the processed tables to disk, one CSV per table, tagged with the gameweek number
# (player-level frame `df` goes under data/fpl, `teamStats` under data/fbref)
filepath = '../data/fpl/data_week{}.csv'.format(latest_gameweek)
df.to_csv(filepath)

filepath = '../data/fbref/team_stats_week{}.csv'.format(latest_gameweek)
teamStats.to_csv(filepath)

Below we check how well the total xG matches the total number of goals actually scored.

In [21]:
# sanity check: compare the summed 'goals_scored' column of df with the
# summed 'xG' column of playerStats, and report their ratio
total_goals = df['goals_scored'].sum()
total_xG = playerStats['xG'].sum()
# print() applies str() to each argument; sep='' keeps the text identical to plain concatenation
print('Total goals: ', total_goals, sep='')
print('Total xG: ', total_xG, sep='')
print('goals per xG: ', total_goals / total_xG, sep='')

Total goals: 986
Total xG: 1014.4000000000001
goals per xG: 0.9720031545741324
