<a href="https://colab.research.google.com/github/huricane85/fpl2020_python/blob/master/Understat_over_and_under.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Clone the upstream UnderstatScraper repository into the current working directory.
# NOTE(review): no cell visible in this notebook imports from the checkout (the
# scraper code is defined inline in the next cell) -- confirm the clone is still needed.
!git clone https://github.com/doughagey/UnderstatScraper

Cloning into 'UnderstatScraper'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 34 (delta 11), reused 25 (delta 2), pack-reused 0[K
Unpacking objects: 100% (34/34), done.


In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mar 1 17:50:36 2020
@author: doug hagey
"""

import pandas as pd
import requests

def scrape_understat(payload, timeout=30):
    """POST ``payload`` to Understat's player-stats endpoint and return the player rows.

    Parameters
    ----------
    payload : dict
        Form fields sent as the request body. Callers in this notebook pass
        ``league``, ``season`` and, optionally, ``n_last_matches``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` aborts the call.
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    The value stored under ``response -> players`` in the JSON reply
    (callers feed it straight into ``pd.DataFrame``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no answer arrives within ``timeout`` seconds.
    KeyError
        If the JSON reply has no ``response`` or ``players`` key.
    """
    url = 'https://understat.com/main/getPlayersStats/'
    # Headers mimic the XHR a browser sends from the league page.
    # Deliberate differences from a verbatim browser capture:
    #   * no Content-Length: requests derives it from the encoded body, and a
    #     fixed value cannot match payloads of different sizes (the
    #     whole-season request carries fewer fields than the n-last-matches one);
    #   * a single Content-Type: the body is form-encoded (``data=dict``), so
    #     the JSON content type that used to sit next to it was contradictory;
    #   * no 'br' in Accept-Encoding: Brotli decoding depends on an optional
    #     package, and an undecodable body would break ``response.json()``;
    #   * Origin / Connection / Referer are written without stray spaces so
    #     they are valid header values.
    headers = {
        'Host': 'understat.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://understat.com',
        'Connection': 'keep-alive',
        'Referer': 'https://understat.com/league/EPL',
    }
    response = requests.post(url, data=payload, headers=headers,
                             timeout=timeout, verify=True)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    response_json = response.json()
    # The payload of interest is nested as {"response": {"players": [...]}}.
    return response_json['response']['players']

def clean_df(player_df, weeks):
    """Return a renamed copy of ``player_df`` tagged with the ``weeks`` label.

    Eight stat columns (goals, xG, assists, xA, shots, key_passes, npg, npxG)
    get ``_<weeks>`` appended; every other column keeps its name. The
    ``position`` and ``team_title`` columns are kept only when ``weeks`` is
    ``'3wks'`` and are dropped for any other label. The input frame itself is
    not modified.
    """
    stat_columns = ('goals', 'xG', 'assists', 'xA', 'shots', 'key_passes', 'npg', 'npxG')
    relabelled = player_df.rename(
        columns={stat: stat + '_' + weeks for stat in stat_columns}
    )
    # Only the 3-week table carries the descriptive columns into the merge.
    if weeks == '3wks':
        return relabelled
    return relabelled.drop(['position', 'team_title'], axis=1)

# Build one DataFrame per time window from the JSON rows returned by Understat,
# merge them per player, and write the combined table to CSV.
# League and season are named once so every request and the output file agree.
LEAGUE = 'EPL'
SEASON = '2020'
MERGE_KEYS = ['id', 'player_name']


def _label_window(frame, weeks):
    """Return a copy of ``frame`` in which every unlabelled stat column ends in ``_<weeks>``.

    ``clean_df`` tags only eight stat columns. Columns it leaves alone (for
    example games, time, yellow_cards, red_cards, xGChain, xGBuildup) carry the
    same name in every window, so merging the raw frames would emit ambiguous
    ``games_x`` / ``games_y`` / ``games`` columns. Labelling them first keeps
    each merged column traceable to its window. The merge keys and the
    descriptive ``position`` / ``team_title`` columns are left as they are.
    """
    suffix = '_' + weeks
    untouched = set(MERGE_KEYS) | {'position', 'team_title'}
    return frame.rename(
        columns=lambda col: col if col in untouched or col.endswith(suffix) else col + suffix
    )


print('Getting data for last 3 matches')
json_player_data = scrape_understat({'league': LEAGUE, 'season': SEASON, 'n_last_matches': '3'})
three_game_table = pd.DataFrame(json_player_data)
three_game_df = clean_df(three_game_table, '3wks')
# Replace position identifiers with something more useful: keep only the first
# letter of the position string and map it to a short position code.
three_game_df['position'] = three_game_df['position'].str.slice(0, 1)
position_map = {'D': 'DEF', 'F': 'FWD', 'M': 'MID', 'G': 'GK', 'S': 'FWD'}
three_game_df = three_game_df.replace({'position': position_map})


print('Getting data for the whole season')
json_player_data = scrape_understat({'league': LEAGUE, 'season': SEASON})
season_table = pd.DataFrame(json_player_data)
season_df = clean_df(season_table, 'season')

print('Getting data for last 5 matches')
json_player_data = scrape_understat({'league': LEAGUE, 'season': SEASON, 'n_last_matches': '5'})
five_game_table = pd.DataFrame(json_player_data)
five_game_df = clean_df(five_game_table, '5wks')

print('Merging Tables')
# Only the copies handed to merge are relabelled; three_game_df, season_df and
# five_game_df keep their column names for the cells that use them afterwards.
EPL_player_df = pd.merge(
    _label_window(three_game_df, '3wks'),
    _label_window(season_df, 'season'),
    on=MERGE_KEYS,
)
EPL_player_df = pd.merge(EPL_player_df, _label_window(five_game_df, '5wks'), on=MERGE_KEYS)


print('Writing CSV File')
EPL_player_df.to_csv(
    'Understat_' + LEAGUE + '_Player_Data_Combined ' + SEASON + '.csv',
    encoding='utf-8',
    index=False,
)

Getting data for last 3 matches
Getting data for the whole season
Getting data for last 5 matches
Merging Tables
Writing CSV File


In [8]:
# Display the cleaned whole-season table as the cell result
# (the rendered frame elides the middle rows with "...").
season_df

Unnamed: 0,id,player_name,games,time,goals_season,xG_season,assists_season,xA_season,shots_season,key_passes_season,yellow_cards,red_cards,npg_season,npxG_season,xGChain,xGBuildup
0,453,Son Heung-Min,5,387,7,2.5682125985622406,2,1.7257845848798752,12,15,0,0,7,2.5682125985622406,5.000036120414734,1.8340720646083355
1,5555,Dominic Calvert-Lewin,5,433,7,5.044697925448418,0,0.10603856295347214,20,1,0,0,7,5.044697925448418,4.703015625476837,1.1586278155446053
2,1250,Mohamed Salah,5,450,6,3.3498291969299316,0,1.8358780294656754,25,15,0,0,4,1.8274915516376495,5.041967034339905,2.0600571036338806
3,647,Harry Kane,5,447,5,4.570508688688278,7,3.213084191083908,25,13,0,0,4,3.809339851140976,6.383290737867355,0.7156221270561218
4,755,Jamie Vardy,4,355,5,3.8792811036109924,0,0.16242989152669907,7,2,0,0,1,0.8346057236194611,1.2137065902352333,0.2166709490120411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,8961,Rúben Dias,2,180,0,0.08490262925624847,0,0,1,0,1,0,0,0.08490262925624847,0.6797540932893753,0.6797540932893753
383,8965,Vladimir Coufal,2,180,0,0.10235214233398438,0,0.08963871002197266,1,2,1,0,0,0.10235214233398438,0.45219680666923523,0.2602059543132782
384,8966,Filip Krovinovic,2,106,0,0.03914885222911835,0,0,1,0,0,0,0,0.03914885222911835,0.16549944877624512,0.16549944877624512
385,8992,Hakim Ziyech,1,16,0,0,0,0.10719645023345947,0,1,0,0,0,0,0,0


In [9]:
# goals_season and xG_season arrive with object dtype; convert both to numeric
# types in one pass so they can be subtracted from each other.
season_df = season_df.astype({'goals_season': int, 'xG_season': float})

In [10]:
# List the dtype of every column to check the casts applied to season_df so far.
season_df.dtypes

id                    object
player_name           object
games                 object
time                  object
goals_season           int64
xG_season            float64
assists_season        object
xA_season             object
shots_season          object
key_passes_season     object
yellow_cards          object
red_cards             object
npg_season            object
npxG_season           object
xGChain               object
xGBuildup             object
dtype: object

In [11]:
# Goals scored minus xG: a positive value means the player has more goals
# than the xG figure, a negative value means fewer.
season_df['xgdifference'] = season_df['goals_season'].sub(season_df['xG_season'])

In [12]:
# Display season_df again, now including the xgdifference column.
season_df

Unnamed: 0,id,player_name,games,time,goals_season,xG_season,assists_season,xA_season,shots_season,key_passes_season,yellow_cards,red_cards,npg_season,npxG_season,xGChain,xGBuildup,xgdifference
0,453,Son Heung-Min,5,387,7,2.568213,2,1.7257845848798752,12,15,0,0,7,2.5682125985622406,5.000036120414734,1.8340720646083355,4.431787
1,5555,Dominic Calvert-Lewin,5,433,7,5.044698,0,0.10603856295347214,20,1,0,0,7,5.044697925448418,4.703015625476837,1.1586278155446053,1.955302
2,1250,Mohamed Salah,5,450,6,3.349829,0,1.8358780294656754,25,15,0,0,4,1.8274915516376495,5.041967034339905,2.0600571036338806,2.650171
3,647,Harry Kane,5,447,5,4.570509,7,3.213084191083908,25,13,0,0,4,3.809339851140976,6.383290737867355,0.7156221270561218,0.429491
4,755,Jamie Vardy,4,355,5,3.879281,0,0.16242989152669907,7,2,0,0,1,0.8346057236194611,1.2137065902352333,0.2166709490120411,1.120719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,8961,Rúben Dias,2,180,0,0.084903,0,0,1,0,1,0,0,0.08490262925624847,0.6797540932893753,0.6797540932893753,-0.084903
383,8965,Vladimir Coufal,2,180,0,0.102352,0,0.08963871002197266,1,2,1,0,0,0.10235214233398438,0.45219680666923523,0.2602059543132782,-0.102352
384,8966,Filip Krovinovic,2,106,0,0.039149,0,0,1,0,0,0,0,0.03914885222911835,0.16549944877624512,0.16549944877624512,-0.039149
385,8992,Hakim Ziyech,1,16,0,0.000000,0,0.10719645023345947,0,1,0,0,0,0,0,0,0.000000


In [13]:
# Same conversion as for goals: make the assist count and xA numeric so the
# difference between them can be computed.
season_df = season_df.astype({'assists_season': int, 'xA_season': float})

In [14]:
# Assists minus xA: a positive value means more assists than the xA figure,
# a negative value means fewer.
season_df['xAdifference'] = season_df['assists_season'].sub(season_df['xA_season'])

In [15]:
# Display season_df with both difference columns (xgdifference, xAdifference).
season_df

Unnamed: 0,id,player_name,games,time,goals_season,xG_season,assists_season,xA_season,shots_season,key_passes_season,yellow_cards,red_cards,npg_season,npxG_season,xGChain,xGBuildup,xgdifference,xAdifference
0,453,Son Heung-Min,5,387,7,2.568213,2,1.725785,12,15,0,0,7,2.5682125985622406,5.000036120414734,1.8340720646083355,4.431787,0.274215
1,5555,Dominic Calvert-Lewin,5,433,7,5.044698,0,0.106039,20,1,0,0,7,5.044697925448418,4.703015625476837,1.1586278155446053,1.955302,-0.106039
2,1250,Mohamed Salah,5,450,6,3.349829,0,1.835878,25,15,0,0,4,1.8274915516376495,5.041967034339905,2.0600571036338806,2.650171,-1.835878
3,647,Harry Kane,5,447,5,4.570509,7,3.213084,25,13,0,0,4,3.809339851140976,6.383290737867355,0.7156221270561218,0.429491,3.786916
4,755,Jamie Vardy,4,355,5,3.879281,0,0.162430,7,2,0,0,1,0.8346057236194611,1.2137065902352333,0.2166709490120411,1.120719,-0.162430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,8961,Rúben Dias,2,180,0,0.084903,0,0.000000,1,0,1,0,0,0.08490262925624847,0.6797540932893753,0.6797540932893753,-0.084903,0.000000
383,8965,Vladimir Coufal,2,180,0,0.102352,0,0.089639,1,2,1,0,0,0.10235214233398438,0.45219680666923523,0.2602059543132782,-0.102352,-0.089639
384,8966,Filip Krovinovic,2,106,0,0.039149,0,0.000000,1,0,0,0,0,0.03914885222911835,0.16549944877624512,0.16549944877624512,-0.039149,0.000000
385,8992,Hakim Ziyech,1,16,0,0.000000,0,0.107196,0,1,0,0,0,0,0,0,0.000000,-0.107196


In [16]:
# Column names of season_df as a plain Python list, used to decide what to drop next.
season_df.columns.tolist()

['id',
 'player_name',
 'games',
 'time',
 'goals_season',
 'xG_season',
 'assists_season',
 'xA_season',
 'shots_season',
 'key_passes_season',
 'yellow_cards',
 'red_cards',
 'npg_season',
 'npxG_season',
 'xGChain',
 'xGBuildup',
 'xgdifference',
 'xAdifference']

In [28]:
# Trimmed copy of season_df without the columns listed below; season_df itself
# is left unchanged.
unwanted_columns = [
    'shots_season',
    'key_passes_season',
    'yellow_cards',
    'red_cards',
    'games',
    'time',
    'id',
]
seasonedit = season_df.drop(columns=unwanted_columns)

In [31]:
# Ten largest goals-minus-xG values in the trimmed table.
# The sort is done inline because `sortXgDifference` is only assigned by a cell
# further down the notebook; referring to it here fails with NameError when the
# notebook is run top to bottom on a fresh kernel.
seasonedit.sort_values('xgdifference').tail(10)

Unnamed: 0,player_name,goals_season,xG_season,assists_season,xA_season,npg_season,npxG_season,xGChain,xGBuildup,xgdifference,xAdifference
15,James Rodríguez,3,1.652871,3,2.688169,3,1.6528714280575514,4.557198107242584,2.289410252124071,1.347129,0.311831
11,Patrick Bamford,3,1.574692,1,0.178102,3,1.5746915116906166,2.1813396960496902,0.6266722958534956,1.425308,0.821898
27,Hélder Costa,2,0.319938,1,0.133443,2,0.3199381157755852,1.1558455005288124,0.7802861519157887,1.680062,0.866557
8,Danny Ings,4,2.297998,0,0.179144,3,1.5368288680911064,1.96504345536232,0.2790215350687504,1.702002,-0.179144
25,Kurt Zouma,2,0.199481,0,0.0,2,0.1994811687618494,0.4348131790757179,0.4348131790757179,1.800519,0.0
29,Callum Robinson,2,0.179576,0,0.1531,2,0.179575752466917,0.4729029089212417,0.1402267962694168,1.820424,-0.1531
10,Jack Grealish,3,1.122298,3,2.292805,3,1.122298389673233,3.72839030623436,1.3058247417211533,1.877702,0.707195
1,Dominic Calvert-Lewin,7,5.044698,0,0.106039,7,5.044697925448418,4.703015625476837,1.1586278155446053,1.955302,-0.106039
2,Mohamed Salah,6,3.349829,0,1.835878,4,1.827491551637649,5.041967034339905,2.0600571036338806,2.650171,-1.835878
0,Son Heung-Min,7,2.568213,2,1.725785,7,2.5682125985622406,5.000036120414734,1.8340720646083355,4.431787,0.274215


In [30]:
# Order the trimmed table by goals-minus-xG, smallest (most negative) first,
# then show the first ten rows of that ordering.
sortXgDifference = seasonedit.sort_values(by='xgdifference', ascending=True)
sortXgDifference.iloc[:10]

Unnamed: 0,player_name,goals_season,xG_season,assists_season,xA_season,npg_season,npxG_season,xGChain,xGBuildup,xgdifference,xAdifference
83,Che Adams,1,2.959361,2,0.977709,1,2.9593605250120163,2.749407470226288,0.324525224044919,-1.959361,1.022291
65,Richarlison,1,2.667717,2,1.076888,0,1.9065477456897495,3.489486128091812,0.876866552978754,-1.667717,0.923112
352,Tomas Soucek,0,1.627104,0,0.321177,0,1.6271042302250862,2.4397603273391724,1.023098036646843,-1.627104,-0.321177
335,John Lundstram,0,1.307487,0,0.322004,0,0.5463180281221867,1.963067751377821,1.200004294514656,-1.307487,-0.322004
175,Georginio Wijnaldum,0,1.204184,0,0.221347,0,1.2041840814054012,2.0312648564577103,0.6543126478791237,-1.204184,-0.221347
110,Roberto Firmino,0,0.932069,2,1.042994,0,0.9320692270994186,3.4335117638111115,1.8970523476600647,-0.932069,0.957006
20,Michail Antonio,2,2.85346,0,0.337028,2,2.8534597158432007,3.5406220853328705,0.6049350053071976,-0.85346,-0.337028
363,Stuart Dallas,0,0.847682,0,0.281276,0,0.8476816415786743,1.8959107920527456,1.6517867669463158,-0.847682,-0.281276
220,Oliver McBurnie,0,0.846212,0,0.111046,0,0.8462118059396744,0.9522430524230004,0.0850220397114753,-0.846212,-0.111046
158,Willian,0,0.784416,2,1.087347,0,0.7844159603118896,2.2122746407985687,1.0770129561424255,-0.784416,0.912653


In [22]:
# Ten largest goals-minus-xG values with every season_df column kept.
# The output stored with this cell includes id, games, time, shots and card
# columns, which `sortXgDifference` (built from the trimmed `seasonedit`) does
# not contain. Sorting season_df directly makes that output reproducible on a
# top-to-bottom run.
season_df.sort_values('xgdifference').tail(10)

Unnamed: 0,id,player_name,games,time,goals_season,xG_season,assists_season,xA_season,shots_season,key_passes_season,yellow_cards,red_cards,npg_season,npxG_season,xGChain,xGBuildup,xgdifference,xAdifference
15,2249,James Rodríguez,5,431,3,1.652871,3,2.688169,12,15,1,0,3,1.6528714280575514,4.557198107242584,2.289410252124071,1.347129,0.311831
11,822,Patrick Bamford,5,406,3,1.574692,1,0.178102,14,2,1,0,3,1.5746915116906166,2.1813396960496902,0.6266722958534956,1.425308,0.821898
27,3428,Hélder Costa,5,410,2,0.319938,1,0.133443,6,2,0,0,2,0.3199381157755852,1.1558455005288124,0.7802861519157887,1.680062,0.866557
8,986,Danny Ings,5,450,4,2.297998,0,0.179144,12,3,0,0,3,1.5368288680911064,1.96504345536232,0.2790215350687504,1.702002,-0.179144
25,935,Kurt Zouma,4,360,2,0.199481,0,0.0,5,0,0,0,2,0.1994811687618494,0.4348131790757179,0.4348131790757179,1.800519,0.0
29,4476,Callum Robinson,5,295,2,0.179576,0,0.1531,4,2,0,0,2,0.179575752466917,0.4729029089212417,0.1402267962694168,1.820424,-0.1531
10,675,Jack Grealish,4,360,3,1.122298,3,2.292805,9,14,0,0,3,1.122298389673233,3.72839030623436,1.3058247417211533,1.877702,0.707195
1,5555,Dominic Calvert-Lewin,5,433,7,5.044698,0,0.106039,20,1,0,0,7,5.044697925448418,4.703015625476837,1.1586278155446053,1.955302,-0.106039
2,1250,Mohamed Salah,5,450,6,3.349829,0,1.835878,25,15,0,0,4,1.827491551637649,5.041967034339905,2.0600571036338806,2.650171,-1.835878
0,453,Son Heung-Min,5,387,7,2.568213,2,1.725785,12,15,0,0,7,2.5682125985622406,5.000036120414734,1.8340720646083355,4.431787,0.274215
