In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [17]:
#Load raw player info
players_info = pd.read_csv("../raw_data/data/data/2019-20/players_raw.csv")
#Contains info like the position (which will be pretty important)
#element_type contains the positions 1 = GK 2 = DEF 3 = MID 4 = FWD

#load player id list
players_id_map = pd.read_csv("../raw_data/data/data/2019-20/player_idlist.csv")
players_id_map["name"] = players_id_map.apply(lambda x: x.first_name + "_" + x.second_name + "_" + str(x.id), axis = 1)
players_id_map = pd.merge(players_id_map, players_info[["id","element_type"]], left_on = "id", right_on = "id")
players_id_map.head()

Unnamed: 0,first_name,second_name,id,name,element_type
0,Shkodran,Mustafi,1,Shkodran_Mustafi_1,2
1,Héctor,Bellerín,2,Héctor_Bellerín_2,2
2,Sead,Kolasinac,3,Sead_Kolasinac_3,2
3,Ainsley,Maitland-Niles,4,Ainsley_Maitland-Niles_4,2
4,Sokratis,Papastathopoulos,5,Sokratis_Papastathopoulos_5,2


In [19]:
#Set path
player_path = "../raw_data/data/data/2019-20/players/{}/gw.csv"
gw_path = "../raw_data/data/data/2019-20/gws/{}.csv" #GWs from 1 to 47

In [36]:

#Explore player consistency across game weeks and also across home/away
all_players = list(players_id_map["name"])

all_players_df_list = [] #list of all players summary df
for name in all_players:
    print(name)
    curr_path = player_path.format(name)
    curr_player_data = pd.read_csv(curr_path)
    curr_player_data = curr_player_data.loc[curr_player_data["minutes"] > 0] #Player has to play here
    if len(curr_player_data) < 20:
        continue #Has played at least 20 matches 
    curr_player_data["yield"] = curr_player_data["total_points"]/curr_player_data["value"] #Adjust player points by cost
    
    #Gather summary stats
    home_data = curr_player_data.loc[curr_player_data["was_home"] == True]
    away_data = curr_player_data.loc[curr_player_data["was_home"] == False]
    yield_mean = curr_player_data["yield"].mean()
    yield_var = curr_player_data["yield"].var()
    home_yield_mean = home_data["yield"].mean()
    home_yield_var = home_data["yield"].var()
    away_yield_mean = away_data["yield"].mean()
    away_yield_var = away_data["yield"].var()
    
    total_points_mean = curr_player_data["total_points"].mean()
    total_points_var = curr_player_data["total_points"].var()
    home_points_mean = home_data["total_points"].mean()
    home_points_var = home_data["total_points"].var()
    away_points_mean = away_data["total_points"].mean()
    away_points_var = away_data["total_points"].var()
    
    #Get momentum stats as well
    points_corr_coef = np.corrcoef(curr_player_data["total_points"][:-1], curr_player_data["total_points"].shift(-1)[:-1])[0][1]
    
    player_summary = {"yield_mean": [yield_mean], "yield_var": [yield_var], "home_yield_mean": [home_yield_mean],
                     "home_yield_var": [home_yield_mean], "away_yield_mean": [away_yield_mean], 
                     "away_yield_var": [away_yield_var], "points_coef": [points_corr_coef],
                     "points_mean": [total_points_mean], "points_var": [total_points_var],
                     "home_points_mean": [home_points_mean], "home_points_var": [home_points_var],
                     "away_points_mean": [away_points_mean], "away_points_var": [away_points_var]}
    curr_player_df = pd.DataFrame(player_summary)
    curr_player_df.index = [name]
    all_players_df_list.append(curr_player_df)

all_players_df = pd.concat(all_players_df_list)
    
    

Shkodran_Mustafi_1
Héctor_Bellerín_2
Sead_Kolasinac_3
Ainsley_Maitland-Niles_4
Sokratis_Papastathopoulos_5
Nacho_Monreal_6
Laurent_Koscielny_7
Konstantinos_Mavropanos_8
Carl_Jenkinson_9
Rob_Holding_10
Pierre-Emerick_Aubameyang_11
Alexandre_Lacazette_12
Edward_Nketiah_13
Bernd_Leno_14
Mesut_Özil_15
Henrikh_Mkhitaryan_16
Granit_Xhaka_18
Lucas_Torreira_19
Mohamed_Elneny_20
Matteo_Guendouzi_21
David_Luiz Moreira Marinho_106
Emiliano_Martínez_427
Calum_Chambers_467
Daniel_Ceballos Fernández_469
Cédric_Soares_486
Nicolas_Pépé_488
Reiss_Nelson_489
Joseph_Willock_490
Gabriel Teodoro_Martinelli Silva_504
Kieran_Tierney_515
Bukayo_Saka_541
Emile_Smith Rowe_576
Tyreece_John-Jules_585
Pablo_Marí_617
Zech_Medley_636
Matt_Macey_646
Matthew_Smith_647
Ahmed_El Mohamady_22
James_Chester_23
Neil_Taylor_24
Kortney_Hause_25
Jonathan_Kodjia_26
Ørjan_Nyland_27
Conor_Hourihane_28
Jack_Grealish_29
Anwar_El Ghazi_30
John_McGinn_31
Andre_Green_32
Birkir_Bjarnason_33
Keinan_Davis_34
Henri_Lansbury_35
José Ignaci

Christoph_Zimmermann_275
Jamal_Lewis_276
Grant_Hanley_277
Teemu_Pukki_278
Dennis_Srbeny_279
Tim_Krul_280
Michael_McGovern_281
Mario_Vrancic_282
Emiliano_Buendía_283
Onel_Hernández_284
Marco_Stiepermann_285
Kenny_McLean_286
Todd_Cantwell_287
Moritz_Leitner_288
Alexander_Tettey_289
Tom_Trybull_290
Josip_Drmic_434
Patrick_Roberts_435
Ralf_Fahrmann_449
Sam_Byram_454
Philip_Heise_484
Ibrahim_Amadou_507
Adam_Idah_538
Archie_Mair_551
Akin_Famewo_552
Ondrej_Duda_601
Lukas_Rupp_602
Josh_Martin_632
Jordan_Thomas_650
Daniel_Adshead_669
Lys_Mousset_70
Enda_Stevens_291
Kieron_Freeman_292
Jack_O'Connell_293
George_Baldock_294
John_Egan_295
Richard_Stearman_296
John_Lundstram_297
Billy_Sharp_298
Jake_Eastwood_299
Mark_Duffy_300
John_Fleck_301
Oliver_Norwood_302
David_McGoldrick_303
Chris_Basham_423
Simon_Moore_436
Luke_Freeman_441
Phil_Jagielka_444
Callum_Robinson_453
Ravel_Morrison_456
Dean_Henderson_471
Ben_Osborn_472
Oliver_McBurnie_501
Michael_Verrips_521
Muhamed_Bešić_522
Leon_Clarke_547
Jack_Ro

In [37]:
all_players_df = pd.merge(all_players_df, players_id_map[["name", "element_type"]], left_index = True, right_on = "name")
all_players_df.head()

Unnamed: 0,yield_mean,yield_var,home_yield_mean,home_yield_var,away_yield_mean,away_yield_var,points_coef,points_mean,points_var,home_points_mean,home_points_var,away_points_mean,away_points_var,name,element_type
2,0.040105,0.001371,0.040522,0.040522,0.039618,0.001107,0.025278,2.115385,3.786154,2.142857,4.747253,2.083333,2.992424,Sead_Kolasinac_3,2
3,0.043169,0.003896,0.02631,0.02631,0.060027,0.00544,-0.182636,2.05,9.418421,1.2,4.844444,2.9,13.433333,Ainsley_Maitland-Niles_4,2
10,0.05206,0.001692,0.058913,0.058913,0.045206,0.001422,-0.19149,5.694444,20.218254,6.444444,23.437908,4.944444,16.996732,Pierre-Emerick_Aubameyang_11,4
11,0.042305,0.001193,0.058001,0.058001,0.02661,0.000539,-0.201995,3.933333,10.271264,5.4,12.114286,2.466667,4.552381,Alexandre_Lacazette_12,4
13,0.076,0.002825,0.085333,0.085333,0.066667,0.001581,-0.17361,3.8,7.062069,4.266667,10.209524,3.333333,3.952381,Bernd_Leno_14,1


In [70]:
#1 = GK 2 = DEF 3 = MID 4 = FWD
positions_dict = {}
player_stats_group = all_players_df.groupby(by = "element_type")
for position, position_summary in player_stats_group:
    position_summary.loc[:, "var_adjusted_points"] =( position_summary["points_mean"] - position_summary["points_mean"].mean())/(position_summary["points_var"]**0.5)
    positions_dict[position] = position_summary
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [71]:
#GK
gk_summary = positions_dict[1]
gk_summary = gk_summary.sort_values("var_adjusted_points", ascending = False)
gk_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].head()

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
538,Hugo_Lloris_340,0.263992,-0.31168,0.087299
480,Dean_Henderson_471,0.227722,0.061054,0.09159
145,Nick_Pope_93,0.214183,0.104823,0.09458
303,Alisson_Ramses Becker_189,0.141604,0.181795,0.069265
272,Kasper_Schmeichel_168,0.126589,-0.095015,0.07814


In [72]:
#DEF
def_summary = positions_dict[2]
def_summary = def_summary.sort_values("var_adjusted_points", ascending = False)
def_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].head()

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
295,Andrew_Robertson_181,0.706051,-0.141139,0.072074
296,Trent_Alexander-Arnold_182,0.527451,-0.041697,0.074657
638,Willy_Boly_405,0.518488,-0.197577,0.090796
297,Virgil_van Dijk_183,0.439406,-0.148372,0.072547
635,Matt_Doherty_401,0.418146,0.030027,0.075409


In [77]:
def_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].tail()

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
409,Danny_Rose_332,-0.492081,0.31449,0.038153
429,Maximillian_Aarons_274,-0.493855,-0.070257,0.042226
567,Adrian_Mariappa_356,-0.699164,0.104997,0.031033
172,Andreas_Christensen_108,-0.743209,-0.087613,0.033129
428,Ben_Godfrey_273,-1.116094,-0.099596,0.02862


In [78]:
#MID
mid_summary = positions_dict[3]
mid_summary = mid_summary.sort_values("var_adjusted_points", ascending = False)
mid_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].head(5)

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
305,Mohamed_Salah_191,0.805814,-0.123169,0.054963
306,Sadio_Mané_192,0.792842,-0.380376,0.052472
338,Kevin_De Bruyne_215,0.792109,-0.440722,0.069719
372,Anthony_Martial_239,0.722756,-0.273374,0.078914
540,Heung-Min_Son_342,0.6317,0.06131,0.05804
337,Raheem_Sterling_214,0.584377,-0.140433,0.051991
185,Christian_Pulisic_431,0.532141,0.150667,0.070473
241,Richarlison_de Andrade_150,0.510383,-0.156769,0.057146
340,Riyad_Mahrez_217,0.507353,-0.169301,0.062727
342,David_Silva_219,0.498506,-0.322567,0.069119


In [76]:
mid_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].tail()

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
377,Frederico_Rodrigues de Paula Santos_244,-1.625404,0.320755,0.032569
545,Harry_Winks_347,-1.692204,0.045455,0.031767
64,Marvelous_Nakamba_491,-1.873416,-0.072008,0.033699
440,Marco_Stiepermann_285,-2.181898,-0.2205,0.025325
217,James_McCarthy_157,-2.290261,0.049293,0.028643


In [74]:
#FWD
fwd_summary = positions_dict[4]
fwd_summary = fwd_summary.sort_values("var_adjusted_points", ascending = False)
fwd_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].head()

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
366,Marcus_Rashford_233,0.582611,-0.179344,0.065186
270,Jamie_Vardy_166,0.528548,0.102796,0.063422
642,Raúl_Jiménez_409,0.527605,0.074623,0.067072
10,Pierre-Emerick_Aubameyang_11,0.520473,-0.19149,0.05206
536,Harry_Kane_338,0.502042,-0.012446,0.049795


In [75]:
fwd_summary[["name", "var_adjusted_points", "points_coef", "yield_mean"]].tail()

Unnamed: 0,name,var_adjusted_points,points_coef,yield_mean
410,Joelinton Cássio_Apolinário de Lira_466,-0.897012,-0.131426,0.036246
205,Christian_Benteke_129,-0.929532,-0.027679,0.030457
575,Andre_Gray_364,-0.951289,-0.058723,0.029517
109,Glenn_Murray_44,-1.491435,-0.002896,0.028188
446,Josip_Drmic_434,-1.794509,-0.185257,0.023145
