In [1]:
from itertools import combinations

In [2]:
import pandas as pd
import numpy as np

In [3]:
# import results data for the current season, restricted to matches played
# on or before the snapshot date (Nov 23 2018). The URL carries no date, so
# without the cut-off a later re-run may load more matches than the outputs
# recorded in this notebook were computed from.
RESULTS_URL = 'http://www.football-data.co.uk/mmz4281/1819/E0.csv'
SNAPSHOT_DATE = pd.Timestamp('2018-11-23')  # set to None to keep every match in the file

results = pd.read_csv(RESULTS_URL)
if SNAPSHOT_DATE is not None:
    # Date is written day-first in this file (e.g. 10/08/18, 11/08/18)
    match_dates = pd.to_datetime(results['Date'], dayfirst=True)
    results = results.loc[match_dates <= SNAPSHOT_DATE].reset_index(drop=True)

In [4]:
# results.to_csv('pl_results_20181123.csv', encoding='utf-8', index=False)

In [5]:
# preview the first five rows of the raw results table
results.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,E0,10/08/18,Man United,Leicester,2,1,H,1,0,H,...,1.79,17,-0.75,1.75,1.7,2.29,2.21,1.55,4.07,7.69
1,E0,11/08/18,Bournemouth,Cardiff,2,0,H,1,0,H,...,1.83,20,-0.75,2.2,2.13,1.8,1.75,1.88,3.61,4.7
2,E0,11/08/18,Fulham,Crystal Palace,0,2,A,0,1,A,...,1.87,22,-0.25,2.18,2.11,1.81,1.77,2.62,3.38,2.9
3,E0,11/08/18,Huddersfield,Chelsea,0,3,A,0,2,A,...,1.84,23,1.0,1.84,1.8,2.13,2.06,7.24,3.95,1.58
4,E0,11/08/18,Newcastle,Tottenham,1,2,A,1,2,A,...,1.81,20,0.25,2.2,2.12,1.8,1.76,4.74,3.53,1.89


In [6]:
# Build the long-format results table: one row per team per match, holding the
# team, its opponent, the venue ('home' / 'away') and the points the team earned.
def _results_from_perspective(matches, team_col, opponent_col, venue, points_by_result):
    """Return `matches` as seen by one side: team, opponent, venue and points."""
    side = matches.assign(points=matches.FTR.map(points_by_result), venue=venue)
    side = side.rename(columns={team_col: 'team', opponent_col: 'opponent'})
    return side.loc[:, ['team', 'opponent', 'venue', 'points']]


# full-time result 'H' is a home win, 'A' an away win, 'D' a draw
home_results = _results_from_perspective(
    results, 'HomeTeam', 'AwayTeam', 'home', {'H': 3, 'A': 0, 'D': 1})
away_results = _results_from_perspective(
    results, 'AwayTeam', 'HomeTeam', 'away', {'A': 3, 'H': 0, 'D': 1})

modified_results = pd.concat([home_results, away_results], ignore_index=True, sort=False)

In [7]:
# preview the reshaped table (home-side rows come first in the concatenation)
modified_results.head()

Unnamed: 0,team,opponent,venue,points
0,Man United,Leicester,home,3
1,Bournemouth,Cardiff,home,3
2,Fulham,Crystal Palace,home,0
3,Huddersfield,Chelsea,home,0
4,Newcastle,Tottenham,home,0


In [8]:
# sorted, de-duplicated team names; both columns are scanned so that a team
# that has not yet played a home match is still included
teams = np.unique(results[['HomeTeam', 'AwayTeam']].values.ravel())

In [9]:
# display the sorted team names (these become the columns of the W matrix)
teams

array(['Arsenal', 'Bournemouth', 'Brighton', 'Burnley', 'Cardiff',
       'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield',
       'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle',
       'Southampton', 'Tottenham', 'Watford', 'West Ham', 'Wolves'],
      dtype=object)

In [10]:
# create the W matrix: one row per unordered pair of teams, one column per team,
# initialised to integer zeros.
# Passing the fill value to the constructor keeps the dtype numeric; filling an
# all-NaN object frame with fillna(0) only becomes numeric through pandas'
# implicit downcasting, which newer pandas versions deprecate.
mapping_table = pd.DataFrame(0, index=list(combinations(teams, 2)), columns=teams)

In [11]:
# Fill W: for each pair (a, b) put +1 in a's column and -1 in b's column.
# Only the index labels are iterated: the row contents are never read, and the
# frame is being modified inside the loop.
for pair in mapping_table.index:
    first_team, second_team = pair
    mapping_table.loc[pair, first_team] = 1
    mapping_table.loc[pair, second_team] = -1

In [12]:
# preview the first rows of the filled-in mapping table
mapping_table.head()

Unnamed: 0,Arsenal,Bournemouth,Brighton,Burnley,Cardiff,Chelsea,Crystal Palace,Everton,Fulham,Huddersfield,Leicester,Liverpool,Man City,Man United,Newcastle,Southampton,Tottenham,Watford,West Ham,Wolves
"(Arsenal, Bournemouth)",1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"(Arsenal, Brighton)",1,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"(Arsenal, Burnley)",1,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"(Arsenal, Cardiff)",1,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"(Arsenal, Chelsea)",1,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
def calculate_points_per_game_difference(teams, results_table=None):
    """Returns points per game difference for teams passed as input.

    The two teams are compared over their common games only: matches against
    the same opponent at the same venue, plus their head-to-head (H2H) matches.

    Parameters
    ----------
    teams : sequence of two team names, ``(team_a, team_b)``.
    results_table : DataFrame, optional
        Long-format results with columns 'team', 'opponent', 'venue'
        ('home' / 'away') and 'points'. Defaults to the notebook-level
        ``modified_results``.

    Returns
    -------
    float
        (points of team_a - points of team_b) / number of common games,
        or 0.0 when the teams have no common games.
    """
    if results_table is None:
        results_table = modified_results

    team_a, team_b = teams[0], teams[1]
    flipped_venue = {'home': 'away', 'away': 'home'}

    def _tag_head_to_head(team, rival, flip):
        """Rows of `team`, with matches against `rival` relabelled as H2H."""
        games = results_table.loc[results_table.team == team,
                                  ['opponent', 'venue', 'points']]
        is_h2h = games.opponent == rival
        # The H2H venue is expressed from team_a's point of view on both sides,
        # so each head-to-head match is paired with itself only. A single shared
        # 'H2H' label would cross-pair the two meetings (2 x 2 rows) once both
        # have been played, double-weighting head-to-head results.
        if flip:
            venue_seen_by_a = games.venue.map(lambda v: flipped_venue.get(v, v))
        else:
            venue_seen_by_a = games.venue
        return games.assign(
            opponent=games.opponent.where(~is_h2h, 'H2H'),
            venue=games.venue.where(~is_h2h, 'H2H-' + venue_seen_by_a.astype(str)))

    common_games = (_tag_head_to_head(team_a, team_b, flip=False)
                    .merge(_tag_head_to_head(team_b, team_a, flip=True),
                           how='inner',
                           on=['opponent', 'venue']))

    if common_games.empty:
        # nothing to compare: treat the pair as level
        return 0.0

    # points_x belongs to team_a, points_y to team_b (default merge suffixes)
    return ((common_games.points_x.sum() - common_games.points_y.sum())
            / common_games.shape[0])

In [14]:
# convert mapping table to a numpy array to create 'W'; the explicit float cast
# guarantees a numeric array for the least-squares solver even if the frame
# ended up with object dtype
W = mapping_table.values.astype(float)

In [15]:
# create 'r': apply the function defined above to every team pair in the
# mapping table index, then shape the results as a column vector with one
# entry per row of the mapping table
pair_differences = mapping_table.index.map(calculate_points_per_game_difference)
r = pair_differences.values.reshape(len(mapping_table), 1)

In [16]:
# least-squares fit: x minimises the sum of squared errors between W @ x and r;
# only the solution (first element of the tuple returned by lstsq) is kept
lstsq_result = np.linalg.lstsq(W, r, rcond=None)
x = lstsq_result[0]

In [17]:
# unscaled x values for all clubs, highest first
sorted(zip(teams, x), key=lambda team_and_rating: team_and_rating[1], reverse=True)

[('Man City', array([1.41821429])),
 ('Liverpool', array([1.19083333])),
 ('Tottenham', array([1.07666667])),
 ('Arsenal', array([0.83])),
 ('Chelsea', array([0.81583333])),
 ('Man United', array([0.47952381])),
 ('Everton', array([0.2372619])),
 ('Watford', array([0.15119048])),
 ('Wolves', array([0.11583333])),
 ('Bournemouth', array([0.11464286])),
 ('Leicester', array([0.0475])),
 ('Brighton', array([-0.17583333])),
 ('West Ham', array([-0.2325])),
 ('Crystal Palace', array([-0.61083333])),
 ('Newcastle', array([-0.78880952])),
 ('Huddersfield', array([-0.795])),
 ('Burnley', array([-0.82869048])),
 ('Southampton', array([-0.89869048])),
 ('Cardiff', array([-0.93297619])),
 ('Fulham', array([-1.21416667]))]

In [18]:
# sanity check: every row of W holds one +1 and one -1, so the ratings are only
# determined up to a constant shift; the sum shown below is ~0 (floating-point
# noise), i.e. the solution returned by lstsq is centred on zero
x.sum()

-3.0531133177191805e-16

In [19]:
# average points earned per team per match across the whole results table
modified_results.points.mean()

1.3875

In [20]:
# multiplier used for scaling up equals number of matches played by team;
# reindexing by the W columns ties each count to its team by name instead of
# relying on groupby's sort order matching the column order
m = (modified_results
     .groupby('team')
     .points
     .count()
     .reindex(mapping_table.columns)
     .values
     .reshape(-1, 1))

In [21]:
# shift the zero-centred x values by the overall average points per game
# (rows follow the column order of W, i.e. the order of `teams`)
x + modified_results.points.mean()

array([[2.2175    ],
       [1.50214286],
       [1.21166667],
       [0.55880952],
       [0.45452381],
       [2.20333333],
       [0.77666667],
       [1.6247619 ],
       [0.17333333],
       [0.5925    ],
       [1.435     ],
       [2.57833333],
       [2.80571429],
       [1.86702381],
       [0.59869048],
       [0.48880952],
       [2.46416667],
       [1.53869048],
       [1.155     ],
       [1.50333333]])

In [22]:
# shift x by the average points per team per game, then scale each team's
# per-game figure by the number of matches it has played
average_points_per_game = modified_results.points.mean()
adjusted_points = (x + average_points_per_game) * m

In [23]:
# adjusted points for all clubs, highest first
sorted(zip(teams, adjusted_points), key=lambda team_and_points: team_and_points[1], reverse=True)

[('Man City', array([33.66857143])),
 ('Liverpool', array([30.94])),
 ('Tottenham', array([29.57])),
 ('Arsenal', array([26.61])),
 ('Chelsea', array([26.44])),
 ('Man United', array([22.40428571])),
 ('Everton', array([19.49714286])),
 ('Watford', array([18.46428571])),
 ('Wolves', array([18.04])),
 ('Bournemouth', array([18.02571429])),
 ('Leicester', array([17.22])),
 ('Brighton', array([14.54])),
 ('West Ham', array([13.86])),
 ('Crystal Palace', array([9.32])),
 ('Newcastle', array([7.18428571])),
 ('Huddersfield', array([7.11])),
 ('Burnley', array([6.70571429])),
 ('Southampton', array([5.86571429])),
 ('Cardiff', array([5.45428571])),
 ('Fulham', array([2.08]))]