In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
season1 = pd.read_csv('match-by-match/leeds-matches-2021.csv')
season1.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee
0,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Away,L,3,4,Liverpool,0.3,2.7,51,,Luke Ayling,4-1-4-1,Michael Oliver
1,2020-09-16,19:45,EFL Cup,Second round,Wed,Home,D,1 (8),1 (9),Hull City,,,65,,Kiko Casilla,4-1-4-1,David Webb
2,2020-09-19,15:00,Premier League,Matchweek 2,Sat,Home,W,4,3,Fulham,1.4,1.7,51,,Liam Cooper,4-1-4-1,Anthony Taylor
3,2020-09-27,12:00,Premier League,Matchweek 3,Sun,Away,W,1,0,Sheffield Utd,1.2,1.4,64,,Liam Cooper,3-1-4-2,Paul Tierney
4,2020-10-03,17:30,Premier League,Matchweek 4,Sat,Home,D,1,1,Manchester City,2.4,1.2,51,,Liam Cooper,4-1-4-1,Mike Dean


In [3]:
season2 = pd.read_csv('match-by-match/leeds-matches-2122.csv')
season2.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee
0,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Away,L,1,5,Manchester Utd,0.6,2.5,51,72732,Liam Cooper,4-1-4-1,Paul Tierney
1,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,D,2,2,Everton,1.4,2.0,69,36293,Liam Cooper,3-3-3-1,Darren England
2,2021-08-24,19:45,EFL Cup,Second round,Tue,Home,W,3,0,Crewe Alexandra,,,73,34154,Kalvin Phillips,4-2-3-1,Ben Speedie
3,2021-08-29,14:00,Premier League,Matchweek 3,Sun,Away,D,1,1,Burnley,1.6,1.1,64,20000,Liam Cooper,3-3-3-1,Michael Oliver
4,2021-09-12,16:30,Premier League,Matchweek 4,Sun,Home,L,0,3,Liverpool,1.1,4.4,44,36507,Liam Cooper,4-1-4-1,Craig Pawson


In [4]:
def filter_res(df):
    df = df.loc[df['Comp'] == 'Premier League']
    df = df[['Date', 'Round', 'Result', 'Venue', 'GF', 'GA', 'xG', 'xGA']]
    return df

In [5]:
season1 = filter_res(season1)
season2 = filter_res(season2)

In [6]:
season1.head()

Unnamed: 0,Date,Round,Result,Venue,GF,GA,xG,xGA
0,2020-09-12,Matchweek 1,L,Away,3,4,0.3,2.7
2,2020-09-19,Matchweek 2,W,Home,4,3,1.4,1.7
3,2020-09-27,Matchweek 3,W,Away,1,0,1.2,1.4
4,2020-10-03,Matchweek 4,D,Home,1,1,2.4,1.2
5,2020-10-19,Matchweek 5,L,Home,0,1,1.0,0.4


In [7]:
season2.head()

Unnamed: 0,Date,Round,Result,Venue,GF,GA,xG,xGA
0,2021-08-14,Matchweek 1,L,Away,1,5,0.6,2.5
1,2021-08-21,Matchweek 2,D,Home,2,2,1.4,2.0
3,2021-08-29,Matchweek 3,D,Away,1,1,1.6,1.1
4,2021-09-12,Matchweek 4,L,Home,0,3,1.1,4.4
5,2021-09-17,Matchweek 5,D,Away,1,1,1.8,1.6


### Expected Points Calculator
Typically an xP (expected points) calculator is based on each individual shot taken in a game by both teams. We don't have access to this data anywhere so instead we will follow another approach based on the overall team xG that utilizes a Poisson distribution seen [here](https://github.com/kostino/ExpectedPointsCalculator/blob/master/xPoints_Barcelona-RealMadrid_example.ipynb).

In [8]:
from scipy.stats import poisson
max_goals_pl = 9 # max goals scored in PL history is 9, assumed as no higher

def xp_calculator(df):

    xGLeeds = list(df['xG'])
    xGAway = list(df['xGA'])
    LeedsxP = []
    AwayxP = []
    for j in range(len(xGLeeds)):
        gProbsLeeds = [poisson.pmf(i, xGLeeds[j]) for i in range(max_goals_pl)]
        gProbsAway = [poisson.pmf(i, xGAway[j]) for i in range(max_goals_pl)]

        resultProbs = np.matmul(np.array(gProbsLeeds).reshape(max_goals_pl, 1),np.array(gProbsAway).reshape(1, max_goals_pl))

        LeedsWP = np.tril(resultProbs).sum() - np.trace(resultProbs)
        AwayWP = np.triu(resultProbs).sum() - np.trace(resultProbs)
        DrawP = np.trace(resultProbs)

        LeedsxPVals = LeedsWP * 3 + DrawP * 1
        AwayxPVals = AwayWP * 3 + DrawP * 1
        LeedsxP.append(LeedsxPVals)
        AwayxP.append(AwayxPVals)
    df['xPLeeds'] = LeedsxP
    df['xPOpp'] = AwayxP
    return df

In [9]:
xp_s1_leeds = xp_calculator(season1)

In [10]:
xp_s2_leeds = xp_calculator(season2)
xp_s2_leeds.head()

Unnamed: 0,Date,Round,Result,Venue,GF,GA,xG,xGA,xPLeeds,xPOpp
0,2021-08-14,Matchweek 1,L,Away,1,5,0.6,2.5,0.34887,2.505102
1,2021-08-21,Matchweek 2,D,Home,2,2,1.4,2.0,1.022361,1.760305
3,2021-08-29,Matchweek 3,D,Away,1,1,1.6,1.1,1.717495,1.03345
4,2021-09-12,Matchweek 4,L,Home,0,3,1.1,4.4,0.191414,2.63574
5,2021-09-17,Matchweek 5,D,Away,1,1,1.8,1.6,1.51153,1.262684


In [14]:
#season1.to_csv('xPData/less-headers.csv', index = False)
#season2.to_csv('xPData/less-headers-2122-lee.csv', index = False)