Goals

Be able to access all data in S3 for a given season:
- events table
- player and team match stats
- lineups and missing players table
- odds table


Our end goal is to have player and team tables for the season, which will make building our features very easy.
That means we should have a player table of every performance in the league with VAEP, xG, rest days (have to incorporate European fixtures) and travel distance (have to manually get coordinates for stadiums).

Then wrangle the event data to get possession sequences, xG and VAEP (the sections below).

In [129]:
import boto3
from dotenv import load_dotenv
import os
import warnings
from io import StringIO
import pandas as pd
import socceraction.spadl as spadl
from tqdm import tqdm
import numpy as np
import xgboost

import sys
sys.path.append('..')
import utils
tqdm.pandas()

# Notebook display: allow up to 500 columns/rows to be rendered, and silence all warnings.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)
warnings.filterwarnings('ignore')

# AWS credentials are read from environment variables; load_dotenv() runs first
# so that values kept in a .env file are visible to the lookups below.
load_dotenv()
aws_access_key = os.environ.get('AWS_ACCESS_KEY')
aws_secret_access = os.environ.get('AWS_SECRET_ACCESS')
aws_region = os.environ.get('AWS_REGION')

# Single S3 client shared by every cell that downloads data.
s3_client_options = {
    'aws_access_key_id': aws_access_key,
    'aws_secret_access_key': aws_secret_access,
    'region_name': aws_region,
}
s3 = boto3.client('s3', **s3_client_options)

# Which bucket / competition / season this notebook works on.
bucket = 'footballbets'
league = "ENG-Premier League"
season = 2223

In [2]:
# Object keys are built from the `league` / `season` settings of the config cell,
# so changing those settings actually changes which season is loaded.
season_prefix = f'{league}/{season}'

# SPADL actions for the season, with human-readable type/result/bodypart names added.
spadl_e = s3.get_object(Bucket=bucket, Key=f'{season_prefix}/events_spadl.csv')
spadldf = pd.read_csv(StringIO(spadl_e['Body'].read().decode('utf-8')))
spadldf = spadl.add_names(spadldf)

# Season schedule (one row per fixture).
scheduler = s3.get_object(Bucket=bucket, Key=f'{season_prefix}/schedule.csv')
schedule = pd.read_csv(StringIO(scheduler['Body'].read().decode('utf-8')))

# Attach the fixture name and home team id to every action (left join keeps all actions).
fixture_info = schedule[['game', 'home_team_id', 'ws_game_id']].rename(columns={'game': 'fixture'})
spadldf = spadldf.merge(fixture_info, how='left', left_on='game_id', right_on='ws_game_id')

# Neighbouring-action context. Only the column that is needed is shifted
# (shifting the whole frame to read one column gives the same values at a higher cost).
# NOTE(review): the shift runs over the whole frame, so the first/last action of a
# match sees its neighbour from the adjacent match — confirm downstream code tolerates this.
spadldf['prevEvent'] = spadldf['type_name'].shift(1, fill_value=0)
spadldf['nextEvent'] = spadldf['type_name'].shift(-1, fill_value=0)
spadldf['nextTeamId'] = spadldf['team_id'].shift(-1, fill_value=0)

## Possession Sequences

In [None]:
import importlib
# Re-import the local utils module so edits to it are picked up without restarting the kernel
importlib.reload(utils)

In [152]:
# Run utils.get_season_possessions over the season's actions; the returned frame replaces spadldf.
# (presumably this adds the possession_chain column that the xG features group by — TODO confirm in utils)
spadldf = utils.get_season_possessions(spadldf)

100%|██████████| 380/380 [01:04<00:00,  5.91it/s]


## xG

In [35]:
import math
import numpy as np
def calculate_relative_angle(x, y):
    """Score the angular offset between the goal centre and one goalpost as seen from (x, y).

    The goal sits on the line x = 105 with its centre at y = 34 and posts at
    y = 30.34 and y = 37.66. The angle (in degrees) between the direction to
    the goal centre and the direction to one post is divided by 45 and clamped
    to the range 0..1. The post at y = 30.34 is used when y > 34, otherwise the
    post at y = 37.66.

    Args:
        x: position along the pitch length, expected in 0..105.
        y: position along the pitch width, expected in 0..68.

    Returns:
        The clamped score, or None (after printing a message) when the
        coordinates fall outside the pitch or a ValueError occurs.
    """
    goal_x, goal_center_y = 105, 34
    left_goalpost_y, right_goalpost_y = 30.34, 37.66

    try:
        on_pitch = (0 <= x <= 105) and (0 <= y <= 68)
        if not on_pitch:
            raise ValueError("Coordinates out of bounds.")

        dx = goal_x - x

        def bearing_deg(target_y):
            # Direction from (x, y) to the point (goal_x, target_y), in degrees
            return math.degrees(math.atan2(target_y - y, dx))

        reference_post_y = left_goalpost_y if y > goal_center_y else right_goalpost_y
        spread = abs(bearing_deg(reference_post_y) - bearing_deg(goal_center_y)) / 45

        # Clamp into [0, 1]
        return max(0, min(1, spread))

    except ValueError as err:
        print(f"ValueError: {err}")
        return None

def calculate_inverse(value):
    """Return 1 / value, or positive infinity when the division raises ZeroDivisionError."""
    try:
        reciprocal = 1 / value
    except ZeroDivisionError:
        # A zero input has no finite reciprocal
        reciprocal = float('inf')
    return reciprocal

In [37]:
# Classify a possession chain as 'goalkick', 'freekick', 'corner', 'counter' or 'normal'
def calculate_play_type(group):
    """Label one possession chain by how it started and how quickly it advanced.

    Args:
        group: DataFrame holding the actions of one chain, in order, with
            `type_name`, `start_x` and `time_seconds` columns.

    Returns:
        The first action's type up to its first underscore ('goalkick',
        'freekick' or 'corner') when the chain opens with a goal kick, free
        kick or corner; 'counter' when the chain starts at start_x < 55 and its
        furthest-forward action (start_x > 85) happens less than 13 seconds
        later, unless it opens with a foul; otherwise 'normal'.
    """
    restart_actions = ('goalkick', 'freekick_short', 'freekick_crossed', 'corner_crossed', 'corner_short')

    opener = group.iloc[0]
    furthest = group.loc[group['start_x'].idxmax()]
    opening_action = opener.type_name

    # Chains that open with a restart are named after it
    if opening_action in restart_actions:
        return opening_action.split('_')[0]

    # Fast break: deep start, far-forward action reached quickly, not opened by a foul
    if furthest.time_seconds - opener.time_seconds < 13:
        if furthest.start_x > 85 and opener.start_x < 55:
            if opening_action not in restart_actions + ('foul',):
                return 'counter'

    return 'normal'

def get_xg_features(spadldf):
    """Add the shot-context columns used by the xG models to a SPADL action frame.

    The frame is modified in place and also returned.

    Args:
        spadldf: DataFrame of actions with the columns `start_x`, `start_y`,
            `end_x`, `end_y`, `type_name`, `prevEvent`, `nextEvent`,
            `bodypart_name`, `result_name` and `possession_chain`.

    Returns:
        The same DataFrame with these columns added: `xG_distance`, `xG_angle`,
        `xG_distance_inv`, `xG_angle_inv`, `dist_ang_inv`, `pass_d`,
        `is_forward`, `pass_angle`, `is_wide_area`, `is_cutback`,
        `is_throughball`, `after_dribble`, `is_corner`, `is_cross`,
        `is_freekick`, `body`, `goal` and `play_type`.
    """
    # xG Distance and Angle (goal centre is at x=105, y=34)
    spadldf['xG_distance'] = np.sqrt((spadldf['start_x'] - 105)**2 + (spadldf['start_y'] - 34)**2)
    spadldf['xG_angle'] = spadldf.apply(lambda x: calculate_relative_angle(x.start_x, x.start_y), axis=1)
    spadldf['xG_distance_inv'] = 1 / spadldf.xG_distance
    spadldf['xG_angle_inv'] = 1 / spadldf.xG_angle
    spadldf['dist_ang_inv'] = 1 / (spadldf.xG_distance * spadldf.xG_angle)

    # Geometry of each action, used to recognise the pass that set up a shot
    spadldf['pass_d'] = np.sqrt((spadldf['start_x'] - spadldf['end_x'])**2 + (spadldf['start_y'] - spadldf['end_y'])**2)
    spadldf['is_forward'] = spadldf.start_x < spadldf.end_x
    spadldf['pass_angle'] = np.degrees(np.arctan2(spadldf['end_y'] - spadldf['start_y'], spadldf['end_x'] - spadldf['start_x']))
    spadldf['is_wide_area'] = ((spadldf['start_y'] < 20) | (spadldf['start_y'] > 48)) & (spadldf['start_x'] > 88)

    # Both conditions describe a PASS that is immediately followed by a shot
    through_conditions = (spadldf['is_forward']) & (spadldf['pass_d'] > 15) & (abs(spadldf['pass_angle']) < 45) & (spadldf['type_name'] == 'pass') & (spadldf['nextEvent'] == 'shot')
    cutback_conditions = (~spadldf['is_forward']) & (spadldf['is_wide_area']) & (4 < spadldf['pass_d']) & (spadldf['pass_d'] < 20) & (abs(spadldf['pass_angle']) > 30) & (spadldf['type_name'] == 'pass') & (spadldf['nextEvent'] == 'shot')

    # Move each flag one row down so it sits on the following action (the shot).
    # Shifting the condition Series itself keeps the frame's own index, so the
    # assignment aligns for any index, and fill_value keeps the column boolean
    # instead of leaving a NaN in the first row.
    spadldf['is_cutback'] = cutback_conditions.shift(1, fill_value=False)
    spadldf['is_throughball'] = through_conditions.shift(1, fill_value=False)

    # Flags describing the action right before an open-play shot
    is_shot = spadldf['type_name'] == 'shot'
    spadldf['after_dribble'] = is_shot & spadldf['prevEvent'].isin(['dribble', 'takeon'])
    # Each flag is named after the previous event it tests for
    spadldf['is_corner'] = is_shot & (spadldf['prevEvent'] == 'corner_crossed')
    spadldf['is_cross'] = is_shot & (spadldf['prevEvent'] == 'cross')
    spadldf['is_freekick'] = is_shot & (spadldf['prevEvent'] == 'freekick_crossed')

    # Body part used (anything that is not a foot is labelled 'header') and whether a goal was scored
    spadldf['body'] = np.where((spadldf.bodypart_name.isin(['foot', 'foot_right', 'foot_left'])), 'foot', 'header')
    spadldf['goal'] = is_shot & (spadldf.result_name == 'success')

    # Takes long time: calculate_play_type runs once per possession chain and
    # its label is broadcast to every action of that chain
    spadldf['play_type'] = spadldf.groupby('possession_chain')['start_x'].transform(lambda x: calculate_play_type(spadldf.loc[x.index]))

    return spadldf


In [90]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

scaler = MinMaxScaler()

# xG given to every penalty instead of a model prediction
PENALTY_XG = .76

# Split into foot and header shots / Get Dummies / Normalize Values
num_cols = ['xG_distance', 'xG_angle', 'xG_distance_inv', 'xG_angle_inv', 'dist_ang_inv']
bool_cols = ['is_corner', 'is_cross', 'is_freekick', 'is_cutback', 'is_throughball', 'after_dribble', 'play_type', 'goal']

# Fresh-kernel guard: build the shot features if an earlier run has not already
# put them on the frame (otherwise the column selections below raise KeyError).
if not set(num_cols + bool_cols + ['body']).issubset(spadldf.columns):
    spadldf = get_xg_features(spadldf)

is_foot_shot = spadldf['type_name'].isin(['shot', 'shot_freekick']) & (spadldf['body'] == 'foot')
is_header = (spadldf['type_name'] == 'shot') & (spadldf['body'] == 'header')

# One-hot encode play_type separately for each subset (each subset gets its own model)
nor_shots = spadldf.loc[is_foot_shot, num_cols + bool_cols]
nor_shots = pd.concat([nor_shots, pd.get_dummies(nor_shots['play_type'])], axis=1).drop('play_type', axis=1)

headers = spadldf.loc[is_header, num_cols + bool_cols]
headers = pd.concat([headers, pd.get_dummies(headers['play_type'])], axis=1).drop('play_type', axis=1)
bool_cols.remove('play_type')

# The scaler is refit for each subset; after this cell it holds the header scaling only
nor_shots[num_cols] = scaler.fit_transform(nor_shots[num_cols])
nor_shots[bool_cols] = nor_shots[bool_cols].astype(int)

headers[num_cols] = scaler.fit_transform(headers[num_cols])
headers[bool_cols] = headers[bool_cols].astype(int)

# Train Data

X_f = nor_shots.drop('goal', axis=1)
y_f = nor_shots['goal']
X_h = headers.drop('goal', axis=1)
y_h = headers['goal']

head_model = LogisticRegression()
foot_model = LogisticRegression()

head_model.fit(X_h, y_h)
foot_model.fit(X_f, y_f)

# In-sample goal probabilities (the models are scored on the rows they were fit on)
y_pred_f = foot_model.predict_proba(X_f)[:, 1]
y_pred_h = head_model.predict_proba(X_h)[:, 1]

# get back into normal dataframe

X_f['xG'] = y_pred_f
X_h['xG'] = y_pred_h

# The two subsets are disjoint (body is either 'foot' or 'header'), so their
# predictions can be stacked and aligned to the action frame by index. Assigning
# the column directly overwrites any xG from a previous run, which keeps this
# cell safe to re-execute (index merges would keep the stale values instead).
assert spadldf.index.is_unique, "xG write-back relies on a unique action index"
shot_xg = pd.concat([X_h['xG'], X_f['xG']])
spadldf['xG'] = shot_xg.reindex(spadldf.index)

spadldf.loc[spadldf.type_name == 'shot_penalty', 'xG'] = PENALTY_XG

0.12401667505744018

In [154]:
from xg import xg
# Project-local xG model: build it from the action frame and store its output
# as the xG column (this overwrites any xG column already on spadldf).
xgm = xg(spadldf)
spadldf['xG'] = xgm.get_xg()

Calculating play types: 100%|██████████| 112764/112764 [01:05<00:00, 1733.17it/s]


In [116]:
# `metrics` is not imported by any other cell, so import it where it is used
from sklearn import metrics

# Compare xG with the recorded outcome for every kind of shot
is_any_shot = spadldf['type_name'].isin(['shot', 'shot_freekick', 'shot_penalty'])
xG_ser = spadldf.loc[is_any_shot, 'xG']
# NOTE(review): result_id is the raw SPADL result code; this assumes shots only
# carry 0 (fail) / 1 (success) — confirm, or compare against the boolean goal flag.
ground_truth = spadldf.loc[is_any_shot, 'result_id']

metrics.r2_score(ground_truth, xG_ser)

0.15537494505886906

In [155]:
# Ten players with the highest summed xG over the season.
# NOTE(review): this reads an 'xg_2' column, but the visible cells only ever write 'xG' —
# confirm where 'xg_2' is created (or switch this to 'xG') so the cell runs on a fresh kernel.
spadldf[['player', 'xg_2']].groupby('player').sum().sort_values('xg_2', ascending=False).head(10)

Unnamed: 0_level_0,xg_2
player,Unnamed: 1_level_1
Erling Haaland,23.214673
Mohamed Salah,19.236746
Harry Kane,18.598679
Ivan Toney,17.722215
Callum Wilson,14.250796
Aleksandar Mitrovic,13.661042
Ollie Watkins,13.387657
Gabriel Jesus,12.715813
Marcus Rashford,12.38376
Dominic Solanke,11.578854


## VAEP

In [None]:
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

# Game states for every action; 3 is the number of actions passed to fs.gamestates
gamestates = fs.gamestates(spadldf, 3)

# Feature generators, each applied to the game states
xfns = [
    fs.actiontype_onehot,
    fs.result_onehot,
    fs.bodypart_onehot,
    fs.startlocation,
    fs.endlocation,
    fs.startpolar,
    fs.endpolar,
    fs.movement,
    fs.time_delta,
    fs.space_delta,
    fs.goalscore,
    fs.time,
]

# Label generators, each applied to the raw action frame
yfns = [lab.scores, lab.concedes]

feature_frames = [make_features(gamestates) for make_features in xfns]
label_frames = [make_labels(spadldf) for make_labels in yfns]

# Side-by-side feature matrix and label frame
X = pd.concat(feature_frames, axis=1)
y = pd.concat(label_frames, axis=1)

<module 'utils' from 'c:\\Users\\rsacc\\Desktop\\footballbets\\data\\notebooks\\..\\utils.py'>

In [132]:
def _build_probability_model():
    """Create an untrained utils.ProbabityModel wrapping a small XGBoost classifier."""
    return utils.ProbabityModel(
        model=xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1),
        model_type="classifier",
    )


# One model per VAEP label, both configured identically
pscores = _build_probability_model()
pscores.train(X, y[["scores"]])

pconcedes = _build_probability_model()
pconcedes.train(X, y[["concedes"]])


In [134]:
# Trained model for each label column
models = {"scores": pscores, "concedes": pconcedes}

# One prediction column per label, filled in label order
y_hat = pd.DataFrame(columns=list(models))
for target, model in models.items():
    y_hat[target] = model.predict(X)

In [135]:
import socceraction.vaep.formula as vaepformula

# Turn the scoring / conceding probabilities into per-action values
values = vaepformula.value(spadldf, y_hat["scores"], y_hat["concedes"])

# Drop value columns left over from a previous run before attaching the new
# ones, so re-executing this cell does not append duplicate columns.
spadldf = pd.concat([spadldf.drop(columns=values.columns, errors='ignore'), values], axis=1)

In [148]:
# Season VAEP total per player, ten highest first
vaep_by_player = spadldf[['player', 'vaep_value']].groupby('player').sum()
vaep_by_player.sort_values('vaep_value', ascending=False).head(10)

Unnamed: 0_level_0,vaep_value
player,Unnamed: 1_level_1
Kieran Trippier,17.933631
Martin Ødegaard,16.865478
Harry Kane,15.117221
Kevin De Bruyne,14.192955
Trent Alexander-Arnold,13.158614
Erling Haaland,12.971816
Mathias Jensen,12.927472
Gabriel Martinelli,11.260491
Marcus Rashford,11.15064
James Ward-Prowse,10.736066
