In [1]:
import pandas as pd
import numpy as np

import math
import ast
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

In [2]:
###########################
# This notebook loads StatsBomb's open data and calculates the various more advanced features that we'll ues as heuristics to input into our models. It covers the four heads of features we're looking to add: location-based, goalkeeper, and defensive pressure statistics, as well as features based on our various hypotheses.
###########################

In [3]:
all_shots = pd.read_csv("./data/all_shots.csv")
all_shots = all_shots.dropna(axis=1, how="all")

In [4]:
all_shots.shape

(84065, 43)

In [5]:
all_shots = all_shots[~all_shots["competition_id"].isin(["FIFA U20 World Cup", "North American League", "Liga Profesional", "Indian Super League"])].drop(columns=all_shots.columns[all_shots.nunique() == 0]).sort_values(by=["match_id", "period", "minute", "second"])
all_shots.nunique()

season_id                   42
match_id                  3312
duration                 77926
id                       83929
index                     4465
location                 56583
minute                     139
off_camera                   1
out                          1
period                       5
play_pattern                 9
player                    5638
player_id                 5637
position                    25
possession                 290
possession_team            288
possession_team_id         291
related_events           83929
second                      60
shot_aerial_won              1
shot_body_part               4
shot_end_location        51584
shot_first_time              1
shot_freeze_frame        82866
shot_key_pass_id         59111
shot_one_on_one              1
shot_outcome                 8
shot_statsbomb_xg        82417
shot_technique               7
shot_type                    5
team                       288
team_id                    291
timestam

In [6]:
all_shots[["season_id", "match_id", "id", "index", "location", "period", "minute", "second", "player", "player_id", "position", "possession", "possession_team", "possession_team_id", "team", "team_id", "timestamp", "competition_id", "shot_outcome"]].sort_values(by=["match_id", "period", "minute", "second"])

Unnamed: 0,season_id,match_id,id,index,location,period,minute,second,player,player_id,position,possession,possession_team,possession_team_id,team,team_id,timestamp,competition_id,shot_outcome
15349,2018/2019,7298.0,9b82eaa3-2048-4157-aa9a-eabeb4fa0ebe,42.0,"[115.0, 25.0]",1.0,0.0,47.0,Francesca Kirby,4641.0,Right Center Forward,4.0,Chelsea FCW,971.0,Chelsea FCW,971.0,00:00:47.620,FA Women's Super League,Blocked
15350,2018/2019,7298.0,25dace9c-6bf8-4ada-8a4f-bad0485141c9,237.0,"[109.0, 51.0]",1.0,5.0,12.0,Francesca Kirby,4641.0,Right Center Forward,15.0,Chelsea FCW,971.0,Chelsea FCW,971.0,00:05:12.780,FA Women's Super League,Blocked
15351,2018/2019,7298.0,5e58cab7-75c2-47f8-903c-2874de6ed5b0,243.0,"[99.0, 52.0]",1.0,5.0,41.0,So-Yun Ji,4647.0,Center Midfield,16.0,Chelsea FCW,971.0,Chelsea FCW,971.0,00:05:41.940,FA Women's Super League,Blocked
15352,2018/2019,7298.0,624a8c1d-b775-4a4f-85e8-a516aed3f3a5,248.0,"[107.0, 40.0]",1.0,5.0,43.0,Drew Spence,4638.0,Left Center Midfield,16.0,Chelsea FCW,971.0,Chelsea FCW,971.0,00:05:43.900,FA Women's Super League,Blocked
15353,2018/2019,7298.0,3f0fc8e9-a09f-480a-9396-132e1ca05ec5,256.0,"[108.0, 32.0]",1.0,5.0,46.0,Millie Bright,4642.0,Right Center Back,16.0,Chelsea FCW,971.0,Chelsea FCW,971.0,00:05:46.380,FA Women's Super League,Goal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7849,2023,3923881.0,cf6ded0e-b416-47ee-82da-ae6810e02ebf,2274.0,"[109.9, 41.3]",2.0,73.0,55.0,Sébastien Haller,8673.0,Center Forward,137.0,Côte d'Ivoire,3374.0,Côte d'Ivoire,3374.0,00:28:55.274,African Cup of Nations,Off T
7850,2023,3923881.0,de5bcc33-6ea2-4797-8712-a3db7cfe31e0,2344.0,"[115.4, 36.5]",2.0,80.0,14.0,Sébastien Haller,8673.0,Center Forward,140.0,Côte d'Ivoire,3374.0,Côte d'Ivoire,3374.0,00:35:14.269,African Cup of Nations,Goal
7851,2023,3923881.0,bded325e-4b96-41d1-992d-3942344b5902,2483.0,"[102.9, 46.8]",2.0,89.0,12.0,Kelechi Promise Iheanacho,3708.0,Right Wing,149.0,Nigeria,775.0,Nigeria,775.0,00:44:12.865,African Cup of Nations,Blocked
7852,2023,3923881.0,3e369ccb-5ae2-45cf-8eb8-f24aec5e9e9d,2623.0,"[97.4, 55.3]",2.0,93.0,55.0,Wilfried Stephane Singo,36539.0,Right Back,159.0,Côte d'Ivoire,3374.0,Côte d'Ivoire,3374.0,00:48:55.619,African Cup of Nations,Wayward


In [7]:
# First, we'll append statistics related to the pass that led to the shot to the shots dataframe.
all_passes = pd.read_csv("./data/all_passes.csv")
all_passes = all_passes.dropna(axis=1, how="all")

In [8]:
key_passes = all_passes[all_passes["id"].isin(all_shots['shot_key_pass_id'])]

In [9]:
key_passes[["duration", "pass_angle", "pass_type", "pass_height", "pass_length", "pass_assisted_shot_id"]]

Unnamed: 0,duration,pass_angle,pass_type,pass_height,pass_length,pass_assisted_shot_id
46,1.707827,-1.850475,Corner,High Pass,39.848340,fd99f442-49ae-4724-87e8-2512e4a5d9c6
84,1.635672,1.716809,Corner,High Pass,37.802250,4aaf9747-c33d-4f76-bb0f-d25ad1ebe7c6
234,1.320689,1.778707,Corner,High Pass,37.303352,749a19ff-d599-4170-a095-79b9865e2b06
268,1.350540,-1.871810,Corner,Low Pass,35.077200,37cdb09d-6504-48cb-a6cc-217790d53b4b
307,1.571437,-1.551568,Recovery,High Pass,10.401923,cffe1d92-0692-47ad-a71b-208c8a52dcc0
...,...,...,...,...,...,...
686852,2.271377,-1.125219,Recovery,Ground Pass,22.274874,2c92a25a-2014-4080-b8fd-b69d72092898
686919,1.689300,0.895606,Free Kick,High Pass,34.717430,828295e4-feac-43c4-8b4a-47617f6d2408
686929,1.788300,-1.394087,Free Kick,High Pass,42.664387,58997323-8e35-474d-815c-7926e1e143ba
686981,1.010200,1.630133,Corner,High Pass,30.353418,4201f68c-764d-45bd-bcfb-96470a72c2e8


In [10]:
key_passes["pass_duration"] = key_passes["duration"]

all_shots = pd.merge(all_shots, key_passes[["pass_duration", "pass_angle", "pass_type", "pass_height", "pass_length", "pass_assisted_shot_id"]], how='left', left_on='id', right_on='pass_assisted_shot_id', suffixes=("", ""))
all_shots = all_shots.drop("pass_assisted_shot_id", axis=1)

In [11]:
possession_passes = all_passes.groupby(["match_id", "possession"]).agg({
    'match_id': 'first',
    'possession': 'first',
    'duration': 'count'
}).reset_index(drop=True)
possession_passes["num_passes"] = possession_passes["duration"]
possession_passes = possession_passes.drop("duration", axis=1)

all_shots = pd.merge(all_shots, possession_passes, how='left', on=["match_id", "possession"])

In [12]:
all_shots

Unnamed: 0,season_id,match_id,duration,id,index,location,minute,off_camera,out,period,...,shot_saved_off_target,shot_saved_to_post,shot_follows_dribble,shot_kick_off,pass_duration,pass_angle,pass_type,pass_height,pass_length,num_passes
0,2018/2019,7298.0,0.560000,9b82eaa3-2048-4157-aa9a-eabeb4fa0ebe,42.0,"[115.0, 25.0]",0.0,,,1.0,...,,,,,,,,,,
1,2018/2019,7298.0,0.400000,25dace9c-6bf8-4ada-8a4f-bad0485141c9,237.0,"[109.0, 51.0]",5.0,,,1.0,...,,,,,,,,,,1.0
2,2018/2019,7298.0,0.480000,5e58cab7-75c2-47f8-903c-2874de6ed5b0,243.0,"[99.0, 52.0]",5.0,,,1.0,...,,,,,,,,,,2.0
3,2018/2019,7298.0,0.160000,624a8c1d-b775-4a4f-85e8-a516aed3f3a5,248.0,"[107.0, 40.0]",5.0,,,1.0,...,,,,,,,,,,2.0
4,2018/2019,7298.0,1.480000,3f0fc8e9-a09f-480a-9396-132e1ca05ec5,256.0,"[108.0, 32.0]",5.0,,,1.0,...,,,,,0.96,-0.950547,Recovery,Ground Pass,8.602325,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83924,2023,3923881.0,0.921817,cf6ded0e-b416-47ee-82da-ae6810e02ebf,2274.0,"[109.9, 41.3]",73.0,,,2.0,...,,,,,,,,,,1.0
83925,2023,3923881.0,0.668012,de5bcc33-6ea2-4797-8712-a3db7cfe31e0,2344.0,"[115.4, 36.5]",80.0,,,2.0,...,,,,,,,,,,
83926,2023,3923881.0,0.314267,bded325e-4b96-41d1-992d-3942344b5902,2483.0,"[102.9, 46.8]",89.0,,,2.0,...,,,,,,,,,,5.0
83927,2023,3923881.0,1.510618,3e369ccb-5ae2-45cf-8eb8-f24aec5e9e9d,2623.0,"[97.4, 55.3]",93.0,,True,2.0,...,,,,,,,,,,1.0


In [13]:
# Now, we'll extract relevant attributes from the all_shots dataframe
data = pd.DataFrame()

# general and time attributes
data[["period", "minute", "second", "possession", "duration", "competition_id", "season_id", "match_id", "timestamp", "team", "player", "freeze_frame"]] = all_shots[["period", "minute", "second", "possession", "duration", "competition_id", "season_id", "match_id", "timestamp", "team", "player", "shot_freeze_frame"]]
data.sort_values(by=["match_id", "period", "minute", "second"], inplace=True)

In [14]:
# qualitative attributes
data[["play_pattern", "position"]] = all_shots[["play_pattern", "position"]]
data["player_type"] = all_shots["position"].apply(lambda x: 4 if ("Forward" in x or "Striker" in x) else 3 if "Wing" in x else 2 if "Mid" in x else 1 if ("Back" in x or "Defen" in x) else 0)

In [15]:
# shot attributes
data["location_x"] = all_shots["location"].apply(lambda x: ast.literal_eval(x)[0])
data["location_x_distance"] = 120 - data["location_x"]
data["location_y"] = all_shots["location"].apply(lambda x: ast.literal_eval(x)[1])
data["location_y_distance"] = abs(data["location_y"] - 40)
data["duration"] = all_shots["duration"].apply(lambda x: x if x < 100 else 0)
data["technique"] = all_shots["shot_technique"]
data["body_part"] = all_shots["shot_body_part"]
data["type"] = all_shots["shot_type"]
data["is_penalty"] = all_shots["shot_type"] == "Penalty"
data["is_header"] = all_shots["shot_body_part"] == "Head"

In [16]:
# shot modifiers
data["first_time"] = all_shots["shot_first_time"].fillna(False)
data["open_goal"] = all_shots["shot_open_goal"].fillna(False)
data["one_on_one"] = all_shots["shot_one_on_one"].fillna(False)
data["aerial_won"] = all_shots["shot_aerial_won"].fillna(False)
data["follows_dribble"] = all_shots["shot_follows_dribble"].fillna(False)
data["under_pressure"] = all_shots["under_pressure"].fillna(False)

In [17]:
# preceding pass attributes
data["pass_duration"] = all_shots["pass_duration"].fillna(all_shots["pass_duration"].mean())
data["pass_angle"] = all_shots["pass_angle"].fillna(all_shots["pass_angle"].mean())
data["pass_type"] = all_shots["pass_type"].fillna("Missing")
data["pass_height"] = all_shots["pass_height"].fillna("Missing")
data["pass_length"] = all_shots["pass_length"].fillna(all_shots["pass_length"].mean())
data["num_passes"] = all_shots["num_passes"].fillna(0)

In [18]:
# locations for defensive/goalkeeper attributes
locations = pd.DataFrame()
locations["opponents"] = all_shots["shot_freeze_frame"].fillna('[{"location": [], "position": {"name": ""}, "teammate": False}]').apply(lambda frame: [[player["location"], player["position"]["name"]] for player in ast.literal_eval(frame) if not player["teammate"]])
locations["shot"] = tuple(zip(data["location_x"].apply(lambda x: 119.9 if x == 120.0 else x), data["location_y"]))
locations["goalkeeper"] = locations["opponents"].apply(lambda frame: [player[0] for player in frame if player[1] == "Goalkeeper"]).apply(lambda x: x[0] if len(x) != 0 else [-1, -1]).apply(lambda x: 119.9 if x == 120.0 else x)
locations

Unnamed: 0,opponents,shot,goalkeeper
0,"[[[112.0, 28.0], Right Back], [[103.0, 50.0], ...","(115.0, 25.0)","[120.0, 26.0]"
1,"[[[106.0, 42.0], Right Center Midfield], [[115...","(109.0, 51.0)","[119.0, 40.0]"
2,"[[[111.0, 44.0], Left Center Back], [[109.0, 3...","(99.0, 52.0)","[119.0, 40.0]"
3,"[[[101.0, 37.0], Right Center Midfield], [[119...","(107.0, 40.0)","[119.0, 38.0]"
4,"[[[114.0, 46.0], Left Center Back], [[107.0, 3...","(108.0, 32.0)","[119.0, 38.0]"
...,...,...,...
83924,"[[[117.8, 51.0], Left Wing Back], [[108.3, 55....","(109.9, 41.3)","[118.6, 40.1]"
83925,"[[[106.5, 47.9], Left Wing], [[106.3, 38.1], L...","(115.4, 36.5)","[118.2, 36.5]"
83926,"[[[96.3, 37.3], Right Wing], [[106.0, 44.9], R...","(102.9, 46.8)","[116.9, 41.4]"
83927,"[[[117.6, 41.3], Goalkeeper], [[95.6, 56.9], C...","(97.4, 55.3)","[117.6, 41.3]"


In [19]:
# Check if a point is inside a triangle
def point_in_triangle(point, a, b, c):
    as_x = point[0] - a[0]
    as_y = point[1] - a[1]
    s_ab = ((b[0] - a[0]) * as_y) - ((b[1] - a[1]) * as_x) > 0

    if (((c[0] - a[0]) * as_y) - ((c[1] - a[1]) * as_x) > 0) == s_ab \
            or (((c[0] - b[0]) * (point[1] - b[1])) - ((c[1] - b[1]) * (point[0] - b[0])) > 0) != s_ab:
        return False
    else:
        return True

In [20]:
# Calculate the area of a triangle
def area_of_triangle(a, b, c):
    return abs(0.5 * (a[0]*(b[1] - c[1]) + b[0]*(c[1] - a[1]) + c[0]*(a[1] - b[1])))

In [21]:
# Calculate the angle between two sides of the triangle
def angle_between_sides(a, b, c):
    cos_theta = (a**2 + b**2 - c**2) / (2 * a * b)
    theta = math.degrees(math.acos(cos_theta))
    return theta

In [22]:
# What is the best (shortest) straight line path from shooter to goal?
def best_distance(point) :

    line = (0, 8)
    left_line = (point[0] - 120, point[1] - 36)
    right_line = (point[0] - 120, point[1] - 44)

    line_dot_left = line[0] * left_line[0] + line[1] * left_line[1]
    line_dot_right = line[0] * right_line[0] + line[1] * right_line[1]

    if line_dot_right > 0:
        x = 120 - point[0]
        y = 44 - point[1]
        return math.sqrt(x * x + y * y)
    elif line_dot_left < 0:
        x = 120 - point[0]
        y = 36 - point[1]
        return math.sqrt(x * x + y * y)
    else:
        return 120 - point[0]

In [23]:
# Calculate the perpendicular line 1.5m on either side of a line between two points
def calculate_perpendicular(a, b):

    slope = (b[1] - a[1])/(b[0] - a[0])
    slope = 0.00001 if slope == 0 else slope
    perpendicular_slope = -1/slope

    offset_x = 1.5 / np.sqrt(1 + perpendicular_slope**2)
    offset_y = perpendicular_slope * offset_x
    point1 = (a[0] + offset_x, a[1] + offset_y)
    point2 = (a[0] - offset_x, a[1] - offset_y)

    return point1, point2

In [24]:
# A normal Gaussian
def gaussian(x, y, a, b):
    d2 = (x - a)**2 + (y - b)**2
    return np.exp(-d2 / 2)

In [25]:
# defensive pressure/goalkeeper attributes
data["defenders_3m_radius"] = locations.apply(lambda shot: (len([defender for defender in shot["opponents"] if ((defender[0][0]-shot["shot"][0])**2 + (defender[0][1]-shot["shot"][1])**2) < 3**2])) if shot["opponents"] != [[[], '']] else -1, axis=1)
data["defenders_triangle"] = locations.apply(lambda shot: (len([defender for defender in shot["opponents"] if point_in_triangle(defender[0], shot["shot"], [120, 32], [120, 48])])) if shot["opponents"] != [[[], '']] else -1, axis=1)

data["goalkeeper_x"] = locations["goalkeeper"].apply(lambda x: x[0])
data["goalkeeper_y"] = locations["goalkeeper"].apply(lambda x: x[1])
data["distance_to_goalie"] = data.apply(lambda x: math.sqrt((x["goalkeeper_x"] - x["location_x"])**2 + (x["goalkeeper_y"] - x["location_y"])**2), axis=1)

In [26]:
# angle/location-based attributes
data["shooting_range"] = locations["shot"].apply(lambda point: angle_between_sides(math.dist(point, (120, 36)), math.dist((120, 44), point), 8))
data["goal_distance"] = locations["shot"].apply(lambda point: math.dist(point, (120, 40)))
data["best_distance"] = locations["shot"].apply(lambda point: best_distance(point))

In [27]:
# target variables
data["statsbomb_xg"] = all_shots["shot_statsbomb_xg"].apply(lambda x: float(x))
data["end_location_x"] = all_shots["shot_end_location"].apply(lambda x: ast.literal_eval(x)[0])
data["end_location_y"] = all_shots["shot_end_location"].apply(lambda x: ast.literal_eval(x)[1])
data["is_goal"] = all_shots["shot_outcome"].apply(lambda x: True if x == "Goal" else False)

In [28]:
# The angle at which the ball goes, and whether it was taken from the favoured side of the pitch or not
data["shot_angle"] =  data.apply(lambda x: math.atan2((x["end_location_y"] - x["location_y"]), (x["end_location_x"] - x["location_x"])), axis=1)
data["good_foot"] = data.apply(lambda shot: True if ((shot["body_part"] == "Right Foot" and shot["location_y"] < 42) or (shot["body_part"] == "Left Foot" and shot["location_y"] > 42)) else False, axis=1)

In [29]:
# How many shots and xg there has been so far
data["shots_so_far"] = all_shots.groupby(["match_id", "team"])["timestamp"].cumcount()
data["xg_so_far"] = all_shots.groupby(["match_id", "team"])["shot_statsbomb_xg"].cumsum()
data["xg_so_far"] = data["xg_so_far"] - data["statsbomb_xg"]

In [30]:
# We can now calculate the game state and which team was leading at the time of the shot
all_goals = all_shots[all_shots["shot_outcome"] == "Goal"]

first_scorers = all_goals.groupby(['match_id']).first()[["team"]]
all_goals["is_first"] = all_goals[['match_id', 'team']].apply(tuple, axis=1).isin(list(zip(first_scorers.index, first_scorers["team"])))

all_goals["first_tally_temp"] = all_goals[all_goals["is_first"] == True].groupby(["match_id", "is_first"]).cumcount()
all_goals["first_tally"] = all_goals.groupby(["match_id"])["first_tally_temp"].fillna(method="ffill")
all_goals.loc[all_goals['first_tally_temp'].isnull() & ~all_goals['first_tally'].isnull(), 'first_tally'] = all_goals.loc[all_goals['first_tally_temp'].isnull() & ~all_goals['first_tally'].isnull(), 'first_tally'] + 1

all_goals["second_tally_temp"] = all_goals[all_goals["is_first"] == False].groupby(["match_id", "is_first"]).cumcount()
all_goals["second_tally"] = all_goals.groupby(["match_id"])["second_tally_temp"].fillna(method="ffill")
all_goals.loc[all_goals['second_tally_temp'].isnull() & ~all_goals['second_tally'].isnull(), 'second_tally'] = all_goals.loc[all_goals['second_tally_temp'].isnull() & ~all_goals['second_tally'].isnull(), 'second_tally'] + 1
all_goals["second_tally"].fillna(0, inplace=True)

all_goals[["season_id", "match_id", "period", "minute", "second", "team", "is_first", "first_tally", "second_tally"]]

Unnamed: 0,season_id,match_id,period,minute,second,team,is_first,first_tally,second_tally
4,2018/2019,7298.0,1.0,5.0,46.0,Chelsea FCW,True,0.0,0.0
7,2018/2019,7298.0,1.0,23.0,21.0,Chelsea FCW,True,1.0,0.0
18,2018/2019,7298.0,2.0,48.0,41.0,Manchester City WFC,False,2.0,0.0
29,2018/2019,7298.0,2.0,85.0,34.0,Manchester City WFC,False,2.0,1.0
34,2018,7430.0,1.0,2.0,42.0,Washington Spirit,True,0.0,0.0
...,...,...,...,...,...,...,...,...,...
83903,2023,3923880.0,5.0,130.0,33.0,Congo DR,True,4.0,5.0
83904,2023,3923880.0,5.0,131.0,20.0,South Africa,False,5.0,5.0
83915,2023,3923881.0,1.0,37.0,1.0,Nigeria,True,0.0,0.0
83922,2023,3923881.0,2.0,61.0,40.0,Côte d'Ivoire,False,1.0,0.0


In [31]:
data["first_tally"] = all_goals["first_tally"]
data["second_tally"] = all_goals["second_tally"]
data[["season_id", "match_id", "period", "minute", "second", "team", "is_goal", "shots_so_far", "xg_so_far", "first_tally", "second_tally"]]

Unnamed: 0,season_id,match_id,period,minute,second,team,is_goal,shots_so_far,xg_so_far,first_tally,second_tally
0,2018/2019,7298.0,1.0,0.0,47.0,Chelsea FCW,False,0,0.000000,,
1,2018/2019,7298.0,1.0,5.0,12.0,Chelsea FCW,False,1,0.018856,,
2,2018/2019,7298.0,1.0,5.0,41.0,Chelsea FCW,False,2,0.060703,,
3,2018/2019,7298.0,1.0,5.0,43.0,Chelsea FCW,False,3,0.087385,,
4,2018/2019,7298.0,1.0,5.0,46.0,Chelsea FCW,True,4,0.193604,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
83924,2023,3923881.0,2.0,73.0,55.0,Côte d'Ivoire,False,15,1.082065,,
83925,2023,3923881.0,2.0,80.0,14.0,Côte d'Ivoire,True,16,1.157621,1.0,1.0
83926,2023,3923881.0,2.0,89.0,12.0,Nigeria,False,3,0.190122,,
83927,2023,3923881.0,2.0,93.0,55.0,Côte d'Ivoire,False,17,1.390592,,


In [32]:
data["is_first"] = data[['match_id', 'team']].apply(tuple, axis=1).isin(list(zip(first_scorers.index, first_scorers["team"])))
data["first_tally"] = data["first_tally"].fillna(method="bfill")
data["second_tally"] = data["second_tally"].fillna(method="bfill")

data["row_index"] = data.index
last_goals = data[data["is_goal"] == True].groupby(["match_id"]).last()["row_index"]
last_shots = data.groupby(["match_id"]).last()["row_index"]
last_shots = last_shots[last_shots.index.isin(last_goals.index)]

for i in range(len(last_goals)):
    if last_goals.iloc[i] != last_shots.iloc[i]:
        goal = data.iloc[last_goals.iloc[i]]
        prop_values = (goal["first_tally"]+1, goal["second_tally"]) if goal["is_first"] else (goal["first_tally"], goal["second_tally"]+1)
        data.loc[last_goals.iloc[i]+1 : last_shots.iloc[i], "first_tally"] = prop_values[0]
        data.loc[last_goals.iloc[i]+1 : last_shots.iloc[i], "second_tally"] = prop_values[1]

data[["season_id", "match_id", "period", "minute", "second", "team", "is_goal", "shots_so_far", "xg_so_far", "first_tally", "second_tally"]]

Unnamed: 0,season_id,match_id,period,minute,second,team,is_goal,shots_so_far,xg_so_far,first_tally,second_tally
0,2018/2019,7298.0,1.0,0.0,47.0,Chelsea FCW,False,0,0.000000,0.0,0.0
1,2018/2019,7298.0,1.0,5.0,12.0,Chelsea FCW,False,1,0.018856,0.0,0.0
2,2018/2019,7298.0,1.0,5.0,41.0,Chelsea FCW,False,2,0.060703,0.0,0.0
3,2018/2019,7298.0,1.0,5.0,43.0,Chelsea FCW,False,3,0.087385,0.0,0.0
4,2018/2019,7298.0,1.0,5.0,46.0,Chelsea FCW,True,4,0.193604,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
83924,2023,3923881.0,2.0,73.0,55.0,Côte d'Ivoire,False,15,1.082065,1.0,1.0
83925,2023,3923881.0,2.0,80.0,14.0,Côte d'Ivoire,True,16,1.157621,1.0,1.0
83926,2023,3923881.0,2.0,89.0,12.0,Nigeria,False,3,0.190122,1.0,2.0
83927,2023,3923881.0,2.0,93.0,55.0,Côte d'Ivoire,False,17,1.390592,1.0,2.0


In [33]:
data["game_state"] = data.apply(lambda x: (x["first_tally"] - x["second_tally"]) if x["is_first"] else (x["second_tally"] - x["first_tally"]) ,axis=1)
data["was_leading"] = data["game_state"].apply(lambda x: 1 if x > 0 else 0 if x == 0 else -1)

data.drop(["row_index", "is_first", "first_tally", "second_tally"], axis=1, inplace=True)

data[["season_id", "match_id", "period", "minute", "second", "team", "is_goal", "shots_so_far", "xg_so_far", "game_state", "was_leading"]]

Unnamed: 0,season_id,match_id,period,minute,second,team,is_goal,shots_so_far,xg_so_far,game_state,was_leading
0,2018/2019,7298.0,1.0,0.0,47.0,Chelsea FCW,False,0,0.000000,0.0,0
1,2018/2019,7298.0,1.0,5.0,12.0,Chelsea FCW,False,1,0.018856,0.0,0
2,2018/2019,7298.0,1.0,5.0,41.0,Chelsea FCW,False,2,0.060703,0.0,0
3,2018/2019,7298.0,1.0,5.0,43.0,Chelsea FCW,False,3,0.087385,0.0,0
4,2018/2019,7298.0,1.0,5.0,46.0,Chelsea FCW,True,4,0.193604,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
83924,2023,3923881.0,2.0,73.0,55.0,Côte d'Ivoire,False,15,1.082065,0.0,0
83925,2023,3923881.0,2.0,80.0,14.0,Côte d'Ivoire,True,16,1.157621,0.0,0
83926,2023,3923881.0,2.0,89.0,12.0,Nigeria,False,3,0.190122,-1.0,-1
83927,2023,3923881.0,2.0,93.0,55.0,Côte d'Ivoire,False,17,1.390592,1.0,1


In [34]:
# To capture phases of play, we'll count the number of shots, and the number of shots by the same team, over the last minute and 15 minutes
data["time"] = data.apply(lambda x: x["timestamp"].replace("00:", "01:", 1) if x["period"] == 2.0 else x["timestamp"], axis=1)
data["time"] = pd.to_datetime(data["time"])

data.sort_values(by=["match_id", "time"], inplace=True)
data["past_minute"] = data.groupby(["match_id"]).apply(lambda match: match
                                           .set_index('time').sort_index()
                                           .rolling('60s')
                                           .agg({'team': 'count'}).reset_index(drop=True)).reset_index(drop=True)
data["past_15"] = data.groupby(["match_id"]).apply(lambda match: match
                                           .set_index('time').sort_index()
                                           .rolling('900s')
                                           .agg({'team': 'count'}).reset_index(drop=True)).reset_index(drop=True)
data[["season_id", "match_id", "period", "minute", "second", "team", "is_goal", "past_minute", "past_15"]].sort_values(by=["match_id", "period", "minute", "second"])

Unnamed: 0,season_id,match_id,period,minute,second,team,is_goal,past_minute,past_15
0,2018/2019,7298.0,1.0,0.0,47.0,Chelsea FCW,False,1.0,1.0
1,2018/2019,7298.0,1.0,5.0,12.0,Chelsea FCW,False,1.0,2.0
2,2018/2019,7298.0,1.0,5.0,41.0,Chelsea FCW,False,2.0,3.0
3,2018/2019,7298.0,1.0,5.0,43.0,Chelsea FCW,False,3.0,4.0
4,2018/2019,7298.0,1.0,5.0,46.0,Chelsea FCW,True,4.0,5.0
...,...,...,...,...,...,...,...,...,...
83924,2023,3923881.0,2.0,73.0,55.0,Côte d'Ivoire,False,1.0,5.0
83925,2023,3923881.0,2.0,80.0,14.0,Côte d'Ivoire,True,1.0,3.0
83926,2023,3923881.0,2.0,89.0,12.0,Nigeria,False,1.0,2.0
83927,2023,3923881.0,2.0,93.0,55.0,Côte d'Ivoire,False,1.0,3.0


In [35]:
data.sort_values(by=["match_id", "team", "time"], inplace=True)
data["own_past_minute"] = data.groupby(["match_id", "team"]).apply(lambda match: match
                                           .set_index('time').sort_index()
                                           .rolling('60s')
                                           .agg({'team': 'count'}).reset_index(drop=True)).reset_index(drop=True)
data["own_past_15"] = data.groupby(["match_id", "team"]).apply(lambda match: match
                                           .set_index('time').sort_index()
                                           .rolling('900s')
                                           .agg({'team': 'count'}).reset_index(drop=True)).reset_index(drop=True)

data.drop(["time"], axis=1, inplace=True)
data.sort_values(by=["match_id", "period", "minute", "second"], inplace=True)
data[["season_id", "match_id", "period", "minute", "second", "team", "is_goal", "own_past_minute", "own_past_15"]]

Unnamed: 0,season_id,match_id,period,minute,second,team,is_goal,own_past_minute,own_past_15
0,2018/2019,7298.0,1.0,0.0,47.0,Chelsea FCW,False,1.0,1.0
1,2018/2019,7298.0,1.0,5.0,12.0,Chelsea FCW,False,1.0,2.0
2,2018/2019,7298.0,1.0,5.0,41.0,Chelsea FCW,False,2.0,3.0
3,2018/2019,7298.0,1.0,5.0,43.0,Chelsea FCW,False,3.0,4.0
4,2018/2019,7298.0,1.0,5.0,46.0,Chelsea FCW,True,4.0,5.0
...,...,...,...,...,...,...,...,...,...
83924,2023,3923881.0,2.0,73.0,55.0,Côte d'Ivoire,False,1.0,1.0
83925,2023,3923881.0,2.0,80.0,14.0,Côte d'Ivoire,True,2.0,2.0
83926,2023,3923881.0,2.0,89.0,12.0,Nigeria,False,1.0,1.0
83927,2023,3923881.0,2.0,93.0,55.0,Côte d'Ivoire,False,1.0,1.0


In [36]:
data["is_extra_time"] = data["minute"] >= 90

In [37]:
data

Unnamed: 0,period,minute,second,possession,duration,competition_id,season_id,match_id,timestamp,team,...,good_foot,shots_so_far,xg_so_far,game_state,was_leading,past_minute,past_15,own_past_minute,own_past_15,is_extra_time
0,1.0,0.0,47.0,4.0,0.560000,FA Women's Super League,2018/2019,7298.0,00:00:47.620,Chelsea FCW,...,True,0,0.000000,0.0,0,1.0,1.0,1.0,1.0,False
1,1.0,5.0,12.0,15.0,0.400000,FA Women's Super League,2018/2019,7298.0,00:05:12.780,Chelsea FCW,...,True,1,0.018856,0.0,0,1.0,2.0,1.0,2.0,False
2,1.0,5.0,41.0,16.0,0.480000,FA Women's Super League,2018/2019,7298.0,00:05:41.940,Chelsea FCW,...,False,2,0.060703,0.0,0,2.0,3.0,2.0,3.0,False
3,1.0,5.0,43.0,16.0,0.160000,FA Women's Super League,2018/2019,7298.0,00:05:43.900,Chelsea FCW,...,True,3,0.087385,0.0,0,3.0,4.0,3.0,4.0,False
4,1.0,5.0,46.0,16.0,1.480000,FA Women's Super League,2018/2019,7298.0,00:05:46.380,Chelsea FCW,...,False,4,0.193604,0.0,0,4.0,5.0,4.0,5.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83924,2.0,73.0,55.0,137.0,0.921817,African Cup of Nations,2023,3923881.0,00:28:55.274,Côte d'Ivoire,...,True,15,1.082065,0.0,0,1.0,5.0,1.0,1.0,False
83925,2.0,80.0,14.0,140.0,0.668012,African Cup of Nations,2023,3923881.0,00:35:14.269,Côte d'Ivoire,...,True,16,1.157621,0.0,0,1.0,3.0,2.0,2.0,False
83926,2.0,89.0,12.0,149.0,0.314267,African Cup of Nations,2023,3923881.0,00:44:12.865,Nigeria,...,True,3,0.190122,-1.0,-1,1.0,2.0,1.0,1.0,False
83927,2.0,93.0,55.0,159.0,1.510618,African Cup of Nations,2023,3923881.0,00:48:55.619,Côte d'Ivoire,...,True,17,1.390592,1.0,1,1.0,3.0,1.0,1.0,True


In [38]:
data.nunique()

period                     5
minute                   139
second                    60
possession               290
duration               77925
competition_id            17
season_id                 42
match_id                3312
timestamp              82726
team                     288
player                  5638
freeze_frame           82866
play_pattern               9
position                  25
player_type                5
location_x               635
location_x_distance      635
location_y               701
location_y_distance      498
technique                  7
body_part                  4
type                       5
is_penalty                 2
is_header                  2
first_time                 2
open_goal                  2
one_on_one                 2
aerial_won                 2
follows_dribble            2
under_pressure             2
pass_duration          11897
pass_angle             10504
pass_type                  8
pass_height                4
pass_length   

In [39]:
data.to_csv("./data/augmented_data.csv", index=False)