In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import os

In [2]:
# constants

PATH: str = "../data/spadl_format/"
LEAGUES: list[str] = ["England", "Spain", "France", "Italy", "Germany"]

teams_df = pd.read_json("../data/wyscout/teams/teams.json")
players_df = pd.read_json("../data/wyscout/players/players.json")

# Directory the processed dataset is written to at the end of the notebook.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs("../data/processed", exist_ok=True)

# One frame per league, keyed by league name.
df_dict = {}
for league in LEAGUES:

    # index_col=0: the first CSV column is used as the row index
    df = pd.read_csv(f"{PATH}{league}.csv", index_col=0)

    # remove not used columns (non-mutating drop instead of inplace=True)
    df = df.drop(columns=["original_event_id", "result_id", "period_id", "game_id", "bodypart_id", "type_id"])

    df_dict[league] = df

# Stack all leagues; each frame's own row labels are kept, so labels may repeat
# across leagues.
all_df = pd.concat(list(df_dict.values()))

# Keep only shot actions. The explicit .copy() detaches the result from the
# concatenated frame so that columns added to `all_df` later are written to an
# independent object rather than to a slice (no SettingWithCopyWarning).
all_df = all_df[all_df["type_name"] == "shot"].copy()
all_df

Unnamed: 0,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,action_id,type_name,result_name,bodypart_name,player_name
34,94.595788,1609,25413,92.40,40.12,105.00,37.40,34,shot,success,foot_right,A. Lacazette
39,179.854785,1631,26150,89.25,32.64,105.00,40.80,39,shot,fail,foot_left,R. Mahrez
58,254.745027,1631,14763,100.80,32.64,105.00,34.00,58,shot,success,head/other,S. Okazaki
84,425.824035,1609,7868,85.05,45.56,105.00,40.80,84,shot,fail,foot_left,A. Oxlade-Chamberlain
176,815.462015,1609,7868,78.75,47.60,105.00,37.40,176,shot,fail,foot_right,A. Oxlade-Chamberlain
...,...,...,...,...,...,...,...,...,...,...,...,...
388771,920.241146,2451,15191,90.30,25.16,105.00,30.60,773,shot,fail,foot_right,Y. Mallı
388890,1552.626343,2451,14917,95.55,39.44,105.00,37.40,892,shot,success,head/other,R. Knoche
388927,1711.659947,2451,134708,73.50,42.16,105.00,34.00,929,shot,fail,foot_right,D. Origi
389132,2725.373600,2451,284469,95.55,47.60,105.00,30.60,1134,shot,success,foot_right,J. Brekalo


In [3]:
# Reference points of the goal mouth used by the feature extraction below.
# All three points share the same x coordinate and differ only in y.
# NOTE(review): the names say "crossbar", but geometrically these are the two
# ends of the goal mouth along the goal line; the units are presumably those of
# the start_x / start_y columns - confirm against the data source.

# x coordinate (identical for the three points)
GOAL_CENTER_X: int = 105
UPPER_CROSSBAR_X: int = GOAL_CENTER_X
LOWER_CROSSBAR_X: int = GOAL_CENTER_X

# y coordinates: center, then the two ends of the goal mouth
GOAL_CENTER_Y: int = 34
UPPER_CROSSBAR_Y: int = 38
LOWER_CROSSBAR_Y: int = 30

In [4]:
# new columns will be created in the feature extraction process

# shot distance from goal: Euclidean distance between the shot's start position
# (start_x, start_y) and the goal center. Computed on whole columns instead of
# a row-wise apply(): no per-row Python call, and an empty frame simply yields
# an empty column instead of failing.
all_df["shot_distance_from_goal"] = np.sqrt(
    (all_df["start_x"] - GOAL_CENTER_X) ** 2
    + (all_df["start_y"] - GOAL_CENTER_Y) ** 2
)

# shot angle from the goal
def get_shot_angle(
    shot_pos_x: float,
    shot_pos_y: float,
    upper_post=None,
    lower_post=None,
) -> np.float64:
    """Return the angle (in radians) under which the goal mouth is seen.

    The angle is measured at the shot position between the two vectors that
    point from the shot to either end of the goal mouth.

    Parameters
    ----------
    shot_pos_x, shot_pos_y:
        Coordinates of the shot.
    upper_post, lower_post:
        Optional ``(x, y)`` pairs for the two ends of the goal mouth. When
        omitted, the module-level ``UPPER_CROSSBAR_*`` / ``LOWER_CROSSBAR_*``
        constants are used, which is the original behavior.

    Returns
    -------
    np.float64
        Angle in ``[0, pi]``. A shot taken exactly from one of the two
        reference points has no defined angle; ``0.0`` is returned in that
        case instead of NaN.
    """
    # Annotations use float / np.float64 because the np.float_ alias was
    # removed in NumPy 2.0 and annotations are evaluated when `def` runs.
    if upper_post is None:
        upper_post = (UPPER_CROSSBAR_X, UPPER_CROSSBAR_Y)
    if lower_post is None:
        lower_post = (LOWER_CROSSBAR_X, LOWER_CROSSBAR_Y)

    v1: np.ndarray = np.array(
        [upper_post[0] - shot_pos_x, upper_post[1] - shot_pos_y], dtype=np.float64
    )
    v2: np.ndarray = np.array(
        [lower_post[0] - shot_pos_x, lower_post[1] - shot_pos_y], dtype=np.float64
    )

    # atan2(|v1 x v2|, v1 . v2) gives the same angle as arccos of the normalised
    # dot product, but needs no division by the vector norms (no NaN for a
    # zero-length vector) and cannot leave the valid domain through rounding
    # (arccos returns NaN for a cosine of 1.0000000000000002).
    cross = v1[0] * v2[1] - v1[1] * v2[0]
    dot = np.dot(v1, v2)
    return np.arctan2(np.abs(cross), dot)
# Evaluate get_shot_angle once per shot. Iterating over the two coordinate
# columns positionally avoids building a Series for every row (as
# DataFrame.apply(axis=1) does) and also works when the frame is empty.
# NOTE: the column name keeps its original spelling ("form") because it is the
# name written to the exported CSV.
all_df["shot_angle_form_the_goal"] = np.fromiter(
    (
        get_shot_angle(shot_x, shot_y)
        for shot_x, shot_y in zip(all_df["start_x"], all_df["start_y"])
    ),
    dtype=np.float64,
    count=len(all_df),
)


all_df.head()

Unnamed: 0,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,action_id,type_name,result_name,bodypart_name,player_name,shot_distance_from_goal,shot_angle_form_the_goal
34,94.595788,1609,25413,92.4,40.12,105.0,37.4,34,shot,success,foot_right,A. Lacazette,14.007655,0.509981
39,179.854785,1631,26150,89.25,32.64,105.0,40.8,39,shot,fail,foot_left,R. Mahrez,15.808608,0.494098
58,254.745027,1631,14763,100.8,32.64,105.0,34.0,58,shot,success,head/other,S. Okazaki,4.414703,1.46731
84,425.824035,1609,7868,85.05,45.56,105.0,40.8,84,shot,fail,foot_left,A. Oxlade-Chamberlain,23.057235,0.300168
176,815.462015,1609,7868,78.75,47.6,105.0,37.4,176,shot,fail,foot_right,A. Oxlade-Chamberlain,29.563872,0.24003


In [6]:
# Persist the shot table with the engineered features. index=False (the
# documented boolean form) leaves the row labels out of the file.
all_df.to_csv("../data/processed/df.csv", index=False)