In [None]:
import pandas as pd

# Load the PSG event log exported by the StatsBomb download cell.
# The recorded output of this cell flags this read with a warning; low_memory=False
# makes pandas infer every column's dtype from the whole file instead of chunk by
# chunk, which is the usual source of a mixed-dtype warning on wide event tables.
df = pd.read_csv("data/psg_ligue1_2015_2016_events.csv", low_memory=False)

# Keep only shot events; copy so the column assignments below never touch `df`.
shots_df = df[df["type"] == "Shot"].copy()

# Binary target: 1 when the shot outcome is "Goal", 0 for anything else
# (a missing outcome also maps to 0).
shots_df["goal"] = shots_df["shot_outcome"].eq("Goal").astype(int)

# Columns kept in the cleaned table (model inputs, target, benchmark and metadata).
features = [
    "x", "y",
    "shot_body_part",
    "shot_technique",
    "under_pressure",
    "shot_first_time",
    "goal",  # target
    "shot_statsbomb_xg",  # for benchmark
    "team", "player", "minute", "second"
]

shots_df = shots_df[features]

# Convert the flag columns to 0/1 integers: True -> 1, False or missing -> 0.
# Comparing with True avoids calling fillna(False) on an object column, which the
# recorded output shows raising a warning on both lines.
for flag_column in ("under_pressure", "shot_first_time"):
    shots_df[flag_column] = shots_df[flag_column].eq(True).astype(int)

shots_df.to_csv("data/shots_cleaned.csv", index=False)

print(f"{len(shots_df)} shots extracted and saved to 'data/shots_cleaned.csv'")


554 shots extracted and saved to 'data/shots_cleaned.csv'


  df = pd.read_csv("data/psg_ligue1_2015_2016_events.csv")
  shots_df["under_pressure"] = shots_df["under_pressure"].fillna(False).astype(int)
  shots_df["shot_first_time"] = shots_df["shot_first_time"].fillna(False).astype(int)


In [None]:
from pathlib import Path

import pandas as pd
from statsbombpy import sb
from tqdm import tqdm

competition_id = 7   # Ligue 1
season_id = 27       # 2015/2016

# The shot-extraction cell reads the events from data/, so the export is written
# there rather than into the current working directory.
OUTPUT_PATH = Path("data") / "psg_ligue1_2015_2016_events.csv"


def _coordinate(location, axis):
    """Return component `axis` of a coordinate sequence, or NaN when unavailable.

    Missing locations (NaN/None) and sequences shorter than `axis + 1` yield NaN.
    """
    try:
        return location[axis]
    except (TypeError, IndexError):
        return float("nan")


def _add_xy_columns(events, source_col, x_col, y_col):
    """Add `x_col` and `y_col` to `events` (in place) from the `source_col` column.

    Does nothing when `source_col` is absent. Each component is extracted on its
    own, so an all-missing column or an entry with extra components no longer
    raises (which would make the caller skip the whole match).
    """
    if source_col not in events.columns:
        return
    events[x_col] = events[source_col].map(lambda loc: _coordinate(loc, 0))
    events[y_col] = events[source_col].map(lambda loc: _coordinate(loc, 1))


# Load all 2015/2016 Ligue 1 matches
matches = sb.matches(competition_id=competition_id, season_id=season_id)

# Keep only the matches where PSG is the home or the away team
psg_matches = matches[
    (matches["home_team"].str.contains("Paris", case=False, na=False)) |
    (matches["away_team"].str.contains("Paris", case=False, na=False))
]

# Extract events for PSG matches
all_psg_events = []

for match_id in tqdm(psg_matches["match_id"]):
    try:
        events = sb.events(match_id=match_id)
        # Keep only the events performed by PSG; copy before adding columns.
        psg_events = events[events["team"].str.contains("Paris", case=False, na=False)].copy()
        psg_events["match_id"] = match_id

        # Flatten the coordinate columns into scalar x / y columns.
        _add_xy_columns(psg_events, "location", "x", "y")
        _add_xy_columns(psg_events, "pass_end_location", "pass_end_x", "pass_end_y")
        _add_xy_columns(psg_events, "carry_end_location", "carry_end_x", "carry_end_y")

        all_psg_events.append(psg_events)

    except Exception as e:
        # Best effort: report the failing match and carry on with the others.
        print(f"Erreur sur le match {match_id} : {e}")
        continue

if all_psg_events:
    df_psg_all = pd.concat(all_psg_events).reset_index(drop=True)
    # to_csv does not create missing directories, so make sure data/ exists first.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df_psg_all.to_csv(OUTPUT_PATH, index=False)
    print(f" {len(df_psg_all)} événements PSG extraits avec succès.")
else:
    print(" Aucun événement PSG trouvé.")


100%|██████████| 37/37 [00:21<00:00,  1.71it/s]


✅ 93416 événements PSG extraits avec succès.


### Add new features useful for model training


In [None]:
import pandas as pd
import numpy as np

# Load the cleaned shots and append the straight-line distance from each shot
# location to the goal centre, taken as the point (120, 40).
df = pd.read_csv("data/PSG_Marseille_shots_cleaned.csv").assign(
    distance_to_goal=lambda shots: np.sqrt(
        (120 - shots["x"])**2 + (40 - shots["y"])**2
    )
)

# Angle to goal (between the posts)
def calculate_angle(x, y, goal_width=8.0):
    """Return the angle, in radians, under which the goal mouth is seen from (x, y).

    The goal is modelled as the segment on the line x = 120 centred on y = 40,
    i.e. between (120, 40 - goal_width / 2) and (120, 40 + goal_width / 2).

    Parameters
    ----------
    x, y : float or array-like
        Shot coordinates; scalars, NumPy arrays and pandas Series are accepted
        because only NumPy ufuncs are applied to them.
    goal_width : float, default 8.0
        Distance between the posts, in the same unit as the coordinates. The
        default of 8 matches the StatsBomb pitch grid (posts at y = 36 and
        y = 44); a width in metres would be inconsistent with those coordinates.

    Returns
    -------
    float or array-like
        Angle between the two lines of sight to the posts: the larger the value,
        the more of the goal is visible from the shot location.
    """
    half_width = goal_width / 2
    dx = 120 - x   # distance to the goal line along the pitch length
    dy = y - 40    # signed lateral offset from the goal centre
    # Direction to each post relative to the shooter; the difference of the two
    # bearings is the subtended angle, which shrinks as the shot moves off-centre.
    return np.arctan2(half_width - dy, dx) - np.arctan2(-half_width - dy, dx)

# Opening angle of the goal for every shot. calculate_angle only uses NumPy
# ufuncs, so it is applied to the whole columns at once instead of row by row.
df["angle_to_goal"] = calculate_angle(df["x"], df["y"])

# Area of the shot, encoded as 0/1 flags on the y coordinate:
# - central zone: 30 <= y <= 50 (both bounds included)
# - left / right side: strictly below / above y = 40, so a shot exactly on
#   y = 40 is flagged as neither side.
df["is_central_zone"] = df["y"].between(30, 50).astype(int)
df["is_left_side"] = (df["y"] < 40).astype(int)
df["is_right_side"] = (df["y"] > 40).astype(int)

# A single path variable keeps the confirmation message in sync with the file
# actually written (the message previously named a different file).
enriched_path = "data/psg_marseille_shots_enriched.csv"
df.to_csv(enriched_path, index=False)
print(f"Fichier enrichi sauvegardé dans '{enriched_path}'")


✅ Fichier enrichi sauvegardé dans 'psg_shots_enriched.csv'
