In [10]:
!pip install pandas numpy scikit-learn matplotlib mplsoccer statsbombpy



In [11]:
import numpy as np
import pandas as pd

from statsbombpy import sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.calibration import calibration_curve

import matplotlib.pyplot as plt
from mplsoccer import Pitch


In [12]:
# Fetch the competition/season catalogue from statsbombpy
# (all open competitions/seasons available).
comps = sb.competitions()
# Display the column labels of the catalogue as the cell output.
comps.columns

Index(['competition_id', 'season_id', 'country_name', 'competition_name',
       'competition_gender', 'competition_youth', 'competition_international',
       'season_name', 'match_updated', 'match_updated_360',
       'match_available_360', 'match_available'],
      dtype='object')

In [13]:
comps.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,9,281,Germany,1. Bundesliga,male,False,False,2023/2024,2024-09-28T20:46:38.893391,2025-07-06T04:26:07.636270,2025-07-06T04:26:07.636270,2024-09-28T20:46:38.893391
1,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2024-05-19T11:11:14.192381,,,2024-05-19T11:11:14.192381
2,1267,107,Africa,African Cup of Nations,male,False,True,2023,2024-09-28T01:57:35.846538,,,2024-09-28T01:57:35.846538
3,16,4,Europe,Champions League,male,False,False,2018/2019,2025-05-08T15:10:50.835274,2021-06-13T16:17:31.694,,2025-05-08T15:10:50.835274
4,16,1,Europe,Champions League,male,False,False,2017/2018,2024-02-13T02:35:28.134882,2021-06-13T16:17:31.694,,2024-02-13T02:35:28.134882


In [14]:
# Keep only catalogue rows whose competition name mentions "La Liga"
# (case-insensitive; rows with a missing name are treated as non-matches).
is_laliga = comps['competition_name'].str.contains('La Liga', case=False, na=False)
laliga = comps[is_laliga]

# Show up to ten distinct (competition, season) identifier rows.
season_key_cols = ['competition_id', 'season_id', 'season_name']
laliga[season_key_cols].drop_duplicates().head(10)

Unnamed: 0,competition_id,season_id,season_name
38,11,90,2020/2021
39,11,42,2019/2020
40,11,4,2018/2019
41,11,1,2017/2018
42,11,2,2016/2017
43,11,27,2015/2016
44,11,26,2014/2015
45,11,25,2013/2014
46,11,24,2012/2013
47,11,23,2011/2012


In [15]:
# One row per distinct (competition_id, season_id) pair present in `laliga`.
laliga_seasons = laliga.drop_duplicates(subset=['competition_id', 'season_id'])

# For every season, request its match list and record how many matches came back.
season_records = [
    {
        "competition_id": season["competition_id"],
        "season_id": season["season_id"],
        "season_name": season["season_name"],
        "n_matches": len(
            sb.matches(
                competition_id=season["competition_id"],
                season_id=season["season_id"],
            )
        ),
    }
    for _, season in laliga_seasons.iterrows()
]

# Tabulate the counts, largest season first, and preview the top rows.
season_match_counts = pd.DataFrame(season_records).sort_values("n_matches", ascending=False)
season_match_counts.head(10)

Unnamed: 0,competition_id,season_id,season_name,n_matches
5,11,27,2015/2016,380
6,11,26,2014/2015,38
9,11,23,2011/2012,37
3,11,1,2017/2018,36
11,11,21,2009/2010,35
0,11,90,2020/2021,35
2,11,4,2018/2019,34
4,11,2,2016/2017,34
1,11,42,2019/2020,33
10,11,22,2010/2011,33


In [16]:
# The first row of the sorted table is the season with the most matches.
best = season_match_counts.iloc[0]

# Pull both identifiers out of that row as plain Python ints.
competition_id, season_id = (
    int(best[key]) for key in ("competition_id", "season_id")
)
best

competition_id           11
season_id                27
season_name       2015/2016
n_matches               380
Name: 5, dtype: object

In [17]:
# Load the match table for the selected competition/season and collect its ids
# into a plain Python list.
matches = sb.matches(season_id=season_id, competition_id=competition_id)
match_ids = matches.loc[:, 'match_id'].tolist()

# Display how many matches there are together with the first three ids.
(len(match_ids), match_ids[:3])

(380, [3825848, 3825895, 3825894])

In [18]:
# Download the event table of every match, stamping each table with the id of
# the match it was requested for.
events_list = [
    sb.events(match_id=match_id).assign(match_id=match_id)
    for match_id in match_ids
]

# Stack the per-match tables into one frame with a fresh 0..n-1 index.
events = pd.concat(events_list, ignore_index=True)
events.shape

(1295354, 117)

In [19]:
# Select the events whose type is exactly 'Shot'; copy so that later column
# assignments do not act on a view of `events`.
is_shot = events['type'].eq('Shot')
shots = events.loc[is_shot].copy()
shots.shape

(9168, 117)

In [20]:
# Binary goal flag: 1 when the shot outcome is 'Goal', 0 for every other value
# (a missing outcome compares unequal and therefore maps to 0).
scored = shots['shot_outcome'].eq('Goal')
shots['is_goal'] = scored.astype(int)

# Show the most frequent shot outcomes.
shots['shot_outcome'].value_counts().head()

shot_outcome
Off T      2981
Saved      2216
Blocked    2081
Goal       1014
Wayward     599
Name: count, dtype: int64

In [21]:
# StatsBomb data includes shot_type values such as 'Penalty'; drop those rows
# and keep every other shot (rows with a missing shot_type are kept too).
not_penalty = shots['shot_type'].ne('Penalty')
shots = shots.loc[not_penalty].copy()
shots.shape

(9071, 118)

# Feature Engineering: Distance and Angle

In [24]:
def _coordinate(location, index):
    """Return ``location[index]`` for a list/tuple location, NaN for anything else."""
    if isinstance(location, (list, tuple)):
        return location[index]
    return np.nan


# `location` is usually a pair like [x, y]; split it into two columns.
shots['x'] = shots['location'].apply(_coordinate, args=(0,))
shots['y'] = shots['location'].apply(_coordinate, args=(1,))

# Discard shots for which either coordinate could not be extracted.
shots = shots.dropna(subset=['x', 'y']).copy()

In [25]:
# Reference point the distance is measured to (named as the goal location).
GOAL_X, GOAL_Y = 120, 40

# Straight-line (Euclidean) distance from each shot location to that point.
gap_x = GOAL_X - shots['x']
gap_y = GOAL_Y - shots['y']
shots['distance'] = np.sqrt(gap_x**2 + gap_y**2)

In [26]:
# y-coordinates used for the two goal posts.
LEFT_POST_Y, RIGHT_POST_Y = 36, 44

# Bearing (radians) from each shot location to each post, measured against the
# x-direction towards GOAL_X.
run_to_goal_line = GOAL_X - shots['x']
bearing_left = np.arctan2(LEFT_POST_Y - shots['y'], run_to_goal_line)
bearing_right = np.arctan2(RIGHT_POST_Y - shots['y'], run_to_goal_line)

# The shot angle is the absolute difference between the two bearings.
shots['angle'] = np.abs(bearing_right - bearing_left)

In [27]:
# Summary statistics for the coordinates, the engineered features and the goal flag.
summary_cols = ['x', 'y', 'distance', 'angle', 'is_goal']
shots[summary_cols].describe()

Unnamed: 0,x,y,distance,angle,is_goal
count,9071.0,9071.0,9071.0,9071.0,9071.0
mean,103.763764,39.670136,19.121925,0.440915,0.104178
std,8.705619,10.092545,8.701275,0.274636,0.305508
min,50.3,0.7,0.632456,0.00523,0.0
25%,97.5,32.1,12.138781,0.263102,0.0
50%,105.2,39.7,18.388312,0.341197,0.0
75%,110.7,47.0,25.329232,0.539195,0.0
max,120.0,78.8,74.867416,3.141593,1.0
