In [2]:
!pip install pandas numpy scikit-learn matplotlib mplsoccer statsbombpy

Collecting pandas
  Downloading pandas-2.3.3-cp314-cp314-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.4.0-cp314-cp314-win_amd64.whl.metadata (6.6 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp314-cp314-win_amd64.whl.metadata (11 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.8-cp314-cp314-win_amd64.whl.metadata (52 kB)
Collecting mplsoccer
  Using cached mplsoccer-1.6.1-py3-none-any.whl.metadata (4.8 kB)
Collecting statsbombpy
  Using cached statsbombpy-1.16.0-py3-none-any.whl.metadata (63 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp314-cp314-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpo

In [3]:
import numpy as np
import pandas as pd

from statsbombpy import sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.calibration import calibration_curve

import matplotlib.pyplot as plt
from mplsoccer import Pitch


In [4]:
# Load the catalogue of competitions/seasons that statsbombpy exposes.
# `comps` is reused by the following cells to locate the La Liga seasons.
comps = sb.competitions()  # all open competitions/seasons available
# Display the column names so the available identifiers and filters are visible.
comps.columns

Index(['competition_id', 'season_id', 'country_name', 'competition_name',
       'competition_gender', 'competition_youth', 'competition_international',
       'season_name', 'match_updated', 'match_updated_360',
       'match_available_360', 'match_available'],
      dtype='object')

In [5]:
# Preview the first five rows of the competition catalogue.
comps.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,9,281,Germany,1. Bundesliga,male,False,False,2023/2024,2024-09-28T20:46:38.893391,2025-07-06T04:26:07.636270,2025-07-06T04:26:07.636270,2024-09-28T20:46:38.893391
1,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2024-05-19T11:11:14.192381,,,2024-05-19T11:11:14.192381
2,1267,107,Africa,African Cup of Nations,male,False,True,2023,2024-09-28T01:57:35.846538,,,2024-09-28T01:57:35.846538
3,16,4,Europe,Champions League,male,False,False,2018/2019,2025-05-08T15:10:50.835274,2021-06-13T16:17:31.694,,2025-05-08T15:10:50.835274
4,16,1,Europe,Champions League,male,False,False,2017/2018,2024-02-13T02:35:28.134882,2021-06-13T16:17:31.694,,2024-02-13T02:35:28.134882


In [6]:
# Keep only the catalogue rows whose competition name mentions "La Liga"
# (case-insensitive; rows with a missing name are treated as non-matching).
is_la_liga = comps['competition_name'].str.contains('La Liga', case=False, na=False)
laliga = comps.loc[is_la_liga]

# Show the distinct (competition, season) identifiers for the first ten seasons.
season_columns = ['competition_id', 'season_id', 'season_name']
laliga.loc[:, season_columns].drop_duplicates().head(10)

Unnamed: 0,competition_id,season_id,season_name
38,11,90,2020/2021
39,11,42,2019/2020
40,11,4,2018/2019
41,11,1,2017/2018
42,11,2,2016/2017
43,11,27,2015/2016
44,11,26,2014/2015
45,11,25,2013/2014
46,11,24,2012/2013
47,11,23,2011/2012


In [18]:
# Count how many matches are available for each La Liga season, so the
# best-covered season can be selected in the next cell.
# One `sb.matches` call is issued per distinct (competition_id, season_id) pair.
count_columns = ["competition_id", "season_id", "season_name", "n_matches"]

season_keys = laliga[["competition_id", "season_id", "season_name"]].drop_duplicates(
    subset=["competition_id", "season_id"]
)

# Collect the per-season records under their own name rather than binding
# `season_match_counts` first to a list and then to a DataFrame; itertuples
# avoids building an object-dtype Series per row as iterrows does.
season_records = [
    {
        "competition_id": season.competition_id,
        "season_id": season.season_id,
        "season_name": season.season_name,
        "n_matches": len(
            sb.matches(competition_id=season.competition_id, season_id=season.season_id)
        ),
    }
    for season in season_keys.itertuples(index=False)
]

# Explicit columns keep the frame well-formed (and sortable) even when no
# season matched the La Liga filter.
season_match_counts = pd.DataFrame(season_records, columns=count_columns).sort_values(
    "n_matches", ascending=False
)
season_match_counts.head(10)

Unnamed: 0,competition_id,season_id,season_name,n_matches
5,11,27,2015/2016,380
6,11,26,2014/2015,38
9,11,23,2011/2012,37
3,11,1,2017/2018,36
11,11,21,2009/2010,35
0,11,90,2020/2021,35
2,11,4,2018/2019,34
4,11,2,2016/2017,34
1,11,42,2019/2020,33
10,11,22,2010/2011,33


In [19]:
# `season_match_counts` is sorted by n_matches in descending order, so the
# first row is the season with the most available matches.
best = season_match_counts.iloc[0]

# Convert the identifiers to plain Python ints before passing them to statsbombpy.
competition_id, season_id = (int(best[key]) for key in ("competition_id", "season_id"))
best

competition_id           11
season_id                27
season_name       2015/2016
n_matches               380
Name: 5, dtype: object

In [20]:
# Fetch the match list of the selected season and keep the ids as a plain list.
matches = sb.matches(competition_id=competition_id, season_id=season_id)
match_ids = list(matches['match_id'])

# Sanity check: number of matches and a peek at the first three ids.
(len(match_ids), match_ids[:3])

(380, [3825848, 3825895, 3825894])

In [21]:
# Fetch the event log of every match in the selected season (one `sb.events`
# call per match id) and tag each frame with the id it was requested for.
events_list = [
    sb.events(match_id=match_id).assign(match_id=match_id)
    for match_id in match_ids
]

# Stack the per-match frames into one table with a fresh 0..n-1 index.
events = pd.concat(events_list, ignore_index=True)
events.shape