# Pip Installs, Packages, + Load Data

In [2]:
 %pip install tqdm statsbombpy -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
# General Imports
import pandas as pd
from statsbombpy import sb

# Specific imports for this data
from typing import Optional, Iterable
from statsbombpy import sb
from tqdm.auto import tqdm

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# 2020/2021 Women's Super League (Prem for females)
WomenSL = sb.matches(competition_id=37, season_id=90)

# Print cols + head
print(WomenSL.columns)
WomenSL.head()

Index(['match_id', 'match_date', 'kick_off', 'competition', 'season',
       'home_team', 'away_team', 'home_score', 'away_score', 'match_status',
       'match_status_360', 'last_updated', 'last_updated_360', 'match_week',
       'competition_stage', 'stadium', 'referee', 'home_managers',
       'away_managers', 'data_version', 'shot_fidelity_version',
       'xy_fidelity_version'],
      dtype='object')


Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,...,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3775648,2021-02-28,15:00:00.000,England - FA Women's Super League,2020/2021,Aston Villa,Arsenal WFC,0,4,available,...,2021-06-13T16:17:31.694,11,Regular Season,Poundland Bescot Stadium,Lisa Benn,Marcus Bignot,Joseph Montemurro,1.1.0,2,2
1,3775609,2021-04-28,20:30:00.000,England - FA Women's Super League,2020/2021,Arsenal WFC,West Ham United LFC,2,0,available,...,2021-06-13T16:17:31.694,13,Regular Season,Meadow Park,Emily Heaslip,Joseph Montemurro,Olli Harder,1.1.0,2,2
2,3775633,2021-02-06,13:30:00.000,England - FA Women's Super League,2020/2021,Aston Villa,Tottenham Hotspur Women,1,0,available,...,2021-06-13T16:17:31.694,14,Regular Season,Poundland Bescot Stadium,Jane Simms,Marcus Bignot,Rehanne Skinner,1.1.0,2,2
3,3775570,2021-03-28,13:30:00.000,England - FA Women's Super League,2020/2021,Brighton & Hove Albion WFC,Everton LFC,0,5,available,...,2021-06-13T16:17:31.694,18,Regular Season,Broadfield Stadium,Christina Hattersley,Hope Patricia Powell,Willie Kirk,1.1.0,2,2
4,3775581,2021-03-28,15:30:00.000,England - FA Women's Super League,2020/2021,Chelsea FCW,Aston Villa,2,0,available,...,2021-06-13T16:17:31.694,18,Regular Season,Kingsmeadow,Sarah Garratt,Emma Hayes,Marcus Bignot,1.1.0,2,2


# Event Data Cleaning

In [None]:
# Black box copilot code 

# Load Event Data for Women's Super League

def build_events_dataset(
    matches_df: pd.DataFrame,
    match_id_col: str = "match_id",
    save_csv: Optional[str] = None,
    keep_columns: Optional[Iterable[str]] = None,
    add_meta: bool = True,
    ) -> pd.DataFrame:

# Fetch events for all match_ids in `matches_df`, concatenate into one DataFrame,
# and (optionally) merge key match metadata columns.

# Parameters
# ----------
# matches_df : DataFrame containing at least `match_id`
# match_id_col : name of the match id column
# save_parquet : path to save a Parquet file (recommended)
# save_csv : path to save a CSV file
# keep_columns : iterable of event columns to keep (None = keep all)
# add_meta : merge match metadata columns (season, teams, date, week, competition)

# Returns
# -------
# DataFrame of all events with `match_id` and optional metadata.

    if match_id_col not in matches_df.columns:
        raise KeyError(f"`{match_id_col}` not in matches_df columns")


    meta_cols = [
    match_id_col,
    "season", "season_id", "season_name",
    "competition", "competition_id", "competition_name",
    "match_date", "match_week",
    "home_team", "away_team"
    ]
    meta = matches_df[[c for c in meta_cols if c in matches_df.columns]].drop_duplicates()

    all_frames = []
    failures = []

    mids = pd.unique(matches_df[match_id_col].astype("int64"))
    for mid in tqdm(mids, desc="Fetching events"):
        try:
            ev = sb.events(match_id=int(mid))
            ev[match_id_col] = int(mid)
            all_frames.append(ev)
        except Exception as e:
            failures.append((int(mid), str(e)))

    if not all_frames:
        raise RuntimeError("No events were loaded. Check your match_ids or environment.")

    events = pd.concat(all_frames, ignore_index=True)


    if keep_columns is not None:
        keep = list(set(keep_columns) | {match_id_col})
        events = events[[c for c in keep if c in events.columns]]


    if add_meta and not meta.empty:
        events = events.merge(meta, on=match_id_col, how="left")


    if save_csv:
        events.to_csv(save_csv, index=False)
        print(f"Saved events: {len(events):,} rows → {save_csv}")

    if failures:
        print(f"\n Skipped {len(failures)} matches due to errors (showing first 5):")
    for mid, msg in failures[:5]:
        print(f" match_id={mid}: {msg}")

    return events



events_df = build_events_dataset(
WomenSL,
keep_columns=None,
add_meta=True
)

Fetching events:   0%|          | 0/131 [00:00<?, ?it/s]

In [43]:
# Inspect Event Data Frame Head
pd.set_option('display.max_columns', None)
print(events_df.shape)

# Sort cols alphabetically
events_df = events_df[sorted(events_df.columns)]
events_df.head()

(443304, 122)


Unnamed: 0,50_50,away_team,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block_deflection,block_offensive,block_save_block,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,clearance_other,clearance_right_foot,competition,counterpress,dribble_no_touch,dribble_nutmeg,dribble_outcome,dribble_overrun,duel_outcome,duel_type,duration,foul_committed_advantage,foul_committed_card,foul_committed_offensive,foul_committed_penalty,foul_committed_type,foul_won_advantage,foul_won_defensive,foul_won_penalty,goalkeeper_body_part,goalkeeper_end_location,goalkeeper_lost_in_play,goalkeeper_lost_out,goalkeeper_outcome,goalkeeper_position,goalkeeper_punched_out,goalkeeper_saved_to_post,goalkeeper_shot_saved_off_target,goalkeeper_shot_saved_to_post,goalkeeper_success_in_play,goalkeeper_technique,goalkeeper_type,half_start_late_video_start,home_team,id,index,injury_stoppage_in_chain,interception_outcome,location,match_date,match_id,match_week,minute,miscontrol_aerial_won,off_camera,out,pass_aerial_won,pass_angle,pass_assisted_shot_id,pass_body_part,pass_cross,pass_cut_back,pass_deflected,pass_end_location,pass_goal_assist,pass_height,pass_inswinging,pass_length,pass_miscommunication,pass_no_touch,pass_outcome,pass_outswinging,pass_recipient,pass_recipient_id,pass_shot_assist,pass_straight,pass_switch,pass_technique,pass_through_ball,pass_type,period,play_pattern,player,player_id,position,possession,possession_team,possession_team_id,related_events,season,second,shot_aerial_won,shot_body_part,shot_deflected,shot_end_location,shot_first_time,shot_follows_dribble,shot_freeze_frame,shot_key_pass_id,shot_one_on_one,shot_open_goal,shot_outcome,shot_redirect,shot_saved_off_target,shot_saved_to_post,shot_statsbomb_xg,shot_technique,shot_type,substitution_outcome,substitution_outcome_id,substitution_replacement,substitution_replacement_id,tactics,team,team_id,timestamp,type,under_pressure
0,,Arsenal WFC,,,,,,,,,,,,,,,England - FA Women's Super League,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,Aston Villa,f51b1630-d1a8-4837-97b0-de862f0e299a,1,,,,2021-02-28,3775648,11,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Aston Villa,2647,,2020/2021,0,,,,,,,,,,,,,,,,,,,,,,"{'formation': 352, 'lineup': [{'player': {'id'...",Aston Villa,2647,00:00:00.000,Starting XI,
1,,Arsenal WFC,,,,,,,,,,,,,,,England - FA Women's Super League,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,Aston Villa,1bbead8d-7790-4898-a765-3cdffeaf966e,2,,,,2021-02-28,3775648,11,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Aston Villa,2647,,2020/2021,0,,,,,,,,,,,,,,,,,,,,,,"{'formation': 433, 'lineup': [{'player': {'id'...",Arsenal WFC,968,00:00:00.000,Starting XI,
2,,Arsenal WFC,,,,,,,,,,,,,,,England - FA Women's Super League,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,Aston Villa,ab7e55e6-f6fe-4705-a159-942f8d82c04f,3,,,,2021-02-28,3775648,11,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Aston Villa,2647,[f5d78455-7375-46d4-a414-7580f9f8328a],2020/2021,0,,,,,,,,,,,,,,,,,,,,,,,Arsenal WFC,968,00:00:00.000,Half Start,
3,,Arsenal WFC,,,,,,,,,,,,,,,England - FA Women's Super League,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,Aston Villa,f5d78455-7375-46d4-a414-7580f9f8328a,4,,,,2021-02-28,3775648,11,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Regular Play,,,,1,Aston Villa,2647,[ab7e55e6-f6fe-4705-a159-942f8d82c04f],2020/2021,0,,,,,,,,,,,,,,,,,,,,,,,Aston Villa,2647,00:00:00.000,Half Start,
4,,Arsenal WFC,,,,,,,,,,,,,,,England - FA Women's Super League,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,Aston Villa,d89a38b6-4564-4cb2-a499-b2194c85a604,1900,,,,2021-02-28,3775648,11,45,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,Regular Play,,,,74,Aston Villa,2647,[d77921a3-985c-4b7e-ba25-1ac994eb531c],2020/2021,0,,,,,,,,,,,,,,,,,,,,,,,Arsenal WFC,968,00:00:00.000,Half Start,


In [None]:
# Feature Engineering 
events_df['clearance'] = events_df[['clearance_aerial_won', 'clearance_body_part', 'clearance_head', 'clearance_left_foot', 'clearance_other']].any(axis=1) ## clearances currently have 5 columns, we only need to know if a clearance occurred, not the specifics of it
events_df['block'] = events_df[['block_deflection', 'block_offensive', 'block_save_block']].any(axis=1) ## clearances currently have 5 columns, we only need to know if a clearance occurred, not the specifics of it


# Useless columns for possession retention modeling
cols_to_remove = [
    # Match info – all values are useless
    'season', 'competition', 'match_date', 'match_week', 'home_team', 'away_team', 

    # Substitution specifics – irrelevant, whether or not a sub occurs captured in "type" col
    'substitution_replacement', 'substitution_replacement_id', 'substitution_outcome_id', 'substitution_outcome', 

    # Player info – the name or ID of player involved is irrelevant
    'player', 'player_id', 

    # Event ID – we already have match id and can sort chronologically
    'id', 
    'related_events', # long uninterpretable id for related events

    # Tactics / team formation – irrelevant
    'tactics', 
    'team_id', 'team', # don't need team name and id

    # Duration / timing 
    'duration', 
    'minute','second','timestamp', # all useless since each event is already chronologically captured in index col by game starting at 1
    'possession_team', # name of team with possession irrelevant, we have possession col to indicate change in possession

    # Off-camera – col is useless
    'off_camera', 

    # Clearance info – already made generic clearance col
    'clearance_aerial_won', 'clearance_body_part', 'clearance_head', 'clearance_left_foot', 'clearance_other', 

    # Block info – already made generic block col
    'block_deflection', 'block_offensive', 'block_save_block', 

    # Goalkeeper info
    'goalkeeper_position', 'goalkeeper_end_location', 'goalkeeper_technique'
]

# Remove columns
events_df_clean = events_df.drop(columns=cols_to_remove)

# Sort df alphabetically
events_df_clean = events_df_clean[sorted(events_df_clean.columns)]
pd.set_option('display.max_columns', None)
print(events_df_clean.shape)
events_df_clean.head(5)


(443304, 89)


Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block,carry_end_location,clearance,clearance_right_foot,counterpress,dribble_no_touch,dribble_nutmeg,dribble_outcome,dribble_overrun,duel_outcome,duel_type,foul_committed_advantage,foul_committed_card,foul_committed_offensive,foul_committed_penalty,foul_committed_type,foul_won_advantage,foul_won_defensive,foul_won_penalty,goalkeeper_body_part,goalkeeper_lost_in_play,goalkeeper_lost_out,goalkeeper_outcome,goalkeeper_punched_out,goalkeeper_saved_to_post,goalkeeper_shot_saved_off_target,goalkeeper_shot_saved_to_post,goalkeeper_success_in_play,goalkeeper_type,half_start_late_video_start,index,injury_stoppage_in_chain,interception_outcome,location,match_id,miscontrol_aerial_won,out,pass_aerial_won,pass_angle,pass_assisted_shot_id,pass_body_part,pass_cross,pass_cut_back,pass_deflected,pass_end_location,pass_goal_assist,pass_height,pass_inswinging,pass_length,pass_miscommunication,pass_no_touch,pass_outcome,pass_outswinging,pass_recipient,pass_recipient_id,pass_shot_assist,pass_straight,pass_switch,pass_technique,pass_through_ball,pass_type,play_pattern,position,possession,possession_team_id,shot_aerial_won,shot_body_part,shot_deflected,shot_end_location,shot_first_time,shot_follows_dribble,shot_freeze_frame,shot_key_pass_id,shot_one_on_one,shot_open_goal,shot_outcome,shot_redirect,shot_saved_off_target,shot_saved_to_post,shot_statsbomb_xg,shot_technique,shot_type,type,under_pressure
0,,,,,,False,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,,,3775648,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Play,,1,2647,,,,,,,,,,,,,,,,,,Starting XI,
1,,,,,,False,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,3775648,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Play,,1,2647,,,,,,,,,,,,,,,,,,Starting XI,
2,,,,,,False,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,,,,3775648,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Play,,1,2647,,,,,,,,,,,,,,,,,,Half Start,
3,,,,,,False,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,,,,3775648,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Play,,1,2647,,,,,,,,,,,,,,,,,,Half Start,
4,,,,,,False,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,1900,,,,3775648,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Play,,74,2647,,,,,,,,,,,,,,,,,,Half Start,


In [68]:
events_df_clean['goalkeeper_body_part'].value_counts()

Both Hands    586
Right Foot    139
Right Hand     76
Left Foot      73
Left Hand      44
Chest          14
Head            8
Name: goalkeeper_body_part, dtype: int64

In [None]:
events_df_clean['goalkeeper_lost_in_play'].value_counts() # goalie lost possession

True    1
Name: goalkeeper_lost_in_play, dtype: int64

In [None]:
events_df_clean['goalkeeper_lost_out'].value_counts() # goalie lost possession out of bounds

True    1
Name: goalkeeper_lost_out, dtype: int64

In [72]:
events_df_clean['goalkeeper_outcome'].value_counts()

Success            616
No Touch           357
In Play Danger     197
In Play Safe       142
Touched Out        140
Clear              110
Claim              102
Saved Twice         71
Touched In          39
Punched out         35
Fail                33
Collected Twice     16
Success In Play      6
Won                  4
Lost Out             1
Lost In Play         1
Name: goalkeeper_outcome, dtype: int64

In [74]:
events_df_clean['goalkeeper_punched_out'].value_counts()

True    35
Name: goalkeeper_punched_out, dtype: int64

In [75]:
events_df_clean['goalkeeper_saved_to_post'].value_counts()

True    1
Name: goalkeeper_saved_to_post, dtype: int64

In [76]:
events_df_clean['goalkeeper_shot_saved_off_target'].value_counts()

True    38
Name: goalkeeper_shot_saved_off_target, dtype: int64

In [77]:
events_df_clean['goalkeeper_shot_saved_to_post'].value_counts()

True    10
Name: goalkeeper_shot_saved_to_post, dtype: int64

In [78]:
events_df_clean['goalkeeper_success_in_play'].value_counts()

True    6
Name: goalkeeper_success_in_play, dtype: int64

In [80]:
events_df_clean['goalkeeper_type'].value_counts()

Shot Faced               2069
Shot Saved                725
Goal Conceded             369
Collected                 293
Keeper Sweeper            212
Punch                     166
Shot Saved Off Target      38
Penalty Conceded           27
Smother                    12
Shot Saved to Post         10
Penalty Saved               9
Save                        8
Saved to Post               1
Name: goalkeeper_type, dtype: int64