# Group 1 Throw-In Project

In [4]:
 %pip install tqdm
 %pip install statsbombpy

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
[0mSuccessfully installed tqdm-4.67.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting statsbombpy
  Downloading statsbombpy-1.16.0-py3-none-any.whl.metadata (63 kB)
Collecting requests-cache (from statsbombpy)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting inflect (from statsbombpy)
  Downloading inflect-7.5.0-py3-none-any.whl.metadata (24 kB)
Collecting more_itertools>=8.5.0 (from inflect->statsbombpy)
  Downloading more_itertools-10.8.0-py3-none-any.whl.metadata (39 kB)
Collecting typeguard>=4.0.1 (from inf

In [5]:
import pandas as pd
from statsbombpy import sb

from typing import Optional, Iterable
from statsbombpy import sb
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Import Women's Data

In [6]:
# Match list for competition 37, season 90 (the preview further down reports season "2020/2021").
# NOTE(review): presumably the FA Women's Super League — confirm against sb.competitions().
WomenSL = sb.matches(
    competition_id=37,
    season_id=90,
)




In [7]:
def build_events_dataset(
    matches_df: pd.DataFrame,
    match_id_col: str = "match_id",
    save_csv: Optional[str] = None,
    keep_columns: Optional[Iterable[str]] = None,
    add_meta: bool = True,
) -> pd.DataFrame:
    """Fetch events for every match id in ``matches_df`` and stack them into one DataFrame.

    Parameters
    ----------
    matches_df : DataFrame containing at least ``match_id_col``.
    match_id_col : name of the match id column.
    save_csv : optional path; when given, the result is written there as CSV (no index).
    keep_columns : event columns to keep, in the order given (None = keep all). The match id
        column is always kept. A single string is treated as one column name.
    add_meta : when True, left-merge the match metadata columns that exist in ``matches_df``
        (season, competition, date, week, home/away team) onto the events.

    Returns
    -------
    DataFrame of all events with ``match_id_col`` and, optionally, match metadata.

    Raises
    ------
    KeyError : ``match_id_col`` is not a column of ``matches_df``.
    RuntimeError : no match could be loaded; the first underlying error is included.
    """
    if match_id_col not in matches_df.columns:
        raise KeyError(f"`{match_id_col}` not in matches_df columns")

    meta_cols = [
        match_id_col,
        "season", "season_id", "season_name",
        "competition", "competition_id", "competition_name",
        "match_date", "match_week",
        "home_team", "away_team",
    ]
    meta = matches_df[[c for c in meta_cols if c in matches_df.columns]].drop_duplicates()

    all_frames = []
    failures = []

    mids = pd.unique(matches_df[match_id_col].astype("int64"))
    for mid in tqdm(mids, desc="Fetching events"):
        try:
            ev = sb.events(match_id=int(mid))
            ev[match_id_col] = int(mid)
            all_frames.append(ev)
        except Exception as e:
            # Best effort: one failing match must not abort the whole download.
            failures.append((int(mid), str(e)))

    if not all_frames:
        # Surface the first underlying error; otherwise the cause of a total failure is invisible.
        detail = f" First error (match_id={failures[0][0]}): {failures[0][1]}" if failures else ""
        raise RuntimeError("No events were loaded. Check your match_ids or environment." + detail)

    events = pd.concat(all_frames, ignore_index=True)

    if keep_columns is not None:
        # A bare string would otherwise be iterated character by character.
        requested = [keep_columns] if isinstance(keep_columns, str) else list(keep_columns)
        # dict.fromkeys de-duplicates while preserving the caller's order (a set would shuffle it).
        keep = list(dict.fromkeys([*requested, match_id_col]))
        events = events[[c for c in keep if c in events.columns]]

    if add_meta and not meta.empty:
        events = events.merge(meta, on=match_id_col, how="left")

    if save_csv:
        events.to_csv(save_csv, index=False)
        print(f"Saved events: {len(events):,} rows → {save_csv}")

    if failures:
        print(f"\n Skipped {len(failures)} matches due to errors (showing first 5):")
        for mid, msg in failures[:5]:
            print(f" match_id={mid}: {msg}")

    return events



# Download every match listed in WomenSL, keeping all event columns plus the match metadata.
events_df = build_events_dataset(WomenSL, keep_columns=None, add_meta=True)




Fetching events: 100%|██████████| 131/131 [00:49<00:00,  2.62it/s]


In [8]:
# Sanity check: overall size, then one row per distinct (match, season, home, away) combination.
print(events_df.shape)
print(
    events_df.loc[:, ["match_id", "season", "home_team", "away_team"]]
    .drop_duplicates()
    .head()
)

(443304, 122)
       match_id     season                   home_team  \
0       3775648  2020/2021                 Aston Villa   
3811    3775609  2020/2021                 Arsenal WFC   
7392    3775633  2020/2021                 Aston Villa   
10742   3775570  2020/2021  Brighton & Hove Albion WFC   
14444   3775581  2020/2021                 Chelsea FCW   

                     away_team  
0                  Arsenal WFC  
3811       West Ham United LFC  
7392   Tottenham Hotspur Women  
10742              Everton LFC  
14444              Aston Villa  


In [11]:
# Keep the events whose play_pattern equals 'From Throw In' and remember each row's original
# events_df index label in `event_idx` so it can be looked up again later.
# NOTE(review): play_pattern presumably labels every event of a possession that started from a
# throw-in, not only the throw-in itself — confirm this is the intended unit of analysis.
throws = (
    events_df.loc[events_df['play_pattern'].eq('From Throw In')]
    .copy()
    .assign(event_idx=lambda frame: frame.index)
)

In [12]:
# Provide a robust helper to convert an event's minute/second to absolute seconds
def _event_seconds(ev_row):
    """Return seconds from start of match for an event row (safe to missing values).
    ev_row is typically a pandas Series representing an events_df row.
    """
    # support dict-like or Series-like access
    if hasattr(ev_row, 'get'):
        minute = ev_row.get('minute', 0)
        second = ev_row.get('second', 0)
    else:
        minute = ev_row['minute'] if 'minute' in ev_row else 0
        second = ev_row['second'] if 'second' in ev_row else 0
    try:
        m = int(minute) if minute is not None else 0
    except Exception:
        m = 0
    try:
        s = int(second) if second is not None else 0
    except Exception:
        s = 0
    return m * 60 + s

# quick test (optional): uncomment to run a small check when executing this cell
# sample = {'minute': 12, 'second': 34}
# print(_event_seconds(sample))  # -> 754

In [None]:
# Build next_actions: for every row of `throws`, record up to MAX_NEXT_ACTIONS events that follow
# it in the same match, with team / pass / shot flags and the elapsed seconds since the throw row.
MAX_NEXT_ACTIONS = 3

# Fixed column list so that an empty result still has the columns later steps group and merge on.
NEXT_ACTION_COLUMNS = [
    'throw_event_idx', 'match_id', 'throw_team', 'action_number', 'action_index',
    'action_type', 'action_team', 'action_seconds', 'time_from_throw',
    'same_team_as_throw', 'is_pass', 'pass_success', 'is_shot', 'is_goal',
]


def _name_or_value(obj):
    """Return obj['name'] for a nested (dict) attribute, otherwise obj unchanged."""
    return obj.get('name') if isinstance(obj, dict) else obj


def _order_match_events(match_events):
    """Return one match's events in playing order.

    Sorts by the per-match `index` column when the frame has one; otherwise falls back to the
    DataFrame index, which is what this cell relied on before.
    """
    # NOTE(review): statsbombpy's combined events frame is presumably grouped by event type
    # rather than chronological, so row position alone may not mean "next action" — confirm.
    if 'index' in match_events.columns:
        return match_events.sort_values('index', kind='stable')
    return match_events.sort_index()


def _describe_action(act):
    """Return (action_type, is_pass, pass_success, is_shot, is_goal) for one event row.

    Handles both layouts the rest of this cell already tolerates for `team` / `type`:
    nested dict columns (`pass`, `shot`) and flattened columns (`pass_outcome`, `shot_outcome`).
    """
    action_type = _name_or_value(act.get('type'))

    pass_obj = act.get('pass')
    if isinstance(pass_obj, dict):
        is_pass = True
        pass_success = 'outcome' not in pass_obj
    else:
        is_pass = bool(action_type == 'Pass')
        # NOTE(review): assumes a missing pass outcome means a completed pass — confirm.
        pass_success = bool(pd.isna(act.get('pass_outcome'))) if is_pass else None

    shot_obj = act.get('shot')
    if isinstance(shot_obj, dict):
        is_shot = True
        is_goal = bool(_name_or_value(shot_obj.get('outcome')) == 'Goal')
    else:
        is_shot = bool(action_type == 'Shot')
        is_goal = is_shot and bool(act.get('shot_outcome') == 'Goal')

    return action_type, is_pass, pass_success, is_shot, is_goal


def _collect_next_actions(throw_rows, all_events, max_actions=MAX_NEXT_ACTIONS):
    """Return one record (dict) per follow-up event, in the order of `throw_rows`."""
    records = []
    per_match = {}  # match_id -> (ordered events, {index label -> position}); built once per match

    for _, throw in throw_rows.iterrows():
        mid = throw['match_id']
        throw_idx = throw['event_idx']
        throw_name = _name_or_value(throw.get('team'))

        # `throw_seconds` is not created anywhere in this notebook; honour it if a caller added
        # it, otherwise derive the value so time_from_throw is not always NA.
        throw_secs = throw.get('throw_seconds')
        if throw_secs is None or pd.isna(throw_secs):
            throw_secs = _event_seconds(throw)

        if mid not in per_match:
            ordered = _order_match_events(all_events[all_events['match_id'] == mid])
            positions = {label: pos for pos, label in enumerate(ordered.index)}
            per_match[mid] = (ordered, positions)
        ordered, positions = per_match[mid]

        pos = positions.get(throw_idx)
        if pos is None:
            # Throw row is not part of all_events: fall back to comparing index labels.
            later = ordered[ordered.index > throw_idx].sort_index().head(max_actions)
        else:
            later = ordered.iloc[pos + 1: pos + 1 + max_actions]

        for action_number, (_, act) in enumerate(later.iterrows(), start=1):
            action_team = _name_or_value(act.get('team'))
            action_type, is_pass, pass_success, is_shot, is_goal = _describe_action(act)
            action_secs = _event_seconds(act)
            records.append({
                'throw_event_idx': throw_idx,
                'match_id': mid,
                'throw_team': throw_name,
                'action_number': action_number,
                'action_index': act.name,
                'action_type': action_type,
                'action_team': action_team,
                'action_seconds': action_secs,
                'time_from_throw': action_secs - throw_secs,
                'same_team_as_throw': bool(action_team == throw_name),
                'is_pass': is_pass,
                'pass_success': pass_success,
                'is_shot': is_shot,
                'is_goal': is_goal,
            })
    return records


rows = _collect_next_actions(throws, events_df)
next_actions = pd.DataFrame(rows, columns=NEXT_ACTION_COLUMNS)

# Aggregate per throw_event_idx (i.e. per throw-in) to determine possession retention and other signals
def agg_throw(group):
    """Summarise the follow-up actions recorded for one throw row.

    `group` holds the next_actions rows sharing a throw_event_idx. The returned Series carries:
    the number of observed actions, whether the first 1 / 2 / 3 actions were all by the throwing
    team (pd.NA when fewer actions were observed), the action_number of the first action by
    another team, shot / goal flags for the throwing team, the smallest time_from_throw among
    that team's shots, and the mean time_from_throw.
    """
    ordered = group.sort_values('action_number')
    observed = len(ordered)
    numbers = ordered['action_number']
    same_team = ordered['same_team_as_throw']

    def _all_same_team(mask, needed):
        # Only answer when at least `needed` actions were observed for this throw.
        if observed < needed:
            return pd.NA
        return bool(same_team[mask].all())

    # First action_number at which a different team acted (NA when that never happened).
    lost = numbers[same_team == False]
    possession_lost_at = pd.NA if lost.empty else int(lost.iloc[0])

    # Shots / goals only count when taken by the throwing team.
    by_throw_team = ordered['action_team'] == ordered['throw_team'].iloc[0]
    team_shots = ordered['is_shot'] & by_throw_team
    team_goals = ordered['is_goal'] & by_throw_team

    shot_times = ordered.loc[team_shots, 'time_from_throw'].dropna()
    known_times = ordered['time_from_throw'].dropna()

    return pd.Series({
        'n_actions_observed': observed,
        'possession_retained_1': _all_same_team(numbers == 1, 1),
        'possession_retained_2': _all_same_team(numbers <= 2, 2),
        'possession_retained_3': _all_same_team(numbers <= 3, 3),
        'possession_lost_at': possession_lost_at,
        'any_shot_by_throw_team': any(team_shots),
        'any_goal_by_throw_team': any(team_goals),
        'time_to_first_shot': pd.NA if shot_times.empty else float(shot_times.min()),
        'mean_time_from_throw': pd.NA if known_times.empty else float(known_times.mean()),
    })

# One summary row per throw: run agg_throw over each throw's follow-up actions.
throw_sequences = (
    next_actions
    .groupby('throw_event_idx')
    .apply(agg_throw)
    .reset_index()
)

# Attach match_id and the raw team value from `throws`, keyed on the original event index.
meta = throws.loc[:, ['event_idx', 'match_id']].rename(columns={'event_idx': 'throw_event_idx'})
throw_sequences = (
    throw_sequences
    .merge(meta, on='throw_event_idx', how='left')
    .merge(
        throws.loc[:, ['event_idx', 'team']].rename(
            columns={'event_idx': 'throw_event_idx', 'team': 'throw_team_obj'}
        ),
        on='throw_event_idx',
        how='left',
    )
)
def _team_name(obj):
    try:
        return obj.get('name') if isinstance(obj, dict) else obj
    except Exception:
        return obj
# Resolve the raw team value to a plain name, then discard the helper column.
throw_sequences = throw_sequences.assign(
    throw_team=throw_sequences['throw_team_obj'].apply(_team_name)
).drop(columns=['throw_team_obj'])

# Example evaluator function: tune thresholds and rules here
def evaluate_throw_in(seq_row, require_retained_actions: int = 3, allow_shot_success: bool = True):
    """Decide whether one throw-level summary row counts as a success.

    A row succeeds when `possession_retained_<require_retained_actions>` is true, or — if
    `allow_shot_success` is set — when `any_shot_by_throw_team` is true.

    Parameters
    ----------
    seq_row : mapping-like row offering `.get` (dict or pandas Series).
    require_retained_actions : which `possession_retained_N` field to read.
    allow_shot_success : whether a shot by the throwing team also counts as success.

    Returns
    -------
    bool. Missing keys and missing values (None, pd.NA, NaN) count as "not successful".
    """
    def _flag(value) -> bool:
        # Missing markers are never a success; truth-testing pd.NA would raise TypeError.
        if value is None or value is pd.NA:
            return False
        if isinstance(value, float) and value != value:  # NaN
            return False
        # bool() also accepts numpy.bool_, which an `is True` identity check would reject.
        return bool(value)

    retained_key = f'possession_retained_{require_retained_actions}'
    if _flag(seq_row.get(retained_key)):
        return True
    if allow_shot_success and _flag(seq_row.get('any_shot_by_throw_team')):
        return True
    return False

# Default success rule: possession kept for 3 actions, or a shot by the throwing team.
throw_sequences['success_default'] = throw_sequences.apply(
    evaluate_throw_in,
    axis=1,
    require_retained_actions=3,
    allow_shot_success=True,
)

# Show samples
print('Action-level (next_actions) sample:')
display(next_actions.head(20))
print('\nThrow-level sequences (possession & shot metrics):')
display(throw_sequences.head(20))

# You can now filter throw_sequences by success_default or tune evaluate_throw_in to your criteria.