# Group 1 Throw-In Project

In [None]:
# pip install tqdm
# pip install statsbombpy

In [14]:
from typing import Callable, Iterable, Optional

import pandas as pd
from statsbombpy import sb
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Import Women's Data

In [11]:
# Match list for competition 37, season 90 (the notebook output further down
# shows these matches as season 2020/2021).
WomenSL = sb.matches(
    competition_id=37,
    season_id=90,
)




In [None]:
def build_events_dataset(
    matches_df: pd.DataFrame,
    match_id_col: str = "match_id",
    save_csv: Optional[str] = None,
    keep_columns: Optional[Iterable[str]] = None,
    add_meta: bool = True,
    fetch_events: Optional[Callable[[int], pd.DataFrame]] = None,
    show_progress: bool = True,
) -> pd.DataFrame:
    """Fetch events for every match in ``matches_df`` and stack them into one frame.

    Each distinct, non-missing match id is fetched once. Matches whose fetch
    raises are skipped and reported at the end instead of aborting the run.

    Parameters
    ----------
    matches_df : DataFrame containing at least ``match_id_col``.
    match_id_col : name of the match id column.
    save_csv : path to save a CSV file (nothing is written when falsy).
    keep_columns : event columns to keep, in the given order (None = keep all).
        ``match_id_col`` is always kept; names missing from the events are
        ignored. A single string is treated as one column name.
    add_meta : merge the match metadata columns that exist in ``matches_df``
        (season, competition, date, week, teams) onto the events.
    fetch_events : callable taking an ``int`` match id and returning that
        match's events as a DataFrame. Defaults to ``sb.events(match_id=...)``.
    show_progress : wrap the fetch loop in a ``tqdm`` progress bar.

    Returns
    -------
    DataFrame of all events with ``match_id_col`` (int64) and optional metadata.

    Raises
    ------
    KeyError : ``match_id_col`` is not a column of ``matches_df``.
    RuntimeError : no match could be loaded at all.
    """
    if match_id_col not in matches_df.columns:
        raise KeyError(f"`{match_id_col}` not in matches_df columns")

    if fetch_events is not None:
        fetch = fetch_events
    else:
        fetch = lambda match_id: sb.events(match_id=match_id)  # noqa: E731

    # Rows without an id can be neither fetched nor joined, and a NaN would
    # make the int64 cast below raise, so they are dropped up front.
    has_id = matches_df[match_id_col].notna()
    match_ids = matches_df.loc[has_id, match_id_col].astype("int64").unique()

    frames = []
    failures = []
    id_iter = tqdm(match_ids, desc="Fetching events") if show_progress else match_ids
    for raw_id in id_iter:
        mid = int(raw_id)
        try:
            # assign() returns a new frame, so a frame handed out by a caching
            # fetcher is never modified in place.
            frames.append(fetch(mid).assign(**{match_id_col: mid}))
        except Exception as exc:
            # Deliberate best-effort: one bad match must not abort the batch.
            failures.append((mid, str(exc)))

    if not frames:
        # Surface the first recorded error; otherwise it would be lost here.
        detail = ""
        if failures:
            detail = f" First error: match_id={failures[0][0]}: {failures[0][1]}"
        raise RuntimeError(
            "No events were loaded. Check your match_ids or environment." + detail
        )

    events = pd.concat(frames, ignore_index=True)

    if keep_columns is not None:
        if isinstance(keep_columns, str):
            # A bare string would otherwise be iterated character by character.
            keep_columns = [keep_columns]
        # dict.fromkeys de-duplicates while preserving the caller's order
        # (a set would make the column order vary between runs).
        wanted = dict.fromkeys([*keep_columns, match_id_col])
        events = events[[c for c in wanted if c in events.columns]]

    if add_meta:
        candidate_cols = (
            match_id_col,
            "season", "season_id", "season_name",
            "competition", "competition_id", "competition_name",
            "match_date", "match_week",
            "home_team", "away_team",
        )
        meta_cols = [
            c for c in dict.fromkeys(candidate_cols) if c in matches_df.columns
        ]
        # With only the key column present the merge would add nothing.
        if len(meta_cols) > 1:
            meta = matches_df.loc[has_id, meta_cols].copy()
            # Same dtype as the events key, so the join cannot mismatch.
            meta[match_id_col] = meta[match_id_col].astype("int64")
            # One metadata row per match: conflicting duplicates would
            # otherwise multiply the event rows in the left join.
            meta = meta.drop_duplicates(subset=match_id_col)
            events = events.merge(meta, on=match_id_col, how="left")

    if save_csv:
        events.to_csv(save_csv, index=False)
        print(f"Saved events: {len(events):,} rows → {save_csv}")

    if failures:
        print(f"\n Skipped {len(failures)} matches due to errors (showing first 5):")
        for mid, msg in failures[:5]:
            print(f" match_id={mid}: {msg}")

    return events



# Build the combined events table for every match listed in WomenSL,
# keeping all event columns and attaching the match metadata.
events_df = build_events_dataset(WomenSL, keep_columns=None, add_meta=True)




In [20]:
# Quick sanity check: overall shape, then the first few distinct
# match / season / team combinations.
print(events_df.shape)
print(
    events_df[["match_id", "season", "home_team", "away_team"]]
    .drop_duplicates()
    .head()
)

(443304, 122)
       match_id     season                   home_team  \
0       3775648  2020/2021                 Aston Villa   
3811    3775609  2020/2021                 Arsenal WFC   
7392    3775633  2020/2021                 Aston Villa   
10742   3775570  2020/2021  Brighton & Hove Albion WFC   
14444   3775581  2020/2021                 Chelsea FCW   

                     away_team  
0                  Arsenal WFC  
3811       West Ham United LFC  
7392   Tottenham Hotspur Women  
10742              Everton LFC  
14444              Aston Villa  
