# Block 0 - Imports

In [1]:
import re
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import kagglehub

# Block 1 - Load Players CSV From Kaggle

In [2]:
def load_players_only() -> pd.DataFrame:
    """Fetch the ``stoney71/aflstats`` dataset via kagglehub and read its players table.

    Returns:
        The contents of ``players.csv`` exactly as ``pd.read_csv`` parses
        them; no cleaning or type casting happens here.
    """
    dataset_root = Path(kagglehub.dataset_download("stoney71/aflstats"))
    players_raw = pd.read_csv(dataset_root / "players.csv")
    # Echo the shape immediately so a changed upstream file is easy to notice.
    print("Loaded players.csv shape", players_raw.shape, flush=True)
    return players_raw

# Load the raw players table; later cells rebind and build on `players`.
players = load_players_only()

Loaded players.csv shape (1825, 6)


# Block 2 - Standardize Column Names

In [3]:
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are cleaned into plain tokens.

    Each label is converted to text, trimmed, every run of non-word
    characters is collapsed into a single underscore, and leading/trailing
    underscores are removed. The input frame is not modified.

    Non-string labels (for example integer labels from a header-less file)
    are stringified rather than raising ``AttributeError``.

    Note: distinct labels can clean to the same token (or to an empty
    string); no de-duplication is attempted here.
    """
    def _clean(label) -> str:
        # str() first so integer/tuple labels are handled like text labels.
        text = str(label).strip()
        text = re.sub(r"[^\w]+", "_", text)
        # "\w" itself matches "_", so literal runs such as "a__b" survive
        # the substitution above and are collapsed here.
        text = re.sub(r"__+", "_", text)
        return text.strip("_")

    out = df.copy()
    out.columns = [_clean(c) for c in out.columns]
    return out

# Block 3 - Schema Overview Printer

In [4]:
def schema_overview(df: pd.DataFrame, name: str = "DATAFRAME", n: int = 5) -> None:
    """Print a quick structural summary of ``df``.

    Args:
        df: Frame to describe.
        name: Label shown in the banner line next to the shape.
        n: Number of leading rows to show in the preview.

    Output, in order: an 80-character banner with the name and shape, the
    column list, the dtypes, and the first ``n`` rows.
    """
    rule = "=" * 80
    # Each tuple is one print call; multi-item tuples are space-separated
    # by print, which is what places the data on the line after the label.
    sections = (
        ("\n" + rule,),
        (f"{name} shape {df.shape}",),
        (rule,),
        ("Columns", list(df.columns)),
        ("\nDtypes\n", df.dtypes),
        ("\nHead\n", df.head(n)),
    )
    for parts in sections:
        print(*parts, flush=True)

# Block 4 - Apply Column Standardization And Print

In [5]:
# Show the labels before and after cleaning so any renaming is visible.
print("Before", list(players.columns), flush=True)
# Rebinds `players` to the renamed copy; the original labels are not kept.
players = standardize_columns(players)
print("After", list(players.columns), flush=True)

# First look at the table before any type casting.
schema_overview(players, "PLAYERS Raw")

Before ['PlayerId', 'PlayerName', 'Height', 'Weight', 'Dob', 'Position']
After ['PlayerId', 'PlayerName', 'Height', 'Weight', 'Dob', 'Position']

PLAYERS Raw shape (1825, 6)
Columns ['PlayerId', 'PlayerName', 'Height', 'Weight', 'Dob', 'Position']

Dtypes
 PlayerId       int64
PlayerName    object
Height         int64
Weight         int64
Dob           object
Position      object
dtype: object

Head
      PlayerId   PlayerName  Height  Weight         Dob  Position
0  2020654979   Jake Aarts     177      75  1994-12-08   Forward
1  2018655703  Ryan Abbott     200     100  1991-06-25      Ruck
2  2002652211  Gary Ablett     182      87  1984-05-14   Forward
3  2014651814  Blake Acres     191      90  1995-10-07  Midfield
4  2025654137    Jed Adams     196      91  2004-05-14  Defender


# Block 5 - Cast Types And Parse Dob

In [6]:
def players_cast_types(players_df: pd.DataFrame) -> pd.DataFrame:
    """Return a typed copy of the players table.

    Every step is skipped when its column is absent:

    * ``PlayerId``   -> nullable ``Int64`` (unparseable values become ``<NA>``).
    * ``PlayerName`` / ``Position`` -> whitespace-stripped text. Missing
      values stay missing instead of becoming the literal strings
      ``"nan"`` / ``"None"``, so missingness reports remain accurate.
    * ``Height`` / ``Weight`` -> numeric (unparseable values become NaN).
    * ``Dob`` -> datetime (unparseable values become NaT), plus the derived
      ``BirthYear``, ``BirthMonth`` and ``BirthDay`` columns.

    The input frame is not modified.
    """
    def _strip_text(col: pd.Series) -> pd.Series:
        # astype(str) stringifies NaN/None too; re-mask those positions so
        # they are still reported as missing downstream.
        return col.astype(str).str.strip().mask(col.isna())

    df = players_df.copy()

    if "PlayerId" in df.columns:
        df["PlayerId"] = pd.to_numeric(df["PlayerId"], errors="coerce").astype("Int64")

    for text_col in ("PlayerName", "Position"):
        if text_col in df.columns:
            df[text_col] = _strip_text(df[text_col])

    for numeric_col in ("Height", "Weight"):
        if numeric_col in df.columns:
            df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

    if "Dob" in df.columns:
        df["Dob"] = pd.to_datetime(df["Dob"], errors="coerce")
        df["BirthYear"] = df["Dob"].dt.year
        df["BirthMonth"] = df["Dob"].dt.month
        df["BirthDay"] = df["Dob"].dt.day

    return df

# Block 6 - Key Checks For PlayerId

In [7]:
def players_key_checks(players_typed: pd.DataFrame) -> None:
    """Print uniqueness diagnostics for the ``PlayerId`` key column.

    Reports the row count, the number of distinct non-missing ids, the
    number of rows with a missing id, and the number of rows that repeat an
    earlier non-missing id. When repeats exist, up to 20 of the rows that
    share an id are printed.

    Rows with a missing id are reported on their own line and are not
    counted as duplicates of one another.

    Raises:
        ValueError: if the frame has no ``PlayerId`` column.
    """
    if "PlayerId" not in players_typed.columns:
        raise ValueError("Missing PlayerId")

    ids = players_typed["PlayerId"]
    has_id = ids.notna()

    n_rows = len(players_typed)
    n_unique = int(ids.nunique(dropna=True))
    n_missing = int((~has_id).sum())
    # duplicated() treats missing values as equal to each other; restrict
    # the count to present ids so it is consistent with nunique(dropna=True).
    dup_ids = int((has_id & ids.duplicated()).sum())

    print("Rows", n_rows, flush=True)
    print("Unique PlayerId", n_unique, flush=True)
    print("Missing PlayerId rows", n_missing, flush=True)
    print("Duplicate PlayerId rows", dup_ids, flush=True)

    if dup_ids > 0:
        # keep=False flags every member of a repeated id, not just the later ones.
        repeated = has_id & ids.duplicated(keep=False)
        sample = players_typed.loc[repeated].head(20)
        print("Sample duplicate PlayerId rows\n", sample, flush=True)

# Block 7 - Missingness Report

In [8]:
def players_missingness(players_typed: pd.DataFrame) -> None:
    """Print the fraction of missing entries per column, highest rate first."""
    null_share = players_typed.isna().mean()
    ranked = null_share.sort_values(ascending=False)
    print("Missing rate by column\n", ranked, flush=True)

# Block 8 - Duplicate Row Check

In [9]:
def players_duplicate_rows(players_typed: pd.DataFrame) -> None:
    """Print how many rows are exact repeats of an earlier row (all columns equal)."""
    repeat_flags = players_typed.duplicated()
    print("Exact duplicate rows", int(repeat_flags.sum()), flush=True)

# Block 9 - Sanity Checks For Height Weight Dob

In [10]:
def players_sanity_checks(players_typed: pd.DataFrame) -> None:
    """Print range diagnostics for Height, Weight and Dob (each only if present).

    For Height and Weight: a ``describe`` summary with tail percentiles and
    the count of values outside 150-230 and 50-140 respectively (bounds are
    inclusive, i.e. a value equal to a bound is in range). For Dob: the
    earliest and latest value and the number of missing entries.
    """
    tail_percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
    available = players_typed.columns

    if "Height" in available:
        heights = players_typed["Height"]
        print("Height summary\n", heights.describe(percentiles=tail_percentiles), flush=True)
        height_outside = heights.lt(150) | heights.gt(230)
        print("Height out of range count", int(height_outside.sum()), flush=True)

    if "Weight" in available:
        weights = players_typed["Weight"]
        print("\nWeight summary\n", weights.describe(percentiles=tail_percentiles), flush=True)
        weight_outside = weights.lt(50) | weights.gt(140)
        print("Weight out of range count", int(weight_outside.sum()), flush=True)

    if "Dob" in available:
        births = players_typed["Dob"]
        print("\nDob min", births.min(), flush=True)
        print("Dob max", births.max(), flush=True)
        print("Dob missing count", int(births.isna().sum()), flush=True)

# Block 10 - Clean Position Into PositionClean

In [11]:
def clean_player_position(players_typed: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with a ``PositionClean`` column derived from ``Position``.

    The raw text is stringified, trimmed and lower-cased, then bucketed by
    the first matching keyword in priority order: "ruck", "mid", "for",
    "def". Blank values, the placeholder strings "nan"/"none"/"null", and
    anything that matches no keyword become "Other". When the frame has no
    ``Position`` column every row is labelled "Other".
    """
    result = players_typed.copy()

    if "Position" not in result.columns:
        result["PositionClean"] = "Other"
        return result

    # Order matters: a combined label such as "ruck-forward" resolves to the
    # earliest keyword in this sequence.
    keyword_to_label = (
        ("ruck", "Ruck"),
        ("mid", "Midfield"),
        ("for", "Forward"),
        ("def", "Defender"),
    )

    def _bucket(raw: str) -> str:
        token = raw.strip()
        if token == "":
            return "Other"
        return next(
            (label for keyword, label in keyword_to_label if keyword in token),
            "Other",
        )

    normalized = result["Position"].astype(str).str.strip().str.lower()
    # Stringified missing values are blanked so they fall into "Other".
    normalized = normalized.replace({"nan": "", "none": "", "null": ""})

    result["PositionClean"] = normalized.map(_bucket)
    return result

# Block 11 - Build Players Preprocessor

In [12]:
def build_players_preprocessor(players_clean: pd.DataFrame) -> Tuple[Pipeline, List[str], List[str]]:
    """Assemble (but do not fit) the preprocessing pipeline for the players table.

    Numeric columns (whichever of Height, Weight, BirthYear, BirthMonth,
    BirthDay are present) are median-imputed. ``PositionClean``, when
    present, is imputed with the most frequent value and one-hot encoded
    into a dense array, ignoring categories unseen at fit time. All other
    columns are dropped by the transformer.

    Returns:
        ``(pipeline, numeric_columns, categorical_columns)`` where the
        pipeline has a single step named ``"pre"``.
    """
    available = players_clean.columns
    num_cols = [
        name
        for name in ("Height", "Weight", "BirthYear", "BirthMonth", "BirthDay")
        if name in available
    ]
    cat_cols = ["PositionClean"] if "PositionClean" in available else []

    numeric_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ])
    categorical_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    column_router = ColumnTransformer(
        transformers=[
            ("num", numeric_branch, num_cols),
            ("cat", categorical_branch, cat_cols),
        ],
        remainder="drop",
    )
    pipe = Pipeline(steps=[("pre", column_router)])

    print("Numeric columns", num_cols, flush=True)
    print("One hot columns", cat_cols, flush=True)

    return pipe, num_cols, cat_cols

# Block 12 - Fix Impossible Height And Weight Values

In [13]:
def fix_height_weight_outliers(
    players_clean: pd.DataFrame,
    height_range: Tuple[float, float] = (150, 230),
    weight_range: Tuple[float, float] = (50, 140),
) -> pd.DataFrame:
    """Return a copy in which implausible Height/Weight values are set to NaN.

    Args:
        players_clean: Players table; ``Height`` and ``Weight`` are each
            optional and are processed only when present.
        height_range: Inclusive ``(low, high)`` bounds for Height.
        weight_range: Inclusive ``(low, high)`` bounds for Weight.

    For every processed column the resulting NaN count is printed. A frame
    lacking one of the columns is handled without error (no line is printed
    for the absent column).
    """
    df = players_clean.copy()

    for column, (low, high) in (("Height", height_range), ("Weight", weight_range)):
        if column not in df.columns:
            continue
        values = df[column]
        # mask() returns a new, upcast-as-needed Series; assigning NaN in
        # place into an integer column is deprecated in recent pandas.
        df[column] = values.mask((values < low) | (values > high))
        print(f"{column} NaNs after outlier fix:", int(df[column].isna().sum()), flush=True)

    return df

# Block 13 - Main Runner For Players Cleaning And EDA

In [14]:
def main_players_eda(players_raw: pd.DataFrame):
    """Run the players cleaning/EDA sequence end to end.

    Steps, in order: print the raw schema, cast types and print the typed
    schema, run the key / missingness / duplicate-row / sanity reports on
    the typed table, derive ``PositionClean``, blank out-of-range
    Height/Weight values, print the ``PositionClean`` distribution, and
    build the preprocessing pipeline.

    Returns:
        ``(players_clean, players_preprocessor)`` - the cleaned table and
        the preprocessing pipeline built for it.
    """
    schema_overview(players_raw, "PLAYERS Raw")

    typed = players_cast_types(players_raw)
    schema_overview(typed, "PLAYERS Typed")

    # The diagnostic reports only print; they all read the same typed frame.
    for report in (
        players_key_checks,
        players_missingness,
        players_duplicate_rows,
        players_sanity_checks,
    ):
        report(typed)

    cleaned = fix_height_weight_outliers(clean_player_position(typed))

    if "PositionClean" in cleaned.columns:
        position_counts = cleaned["PositionClean"].value_counts(dropna=False)
        print("\nPositionClean distribution\n", position_counts, flush=True)

    # The column lists are already printed by the builder and are not needed here.
    preprocessor, _num_cols, _cat_cols = build_players_preprocessor(cleaned)

    return cleaned, preprocessor

# Block 14 - Run The Players Notebook Pipeline

In [15]:
# Run the whole cleaning/EDA pass; keep the cleaned table and the preprocessing
# pipeline (built here, fitted later) for the following cells.
players_clean, players_preprocessor = main_players_eda(players)


PLAYERS Raw shape (1825, 6)
Columns ['PlayerId', 'PlayerName', 'Height', 'Weight', 'Dob', 'Position']

Dtypes
 PlayerId       int64
PlayerName    object
Height         int64
Weight         int64
Dob           object
Position      object
dtype: object

Head
      PlayerId   PlayerName  Height  Weight         Dob  Position
0  2020654979   Jake Aarts     177      75  1994-12-08   Forward
1  2018655703  Ryan Abbott     200     100  1991-06-25      Ruck
2  2002652211  Gary Ablett     182      87  1984-05-14   Forward
3  2014651814  Blake Acres     191      90  1995-10-07  Midfield
4  2025654137    Jed Adams     196      91  2004-05-14  Defender

PLAYERS Typed shape (1825, 9)
Columns ['PlayerId', 'PlayerName', 'Height', 'Weight', 'Dob', 'Position', 'BirthYear', 'BirthMonth', 'BirthDay']

Dtypes
 PlayerId               Int64
PlayerName            object
Height                 int64
Weight                 int64
Dob           datetime64[ns]
Position              object
BirthYear              i

# Block 15 - Fit And Transform Players Features

In [16]:
def make_players_feature_matrix(players_clean: pd.DataFrame, players_preprocessor: Pipeline):
    """Fit the preprocessor on the players table and return the feature matrix.

    Args:
        players_clean: Cleaned players table. The identifier-like columns
            PlayerId, PlayerName, Dob and Position are removed (when
            present) before the data reaches the preprocessor.
        players_preprocessor: Pipeline to fit. It is fitted in place by
            ``fit_transform``, so the caller's object is fitted afterwards.

    Returns:
        ``(X, feat_names)`` where ``X`` is the transformed matrix and
        ``feat_names`` is the list of output feature names, or ``None``
        when the names cannot be obtained from the ``"pre"`` step.
    """
    id_like = [c for c in ["PlayerId", "PlayerName", "Dob", "Position"] if c in players_clean.columns]
    # drop() already returns a new frame, so the input is left untouched.
    X_input = players_clean.drop(columns=id_like)

    X = players_preprocessor.fit_transform(X_input)

    try:
        raw_names = players_preprocessor.named_steps["pre"].get_feature_names_out()
        feat_names = [str(x) for x in raw_names]
    except Exception as exc:
        # Feature names are best-effort, but say why they are missing rather
        # than silently returning None.
        feat_names = None
        print("Feature names unavailable:", repr(exc), flush=True)

    print("X shape:", X.shape, flush=True)
    if feat_names is not None:
        print("Feature name sample:", feat_names[:15], flush=True)

    return X, feat_names

# Fits `players_preprocessor` as a side effect (fit_transform is called inside).
X_players, player_feature_names = make_players_feature_matrix(players_clean, players_preprocessor)

X shape: (1825, 10)
Feature name sample: ['num__Height', 'num__Weight', 'num__BirthYear', 'num__BirthMonth', 'num__BirthDay', 'cat__PositionClean_Defender', 'cat__PositionClean_Forward', 'cat__PositionClean_Midfield', 'cat__PositionClean_Other', 'cat__PositionClean_Ruck']


# Block 16 - Derived Player Features (BMI And IsRuck Flag)

In [17]:
def add_players_derived_features(players_clean: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the table with derived ``BMI`` and ``IsRuck`` columns.

    ``BMI`` is Weight divided by the square of Height / 100 and is added
    only when both Height and Weight exist. ``IsRuck`` is 1 where
    ``PositionClean`` equals "Ruck" and 0 otherwise, added only when
    ``PositionClean`` exists. The input frame is not modified.
    """
    enriched = players_clean.copy()

    if {"Height", "Weight"}.issubset(enriched.columns):
        # Height / 100 is used as metres, which presumes Height is stored
        # in centimetres and Weight in kilograms.
        height_m = enriched["Height"].div(100.0)
        enriched["BMI"] = enriched["Weight"].div(height_m.pow(2))

    if "PositionClean" in enriched.columns:
        enriched["IsRuck"] = enriched["PositionClean"].eq("Ruck").astype(int)

    return enriched

# BMI and IsRuck are added after the feature matrix was built, so they are
# carried in the saved table but are not columns of X_players.
players_clean = add_players_derived_features(players_clean)
# Spot-check the derived columns next to their inputs.
print(players_clean[["Height", "Weight", "BMI", "PositionClean", "IsRuck"]].head(), flush=True)

   Height  Weight        BMI PositionClean  IsRuck
0   177.0    75.0  23.939481       Forward       0
1   200.0   100.0  25.000000          Ruck       1
2   182.0    87.0  26.264944       Forward       0
3   191.0    90.0  24.670376      Midfield       0
4   196.0    91.0  23.688047      Defender       0


# Block 17 - Save Processed Players And Feature Names

In [18]:
from pathlib import Path
import json

def save_players_outputs(players_clean: pd.DataFrame, feature_names: List[str], out_dir: str = "data/processed") -> None:
    """Persist the cleaned players table and the feature-name list.

    Args:
        players_clean: Table written to ``players_clean.parquet`` (without
            the index).
        feature_names: Value serialised as JSON to
            ``players_feature_names.json`` (indented by two spaces).
        out_dir: Destination directory; created, including parents, when it
            does not exist yet.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    table_path = target_dir / "players_clean.parquet"
    names_path = target_dir / "players_feature_names.json"

    players_clean.to_parquet(table_path, index=False)

    with open(names_path, "w") as handle:
        json.dump(feature_names, handle, indent=2)

    # Report both artefacts only after both writes have succeeded.
    for written in (table_path, names_path):
        print("Saved", written, flush=True)

# Write the cleaned table and feature names to the default data/processed directory.
save_players_outputs(players_clean, player_feature_names)

Saved data/processed/players_clean.parquet
Saved data/processed/players_feature_names.json
