In [None]:
# Part 1 — Load the Data and Pick Features

In this step we:
- read the `mydata.csv` file
- drop non-feature columns (`date`, `stadium`, team names, etc.)
- keep only numeric features
- set `class` as the target (values: h/a/d)
- fill any missing numeric values with that column's median

In [None]:
# Part 1: load + select features
#
# Reads mydata.csv, separates the target column from the predictors, keeps
# only the numeric predictors and imputes their missing values.
# Produces: df (raw frame), y (target labels), X (numeric feature frame).

from pathlib import Path
import pandas as pd
import numpy as np

# A notebook kernel normally defines no __file__, so the lookup starts from
# the current working directory; the __file__ branch only applies when this
# code is executed as a script.
HERE = Path(__file__).resolve().parent if "__file__" in globals() else Path(".")
DATA1 = HERE / "data" / "mydata.csv"
DATA2 = HERE.parent / "data" / "mydata.csv"

# Use the first candidate that exists. If neither does, fail with a message
# naming both locations rather than letting read_csv report only the fallback.
DATA = next((p for p in (DATA1, DATA2) if p.exists()), None)
if DATA is None:
    raise FileNotFoundError(
        f"mydata.csv not found; looked in {DATA1.resolve()} and {DATA2.resolve()}"
    )

df = pd.read_csv(DATA)
print("loaded:", DATA)
print("shape:", df.shape)
print(df.head(3))

TARGET = "class"
drop_cols = [
    "date", "clock", "stadium", "attendance", "links",
    "Home Team", "Away Team"
]

if TARGET not in df.columns:
    raise KeyError(f"target column {TARGET!r} not found in {DATA}")

y = df[TARGET].copy()

# Only drop the non-feature columns that are actually present, so a CSV that
# lacks some of them still loads.
present_drop_cols = [c for c in drop_cols if c in df.columns]
X = df.drop(columns=[TARGET] + present_drop_cols)

X = X.select_dtypes(include=[np.number]).copy()

# A column with no observed values has a NaN median, so fillna() below would
# leave it entirely NaN. Such a column carries no information; drop it.
all_nan_cols = X.columns[X.isna().all()].tolist()
if all_nan_cols:
    print("dropping all-NaN columns:", all_nan_cols)
    X = X.drop(columns=all_nan_cols)

# NOTE(review): the medians are computed over every row. If a held-out test
# set is later carved out of X, its values influence the imputation; consider
# fitting the imputer on the training rows only.
X = X.fillna(X.median(numeric_only=True))

print("features:", X.shape[1], "| rows:", X.shape[0])
print("class balance:", y.value_counts().to_dict())

# Part 2 — Label Encoding + Train/Test Split

- Turn the target labels (`h`, `a`, `d`) into numbers so the model can train.
- Save the mapping so we remember which number means what.
- Do an 80/20 **stratified** train/test split so both sets keep the same class balance.
- Keep it reproducible with a fixed `random_state`.

In [None]:
# Part 2: encode labels + split train/test
#
# Uses X (feature frame) and y (target labels) already present in the kernel.
# Produces label_to_id / id_to_label, y_enc and the X_train / X_test /
# y_train / y_test split, and writes outputs/label_mapping.json.

from pathlib import Path
import json
from sklearn.model_selection import train_test_split

OUT = Path("outputs")
OUT.mkdir(parents=True, exist_ok=True)

# label encoding
classes = sorted(y.dropna().unique().tolist())  # e.g., ['a','d','h']
label_to_id = {c: i for i, c in enumerate(classes)}
id_to_label = {v: k for k, v in label_to_id.items()}

# Rows without a label have no id: mapping them would give NaN, which turns
# y_enc into floats and puts NaN into the stratification classes. Encode and
# split only the labeled rows (a no-op when every row has a label).
labeled = y.notna()
n_unlabeled = int((~labeled).sum())
if n_unlabeled:
    print("dropping rows with missing label:", n_unlabeled)
X_labeled = X.loc[labeled]
y_enc = y.loc[labeled].map(label_to_id).astype("int64")

# save mapping for later
# JSON object keys are always strings, so the integer ids are written as
# "0", "1", ...; convert them back with int() when reloading the file.
mapping = {
    "label_to_id": label_to_id,
    "id_to_label": {str(i): c for i, c in id_to_label.items()},
}
with open(OUT / "label_mapping.json", "w") as f:
    json.dump(mapping, f, indent=2)

print("label_to_id:", label_to_id)

# split: 80/20, stratified on the encoded label, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

print("train size:", len(y_train), "| test size:", len(y_test))
print("train class balance:", y_train.value_counts().to_dict())
print("test class balance:", y_test.value_counts().to_dict())