In [None]:
# Part 1 — Load the Data and Pick Features

In this step we:
- read the `mydata.csv` file
- drop non-feature columns (`date`, `stadium`, team names, etc.)
- keep only numeric features
- set `class` as the target (values: h/a/d)
- fill any missing numeric values with that column's median

In [None]:
# Part 1: load + select features

from pathlib import Path
import pandas as pd
import numpy as np

# Resolve the directory to search from. Scripts expose `__file__`; notebook
# kernels do not, so fall back to the working directory. `Path.cwd()` is used
# rather than `Path(".")` because `Path(".").parent` is still `Path(".")`,
# which would make the parent-directory fallback below point at the same file.
HERE = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()

# Candidate locations in priority order: next to this file, then one level up.
DATA1 = HERE / "data" / "mydata.csv"
DATA2 = HERE.parent / "data" / "mydata.csv"
DATA = next((candidate for candidate in (DATA1, DATA2) if candidate.exists()), None)
if DATA is None:
    # Fail early and name every location tried, instead of letting pandas
    # report only the last candidate.
    raise FileNotFoundError(f"mydata.csv not found; looked in: {DATA1} and {DATA2}")

# Load the raw table and show a quick sanity check of what was read.
df = pd.read_csv(DATA)
print("loaded:", DATA)
print("shape:", df.shape)
print(df.head(3))

# Label column, and the non-feature columns to discard when present.
TARGET = "class"
drop_cols = [
    "date", "clock", "stadium", "attendance", "links",
    "Home Team", "Away Team"
]

# Target vector; copied so later edits to `y` cannot write through to `df`.
y = df[TARGET].copy()

# Feature matrix: everything except the target and whichever non-feature
# columns this file actually contains. The list is pre-filtered to existing
# columns, so `drop` needs no `errors="ignore"`.
X = df.drop(columns=[TARGET] + [c for c in drop_cols if c in df.columns])

# Keep numeric columns only.
X = X.select_dtypes(include=[np.number]).copy()

# A column with no observed values has a NaN median, so median imputation
# would leave it entirely NaN. Drop such columns first so that X is NaN-free
# after the fill.
X = X.dropna(axis="columns", how="all")
X = X.fillna(X.median(numeric_only=True))

print("features:", X.shape[1], "| rows:", X.shape[0])
print("class balance:", y.value_counts().to_dict())