# Part 1 — Load the Data and Pick Features

In this step we:
- read the `mydata.csv` file
- drop non-feature columns (`date`, `stadium`, team names, etc.)
- keep only numeric features
- set `class` as the target (values: h/a/d)
- fill any missing numeric values with the column median

In [None]:
# Part 1: load + select features

from pathlib import Path
import pandas as pd
import numpy as np

# Locate mydata.csv: prefer ./data next to this file, otherwise ../data.
# Inside a notebook kernel `__file__` is undefined, so fall back to the current
# working directory. It must be an absolute path: Path(".").parent is still ".",
# which would make the ../data fallback point at the same place as ./data.
HERE = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
DATA1 = HERE / "data" / "mydata.csv"
DATA2 = HERE.parent / "data" / "mydata.csv"
DATA = DATA1 if DATA1.exists() else DATA2
if not DATA.exists():
    raise FileNotFoundError(f"mydata.csv not found; looked in {DATA1} and {DATA2}")

df = pd.read_csv(DATA)
print("loaded:", DATA)
print("shape:", df.shape)
print(df.head(3))

TARGET = "class"
drop_cols = [
    "date", "clock", "stadium", "attendance", "links",
    "Home Team", "Away Team",
]

# Rows without a label cannot be used for supervised training; keeping them
# would put NaN into the encoded target that the later split stratifies on.
labeled = df.dropna(subset=[TARGET])
n_unlabeled = len(df) - len(labeled)
if n_unlabeled:
    print("dropped rows with missing target:", n_unlabeled)

y = labeled[TARGET].copy()
# errors="ignore" skips any listed column that is absent from the CSV.
X = labeled.drop(columns=[TARGET, *drop_cols], errors="ignore")

# Keep numeric columns only, then impute with the per-column median.
X = X.select_dtypes(include=[np.number]).copy()
# A column that is entirely NaN has a NaN median, so fillna could not fill it;
# it carries no information, so drop it instead.
X = X.dropna(axis=1, how="all")
X = X.fillna(X.median(numeric_only=True))

print("features:", X.shape[1], "| rows:", X.shape[0])
print("class balance:", y.value_counts().to_dict())

# Part 2 — Label Encoding + Train/Test Split

- Turn the target labels (`h`, `a`, `d`) into numbers so the model can train.
- Save the mapping so we remember which number means what.
- Do an 80/20 **stratified** split to keep class balance.
- Keep it reproducible with a fixed `random_state`.

In [None]:
# Part 2: encode labels + split train/test

from pathlib import Path
import json
from sklearn.model_selection import train_test_split

# Artifacts directory; created if missing, reused otherwise.
OUT = Path("outputs")
OUT.mkdir(parents=True, exist_ok=True)

# Label encoding: the sorted distinct labels define the integer ids,
# e.g. ['a', 'd', 'h'] -> 0, 1, 2.
classes = sorted(y.dropna().unique().tolist())
id_to_label = dict(enumerate(classes))
label_to_id = {label: idx for idx, label in id_to_label.items()}
y_enc = y.map(label_to_id)

# Persist both directions of the mapping for later use.
# Note: JSON object keys are always strings, so the integer keys of
# id_to_label are written as "0", "1", ...
mapping_text = json.dumps(
    {"label_to_id": label_to_id, "id_to_label": id_to_label},
    indent=2,
)
(OUT / "label_mapping.json").write_text(mapping_text)

print("label_to_id:", label_to_id)

# 80/20 split, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,
)

print("train size:", len(y_train), "| test size:", len(y_test))
for split_name, split_labels in (("train", y_train), ("test", y_test)):
    print(f"{split_name} class balance:", split_labels.value_counts().to_dict())

# Part 3 — Train a Simple Decision Tree (Baseline)

- Fit a quick `DecisionTreeClassifier` on the training split.
- Check accuracy on the test split.
- Print a small classification report and the raw confusion matrix.

This is just a baseline to see if the features make sense before polishing.

In [None]:
# Part 3: train baseline tree + quick eval (print only)

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Baseline model: a decision tree with no depth limit and a fixed seed.
tree_params = {
    "criterion": "gini",      # try "entropy" later
    "max_depth": None,        # let it grow; we'll regularize later
    "min_samples_split": 2,
    "random_state": 42,
}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Integer ids and their string labels, in id order, for the reports below.
label_ids = list(range(len(classes)))
label_names = [id_to_label[idx] for idx in label_ids]

print(f"accuracy: {acc:.4f}")
print("\nclassification report:")
text_report = classification_report(
    y_test,
    y_pred,
    labels=label_ids,
    target_names=label_names,
)
print(text_report)

cm = confusion_matrix(y_test, y_pred, labels=label_ids)
print("confusion matrix (rows=true, cols=pred):")
print(cm)

# Part 4 — Save GitHub Artifacts (metrics + plots)

We’ll:
- dump a `metrics.json` with accuracy, sizes, and params
- save a `confusion_matrix.png`
- save a `tree_plot.png` (quick visual of the trained tree)
- save `feature_importances.json` to see which stats mattered

All files go to the `outputs/` folder so you can commit/push easily.

In [None]:
# Part 4: save metrics + plots for GitHub

import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import plot_tree

# All artifacts are written under outputs/.
OUT = Path("outputs")
OUT.mkdir(parents=True, exist_ok=True)

# --- metrics.json ---
# Integer class ids and the matching string labels, in id order.
label_ids = list(range(len(classes)))
label_names = [id_to_label[idx] for idx in label_ids]

# Same report as the printed one, but as a nested dict so it can be serialized.
report = classification_report(
    y_test,
    y_pred,
    labels=label_ids,
    target_names=label_names,
    output_dict=True,
)

metrics = dict(
    accuracy=float((y_pred == y_test).mean()),
    n_train=int(len(y_train)),
    n_test=int(len(y_test)),
    classes=classes,
    params=clf.get_params(),
    classification_report=report,
)

metrics_path = OUT / "metrics.json"
metrics_path.write_text(json.dumps(metrics, indent=2))
print("saved:", metrics_path)

# --- confusion_matrix.png ---
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(classes))))

# Draw the matrix as a heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(cm, interpolation="nearest")
ax.set_title("Confusion Matrix")
fig.colorbar(heatmap, ax=ax)

# One tick per class on both axes, labelled with the original class names.
tick_marks = np.arange(len(classes))
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")

# Write each count in its cell; imshow puts columns on x and rows on y.
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, str(count), ha="center", va="center")

fig.tight_layout()
cm_path = OUT / "confusion_matrix.png"
fig.savefig(cm_path, dpi=200)
plt.close(fig)
print("saved:", cm_path)

# --- tree_plot.png ---
# Render the fitted tree onto an explicit axes, then save and release the figure.
fig, ax = plt.subplots(figsize=(14, 8))
tree_class_names = [id_to_label[idx] for idx, _ in enumerate(classes)]
plot_tree(
    clf,
    feature_names=X.columns.tolist(),
    class_names=tree_class_names,
    filled=True,
    impurity=True,
    rounded=True,
    ax=ax,
)
fig.tight_layout()
tree_path = OUT / "tree_plot.png"
fig.savefig(tree_path, dpi=200, bbox_inches="tight")
plt.close(fig)
print("saved:", tree_path)

# --- feature_importances.json ---
# Pair each feature column with the importance the fitted tree assigned to it.
importances = {
    feature: score
    for feature, score in zip(X.columns.tolist(), clf.feature_importances_.tolist())
}
importances_path = OUT / "feature_importances.json"
importances_path.write_text(json.dumps(importances, indent=2))
print("saved:", importances_path)

print("done. artifacts in:", OUT.resolve())