In [None]:
# Colab-only cell: mount Google Drive at /content/drive.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# /colab_notebooks/week4/main_week4.py
# Week 4 — Clean-slate baseline training with visible results.

from __future__ import annotations

import json
import os
import re
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier


# ----------------------------- Config ----------------------------- #

# Drive mount point and the project folder that all other paths hang off.
DRIVE_MOUNT_PT: str = "/content/drive"
PROJECT_ROOT: str = f"{DRIVE_MOUNT_PT}/MyDrive/Colab Notebooks/New_cyber_project"

# Input directory scanned for train_ready_*.parquet files.
W3_TRAIN_DIR: str = os.path.join(PROJECT_ROOT, "week_3", "outputs", "training_data")
# Output tree for this script; the directories are created by _ensure_io().
W4_OUT: str = os.path.join(PROJECT_ROOT, "week_4", "outputs")
MODELS_DIR: str = os.path.join(W4_OUT, "models")    # fitted pipelines (.joblib)
METRICS_DIR: str = os.path.join(W4_OUT, "metrics")  # per-model metric JSON files
REPORTS_DIR: str = os.path.join(W4_OUT, "reports")  # per-model classification reports (CSV)
TABLES_DIR: str = os.path.join(W4_OUT, "tables")    # runs.csv and leaderboard.csv

# Default settings; _load_config() overlays week_4/outputs/config.json on top when present.
DEFAULT_CFG: Dict = {
    "target": "label",           # name of the label column
    "id_column": None,           # optional column removed from the features
    "test_size": 0.2,            # passed to train_test_split
    "random_state": 42,          # seed for the split and both classifiers
    "class_weight": "balanced",  # passed to both classifiers
    "n_jobs": -1,                # passed to both classifiers
    "max_categories": 500,       # passed to OneHotEncoder
    "rf": {"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1},
    "logreg": {"C": 1.0, "max_iter": 200},
}


# ----------------------------- Utilities ----------------------------- #

@dataclass(frozen=True)
class TrainArtifact:
    """Immutable (domain, path) pair describing one train-ready parquet file."""
    domain: str  # part of the file name between "train_ready_" and ".parquet"
    path: str  # full path of the parquet file


def _ensure_io() -> None:
    """Verify the Week-3 input directory and create the Week-4 output tree.

    Raises:
        FileNotFoundError: if ``W3_TRAIN_DIR`` is not an existing directory.
    """
    if os.path.isdir(W3_TRAIN_DIR):
        for out_dir in (W4_OUT, MODELS_DIR, METRICS_DIR, REPORTS_DIR, TABLES_DIR):
            os.makedirs(out_dir, exist_ok=True)
        return
    raise FileNotFoundError(f"Missing training data dir: {W3_TRAIN_DIR}")


def _load_config() -> Dict:
    """Return the effective configuration: defaults overlaid with ``config.json``.

    ``config.json`` is looked up under ``W4_OUT``. Top-level keys replace the
    default value; when both the default and the override are dicts (``rf``,
    ``logreg``) the override is merged key by key instead.

    Returns:
        A fresh dict; ``DEFAULT_CFG`` itself is never modified.

    Raises:
        ValueError: if ``config.json`` does not contain a JSON object.
    """
    cfg_path = os.path.join(W4_OUT, "config.json")
    # Copy the nested dicts as well: with a shallow copy the update() below
    # would write through into DEFAULT_CFG and leak overrides into later calls.
    cfg: Dict = {
        key: (dict(value) if isinstance(value, dict) else value)
        for key, value in DEFAULT_CFG.items()
    }
    if not os.path.isfile(cfg_path):
        return cfg

    # Context manager so the file handle is closed deterministically.
    with open(cfg_path, encoding="utf-8") as fh:
        on_disk = json.load(fh)
    if not isinstance(on_disk, dict):
        raise ValueError("config.json must be a JSON object")

    for key, value in on_disk.items():
        if isinstance(value, dict) and isinstance(cfg.get(key), dict):
            cfg[key].update(value)
        else:
            cfg[key] = value
    return cfg


def _discover_training_sets() -> List[TrainArtifact]:
    """List every ``train_ready_*.parquet`` file in ``W3_TRAIN_DIR``, sorted by name.

    Returns:
        One ``TrainArtifact`` per matching file; ``domain`` is the part of the
        file name captured between the prefix and the extension.

    Raises:
        FileNotFoundError: if no matching file exists.
    """
    matching = [
        name
        for name in sorted(os.listdir(W3_TRAIN_DIR))
        if name.startswith("train_ready_") and name.endswith(".parquet")
    ]
    if not matching:
        raise FileNotFoundError(f"No train_ready_*.parquet under {W3_TRAIN_DIR}")
    return [
        TrainArtifact(
            domain=re.sub(r"^train_ready_(.+)\.parquet$", r"\1", name),
            path=os.path.join(W3_TRAIN_DIR, name),
        )
        for name in matching
    ]


def _split_features(df: pd.DataFrame, target: str, id_column: Optional[str]) -> Tuple[pd.DataFrame, pd.Series]:
    """Separate the label column from the feature columns.

    Args:
        df: full dataset, label column included.
        target: name of the label column.
        id_column: optional column to remove from the features; ignored when
            falsy or not present in ``df``.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the target/id columns and
        ``y`` is the target column.

    Raises:
        KeyError: if ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found.")
    has_id = bool(id_column) and id_column in df.columns
    excluded = [target, id_column] if has_id else [target]
    labels = df[target]
    features = df.drop(columns=excluded)
    return features, labels


def _drop_constants(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without columns that hold at most one distinct value.

    NaN counts as a value (``dropna=False``), so an all-NaN column is removed
    as well. When nothing qualifies, the input frame itself is returned.
    """
    cardinality = df.nunique(dropna=False)
    constant_cols = [col for col, n_distinct in cardinality.items() if n_distinct <= 1]
    if not constant_cols:
        return df
    return df.drop(columns=constant_cols)


def _infer_types(X: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """Partition the columns of ``X`` into numeric and non-numeric names.

    Returns:
        ``(num_cols, cat_cols)``, each in the original column order. A column
        is numeric when ``pd.api.types.is_numeric_dtype`` says so; everything
        else is treated as categorical.
    """
    numeric: List[str] = []
    categorical: List[str] = []
    for col in X.columns:
        bucket = numeric if pd.api.types.is_numeric_dtype(X[col]) else categorical
        bucket.append(col)
    return numeric, categorical


def _build_pre(num_cols: List[str], cat_cols: List[str], max_categories: int) -> ColumnTransformer:
    """Assemble the column-wise preprocessing step.

    Numeric columns: median imputation followed by standard scaling.
    Categorical columns: most-frequent imputation followed by dense one-hot
    encoding capped at ``max_categories``; categories unseen at fit time are
    ignored. Columns in neither list are dropped.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    encoder = OneHotEncoder(
        handle_unknown="ignore",
        sparse_output=False,
        max_categories=max_categories,
    )
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ]
    transformers = [
        ("num", Pipeline(numeric_steps), num_cols),
        ("cat", Pipeline(categorical_steps), cat_cols),
    ]
    return ColumnTransformer(transformers, remainder="drop")


def _build_models(pre: ColumnTransformer, cfg: Dict) -> Dict[str, Pipeline]:
    """Build the two baseline pipelines (logistic regression and random forest).

    Args:
        pre: unfitted preprocessing step used as a template for both pipelines.
        cfg: configuration dict; reads ``n_jobs``, ``class_weight``,
            ``random_state`` and the ``logreg`` / ``rf`` sub-dicts.

    Returns:
        ``{"logreg": Pipeline, "rf": Pipeline}``, each with steps ``pre`` and ``clf``.
    """
    # Local import keeps this block self-contained (sklearn is already a file dependency).
    from sklearn.base import clone

    n_jobs = int(cfg["n_jobs"])
    seed = int(cfg["random_state"])
    class_weight = cfg["class_weight"]

    logreg_clf = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=float(cfg["logreg"]["C"]),
        max_iter=int(cfg["logreg"]["max_iter"]),
        n_jobs=n_jobs,
        class_weight=class_weight,
        random_state=seed,
    )
    rf_clf = RandomForestClassifier(
        n_estimators=int(cfg["rf"]["n_estimators"]),
        max_depth=cfg["rf"]["max_depth"],
        min_samples_leaf=int(cfg["rf"]["min_samples_leaf"]),
        n_jobs=n_jobs,
        class_weight=class_weight,
        random_state=seed,
    )
    # Each pipeline gets its own preprocessor: Pipeline fits its steps in place,
    # so sharing one instance would make the second fit silently overwrite the
    # fitted state held by the first pipeline.
    return {
        "logreg": Pipeline([("pre", clone(pre)), ("clf", logreg_clf)]),
        "rf": Pipeline([("pre", clone(pre)), ("clf", rf_clf)]),
    }


def _safe_roc_auc(y_true: pd.Series, y_proba: Optional[np.ndarray]) -> Optional[float]:
    """Compute ROC-AUC when possible, otherwise return ``None``.

    Two probability columns are scored as binary using column 1; more than two
    use one-vs-rest with weighted averaging. Missing, 1-D or single-column
    probabilities yield ``None``, as does any error raised while scoring.
    """
    if y_proba is None:
        return None
    try:
        n_cols = 1 if y_proba.ndim == 1 else y_proba.shape[1]
        if n_cols == 1:
            return None
        if n_cols == 2:
            score = roc_auc_score(y_true, y_proba[:, 1])
        else:
            score = roc_auc_score(y_true, y_proba, multi_class="ovr", average="weighted")
        return float(score)
    except Exception:
        return None


def _eval(y_true: pd.Series, y_pred: np.ndarray, y_proba: Optional[np.ndarray]) -> Dict[str, Optional[float]]:
    """Compute the metric set recorded for every run.

    Returns:
        Dict with ``accuracy``, weighted ``precision``/``recall``/``f1``,
        ``roc_auc`` and ``log_loss``. The last two are ``None`` when they
        cannot be computed (no probabilities, or the scorer raised).
    """
    weighted = {"average": "weighted", "zero_division": 0}
    scores: Dict[str, Optional[float]] = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision_weighted": float(precision_score(y_true, y_pred, **weighted)),
        "recall_weighted": float(recall_score(y_true, y_pred, **weighted)),
        "f1_weighted": float(f1_score(y_true, y_pred, **weighted)),
        "roc_auc": _safe_roc_auc(y_true, y_proba),
        "log_loss": None,
    }
    if y_proba is None:
        return scores
    try:
        scores["log_loss"] = float(log_loss(y_true, y_proba, labels=np.unique(y_true)))
    except Exception:
        # Best-effort metric: leave it as None rather than fail the run.
        pass
    return scores


def _classification_report_df(y_true: pd.Series, y_pred: np.ndarray) -> pd.DataFrame:
    """Return sklearn's classification report as a DataFrame (one row per report entry)."""
    report_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    frame = pd.DataFrame(report_dict)
    return frame.T


def _persist(domain: str, model_name: str, pipe: Pipeline, metrics: Dict[str, Optional[float]], report_df: pd.DataFrame) -> None:
    """Write the fitted pipeline, its metrics and its report to the output tree.

    Files are named ``{domain}_{model_name}`` plus a type-specific suffix and
    go to ``MODELS_DIR`` (joblib), ``METRICS_DIR`` (JSON) and ``REPORTS_DIR`` (CSV).
    """
    stem = f"{domain}_{model_name}"
    model_path = os.path.join(MODELS_DIR, f"{stem}.joblib")
    metrics_path = os.path.join(METRICS_DIR, f"{stem}.json")
    report_path = os.path.join(REPORTS_DIR, f"{stem}_classification_report.csv")

    joblib.dump(pipe, model_path)
    with open(metrics_path, "w") as fh:
        fh.write(json.dumps(metrics, indent=2))
    report_df.to_csv(report_path, index=True)


def _train_one(df: pd.DataFrame, domain: str, cfg: Dict) -> List[Dict[str, object]]:
    """Train and evaluate every baseline model on one domain's dataset.

    Steps: drop constant columns, split features/label, build the
    preprocessing + model pipelines, make a stratified train/test split, then
    fit, score and persist each model.

    Args:
        df: train-ready dataset including the label column.
        domain: domain name used in artifact file names and result rows.
        cfg: configuration dict as returned by ``_load_config``.

    Returns:
        One result row (dict) per model, ready to become a ``runs.csv`` line.
    """
    # No defensive df.copy(): the helpers below only derive new frames
    # (DataFrame.drop), so copying a multi-million-row input just doubles
    # peak memory.
    df = _drop_constants(df)
    X, y = _split_features(df, cfg["target"], cfg.get("id_column"))
    num_cols, cat_cols = _infer_types(X)
    pre = _build_pre(num_cols, cat_cols, int(cfg["max_categories"]))
    models = _build_models(pre, cfg)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=float(cfg["test_size"]), random_state=int(cfg["random_state"]), stratify=y
    )

    # Identical for every model, so computed once instead of per iteration
    # (np.unique scans the whole label column).
    dataset_info: Dict[str, object] = {
        "n_train": int(len(y_tr)),
        "n_test": int(len(y_te)),
        "n_features_raw": int(X.shape[1]),
        "n_num": int(len(num_cols)),
        "n_cat": int(len(cat_cols)),
        "classes": ",".join(map(str, np.unique(y))),
    }

    rows: List[Dict[str, object]] = []
    for name, pipe in models.items():
        with warnings.catch_warnings():
            # Convergence warnings are silenced for the fit only.
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            pipe.fit(X_tr, y_tr)

        y_pred = pipe.predict(X_te)
        try:
            y_proba = pipe.predict_proba(X_te)
        except Exception:
            # Probabilities are optional; metrics that need them become None.
            y_proba = None

        met = _eval(y_te, y_pred, y_proba)
        cm = confusion_matrix(y_te, y_pred)
        rep_df = _classification_report_df(y_te, y_pred)

        _persist(domain, name, pipe, {**met, "n_test": dataset_info["n_test"]}, rep_df)

        rows.append({
            "domain": domain,
            "model": name,
            **met,
            **dataset_info,
            # Matrix rows joined by ';', cells by ','.
            "confusion_matrix": ";".join(",".join(map(str, r)) for r in cm),
        })
    return rows


def _write_results_markdown(runs_df: pd.DataFrame, leaderboard: pd.DataFrame, out_path: str) -> None:
    """Write a Markdown summary (leaderboard, then all runs) to ``out_path``.

    The runs table shows a fixed subset of columns, sorted by domain and then
    by descending weighted F1 and accuracy.
    """
    leaderboard_table = leaderboard.to_markdown(index=False)

    show_cols = ["domain", "model", "f1_weighted", "accuracy", "precision_weighted", "recall_weighted", "roc_auc", "log_loss", "n_train", "n_test"]
    ordered_runs = runs_df[show_cols].sort_values(
        ["domain", "f1_weighted", "accuracy"], ascending=[True, False, False]
    )
    runs_table = ordered_runs.to_markdown(index=False)

    sections: List[str] = [
        "# Week 4 Results\n",
        "## Leaderboard (best per domain)\n",
        leaderboard_table,
        "\n## All Runs\n",
        runs_table,
        "\n---\n**Next**: inspect per-model CSV reports under `week_4/outputs/reports/`.\n",
    ]
    with open(out_path, "w") as fh:
        fh.write("\n".join(sections))


def main() -> None:
    """Run the whole Week-4 baseline: train per domain, then write the tables and summary."""
    _ensure_io()
    cfg = _load_config()
    artifacts = _discover_training_sets()

    collected: List[Dict[str, object]] = []
    for artifact in artifacts:
        print(f"[Week4] Training on {artifact.domain}")
        frame = pd.read_parquet(artifact.path)
        collected += _train_one(frame, artifact.domain, cfg)

    runs_df = pd.DataFrame(collected)
    runs_df.to_csv(os.path.join(TABLES_DIR, "runs.csv"), index=False)

    # Best run per domain: sort so the winner comes first, then keep the first row of each group.
    ranked = runs_df.sort_values(
        ["domain", "f1_weighted", "accuracy"], ascending=[True, False, False]
    )
    leaderboard = ranked.groupby("domain", as_index=False).first()
    leaderboard.to_csv(os.path.join(TABLES_DIR, "leaderboard.csv"), index=False)

    results_md = os.path.join(W4_OUT, "results.md")
    _write_results_markdown(runs_df, leaderboard, results_md)

    closing_lines = [
        "\n[Week4] Artifacts:",
        f" - models:   {MODELS_DIR}",
        f" - metrics:  {METRICS_DIR}",
        f" - reports:  {REPORTS_DIR}",
        f" - tables:   {TABLES_DIR}",
        f" - summary:  {results_md}",
        "[Week4] Done.",
    ]
    for line in closing_lines:
        print(line)


# Run the pipeline only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()


[Week4] Training on ALL
[Week4] Training on IoT
[Week4] Training on Linux
[Week4] Training on Network
[Week4] Training on Windows

[Week4] Artifacts:
 - models:   /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_4/outputs/models
 - metrics:  /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_4/outputs/metrics
 - reports:  /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_4/outputs/reports
 - tables:   /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_4/outputs/tables
 - summary:  /content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_4/outputs/results.md
[Week4] Done.


In [None]:
# /colab_notebooks/week4/main_week4.py
# Week 4 — Baseline training with clean code (PEP8, type hints, logging), single file.

from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Drive mount point and the project folder that all other paths hang off.
DRIVE_MOUNT_PT = Path("/content/drive")
PROJECT_ROOT = DRIVE_MOUNT_PT / "MyDrive" / "Colab Notebooks" / "New_cyber_project"

# Input directory scanned for train_ready_*.parquet files.
W3_TRAIN_DIR = PROJECT_ROOT / "week_3" / "outputs" / "training_data"
# Output tree for this script; the directories are created by ensure_io().
W4_OUT = PROJECT_ROOT / "week_4" / "outputs"
MODELS_DIR = W4_OUT / "models"    # fitted pipelines (.joblib)
METRICS_DIR = W4_OUT / "metrics"  # per-model metric JSON files
REPORTS_DIR = W4_OUT / "reports"  # per-model classification reports (CSV)
TABLES_DIR = W4_OUT / "tables"    # runs.csv and leaderboard.csv
RESULTS_MD = W4_OUT / "results.md"  # Markdown summary written by write_results_markdown()


@dataclass(frozen=True)
class TrainFile:
    """Immutable descriptor of one train-ready parquet dataset."""
    domain: str  # file stem with the "train_ready_" prefix removed
    path: Path  # location of the parquet file


@dataclass(frozen=True)
class TrainConfig:
    """Immutable, typed training settings (WHY: explicit values avoid hidden defaults).

    The defaults below apply when ``config.json`` is absent; ``load_config``
    overlays values from that file when it exists.
    """
    target: str = "label"  # name of the label column
    id_column: Optional[str] = None  # optional column removed from the features
    test_size: float = 0.20  # passed to train_test_split
    random_state: int = 42  # seed for the split and both classifiers
    class_weight: Optional[str] = "balanced"  # passed to both classifiers
    n_jobs: int = -1  # passed to both classifiers
    max_categories: int = 500  # passed to OneHotEncoder
    rf_n_estimators: int = 300  # RandomForestClassifier n_estimators
    rf_max_depth: Optional[int] = None  # RandomForestClassifier max_depth
    rf_min_samples_leaf: int = 1  # RandomForestClassifier min_samples_leaf
    logreg_C: float = 1.0  # LogisticRegression C
    logreg_max_iter: int = 200  # LogisticRegression max_iter


# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

def setup_logging() -> None:
    """Configure root logging at INFO with a compact format and quiet noisy libraries.

    The ``urllib3``, ``fsspec`` and ``numexpr`` loggers are raised to WARNING
    so their INFO/DEBUG chatter does not drown the training log.
    """
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(message)s",
        level=logging.INFO,
    )
    quiet_level = logging.WARNING
    for logger_name in ("urllib3", "fsspec", "numexpr"):
        logging.getLogger(logger_name).setLevel(quiet_level)


# ---------------------------------------------------------------------------
# IO + Config
# ---------------------------------------------------------------------------

def ensure_io() -> None:
    """Verify the Week-3 input directory and create the Week-4 output tree.

    Raises:
        FileNotFoundError: if ``W3_TRAIN_DIR`` is not an existing directory.
    """
    if W3_TRAIN_DIR.is_dir():
        for out_dir in [W4_OUT, MODELS_DIR, METRICS_DIR, REPORTS_DIR, TABLES_DIR]:
            out_dir.mkdir(parents=True, exist_ok=True)
        return
    raise FileNotFoundError(f"Missing training data dir: {W3_TRAIN_DIR}")


def load_config() -> TrainConfig:
    """
    Load config from `week_4/outputs/config.json` if present and merge over defaults.
    WHY: explicit config keeps runs reproducible and reviewable.

    Accepted keys are the ``TrainConfig`` field names (flat form) plus the
    nested ``rf`` / ``logreg`` objects of the older config layout. When the
    same setting is given in both forms, the nested value wins. Settings that
    are not mentioned keep their defaults. Unknown keys are ignored with a
    warning.

    Returns:
        The effective ``TrainConfig``.

    Raises:
        ValueError: if ``config.json`` does not contain a JSON object.
    """
    cfg_path = W4_OUT / "config.json"
    defaults = TrainConfig()
    if not cfg_path.exists():
        return defaults

    with cfg_path.open("r", encoding="utf-8") as f:
        raw = json.load(f)
    # Without this check a list/str payload would crash later with an
    # unrelated AttributeError.
    if not isinstance(raw, dict):
        raise ValueError("config.json must be a JSON object")

    # Nested key -> TrainConfig field, for the older {"rf": {...}} layout.
    nested_fields = {
        "rf": {
            "n_estimators": "rf_n_estimators",
            "max_depth": "rf_max_depth",
            "min_samples_leaf": "rf_min_samples_leaf",
        },
        "logreg": {
            "C": "logreg_C",
            "max_iter": "logreg_max_iter",
        },
    }

    # Start from defaults
    data = dict(vars(defaults))

    # Flat keys first.
    for key, value in raw.items():
        if key in data:
            data[key] = value
        elif key not in nested_fields:
            logging.warning("Ignoring unknown config key %r", key)

    # Nested blocks second. Only keys actually present are applied, so a
    # partial block such as {"rf": {"max_depth": 10}} can no longer reset an
    # explicitly supplied flat rf_n_estimators back to its default.
    for block_name, field_map in nested_fields.items():
        block = raw.get(block_name)
        if not isinstance(block, dict):
            continue
        for short_name, field_name in field_map.items():
            if short_name in block:
                data[field_name] = block[short_name]

    return TrainConfig(**data)


def discover_training_sets() -> List[TrainFile]:
    """Collect every `train_ready_*.parquet` under the Week-3 directory, sorted by path.

    WHY: per-domain files and the combined ALL file are handled the same way.

    Raises:
        FileNotFoundError: if no matching file exists.
    """
    found = [
        TrainFile(domain=parquet.stem.replace("train_ready_", "", 1), path=parquet)
        for parquet in sorted(W3_TRAIN_DIR.glob("train_ready_*.parquet"))
    ]
    if found:
        return found
    raise FileNotFoundError(f"No train_ready_*.parquet under {W3_TRAIN_DIR}")


# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------

def split_features(df: pd.DataFrame, target: str, id_column: Optional[str]) -> Tuple[pd.DataFrame, pd.Series]:
    """Return ``(X, y)``: features without the target/id columns, and the target column.

    ``id_column`` is removed only when it is truthy and present in ``df``.

    Raises:
        KeyError: if ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found.")
    labels = df[target]
    unwanted: List[str] = [target]
    if id_column and id_column in df.columns:
        unwanted = unwanted + [id_column]
    return df.drop(columns=unwanted), labels


def drop_constant_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Drop columns holding at most one distinct value (NaN counts as a value).

    WHY: such columns carry no signal and only widen the one-hot output.
    When nothing qualifies, the input frame itself is returned.
    """
    distinct_counts = df.nunique(dropna=False)
    constant = [name for name, count in distinct_counts.items() if count <= 1]
    if constant:
        return df.drop(columns=constant)
    return df


def infer_types(X: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """Split column names into ``(numeric, categorical)``, keeping column order.

    A column is numeric when ``pd.api.types.is_numeric_dtype`` says so;
    every other column is treated as categorical.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    numeric: List[str] = []
    categorical: List[str] = []
    for name in X.columns:
        if is_numeric(X[name]):
            numeric.append(name)
        else:
            categorical.append(name)
    return numeric, categorical


def make_ohe(max_categories: int) -> OneHotEncoder:
    """
    Create a dense, unknown-tolerant OneHotEncoder across sklearn versions.

    WHY: Colab sklearn versions differ. The dense-output flag was renamed
    from ``sparse`` to ``sparse_output`` and ``max_categories`` is missing in
    old releases, so keyword sets are tried from newest to oldest.

    Args:
        max_categories: cap on encoded categories per feature; applied
            whenever the installed sklearn accepts the argument.

    Raises:
        TypeError: if no keyword combination is accepted.
    """
    attempts = (
        {"sparse_output": False, "max_categories": max_categories},  # newer sklearn
        {"sparse": False, "max_categories": max_categories},  # older spelling, cap still supported
        {"sparse": False},  # very old: no category cap available
    )
    last_error: Optional[TypeError] = None
    for kwargs in attempts:
        try:
            encoder = OneHotEncoder(handle_unknown="ignore", **kwargs)
        except TypeError as exc:
            # Unknown keyword for this sklearn version; try the next variant.
            last_error = exc
            continue
        if "max_categories" not in kwargs:
            # Make the degraded behaviour visible instead of silently
            # producing a potentially much wider one-hot matrix.
            logging.warning(
                "OneHotEncoder does not support max_categories in this sklearn "
                "version; encoding without a category cap."
            )
        return encoder
    raise last_error


def build_preprocessor(num_cols: Sequence[str], cat_cols: Sequence[str], max_categories: int) -> ColumnTransformer:
    """Build the column-wise preprocessing step.

    Numeric columns: median imputation, then standard scaling.
    Categorical columns: most-frequent imputation, then one-hot encoding via
    ``make_ohe``. Columns in neither list are dropped.
    """
    numeric_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", make_ohe(max_categories=max_categories)),
    ])
    branches = [
        ("num", numeric_branch, list(num_cols)),
        ("cat", categorical_branch, list(cat_cols)),
    ]
    return ColumnTransformer(transformers=branches, remainder="drop")


# ---------------------------------------------------------------------------
# Models + Metrics
# ---------------------------------------------------------------------------

def build_models(pre: ColumnTransformer, cfg: TrainConfig) -> Dict[str, Pipeline]:
    """Define baseline model pipelines (WHY: strong, fast baselines).

    Args:
        pre: unfitted preprocessing step used as a template for both pipelines.
        cfg: training settings; supplies the shared ``n_jobs``,
            ``class_weight`` and ``random_state`` plus per-model hyperparameters.

    Returns:
        ``{"logreg": Pipeline, "rf": Pipeline}``, each with steps ``pre`` and ``clf``.
    """
    # Local import keeps this block self-contained (sklearn is already a file dependency).
    from sklearn.base import clone

    n_jobs = int(cfg.n_jobs)
    seed = int(cfg.random_state)

    logreg_clf = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=float(cfg.logreg_C),
        max_iter=int(cfg.logreg_max_iter),
        n_jobs=n_jobs,
        class_weight=cfg.class_weight,
        random_state=seed,
    )
    rf_clf = RandomForestClassifier(
        n_estimators=int(cfg.rf_n_estimators),
        max_depth=cfg.rf_max_depth,
        min_samples_leaf=int(cfg.rf_min_samples_leaf),
        n_jobs=n_jobs,
        class_weight=cfg.class_weight,
        random_state=seed,
    )
    # Each pipeline gets its own preprocessor: Pipeline fits its steps in place,
    # so sharing one instance would make the second fit silently overwrite the
    # fitted state held by the first pipeline.
    return {
        "logreg": Pipeline(steps=[("pre", clone(pre)), ("clf", logreg_clf)]),
        "rf": Pipeline(steps=[("pre", clone(pre)), ("clf", rf_clf)]),
    }


def safe_roc_auc(y_true: pd.Series, y_proba: Optional[np.ndarray]) -> Optional[float]:
    """ROC-AUC for binary or multiclass probabilities, or ``None`` when unavailable.

    Two columns are scored as binary using column 1; more than two use
    one-vs-rest with weighted averaging. Missing, 1-D or single-column
    probabilities yield ``None``, as does any error raised while scoring
    (WHY: a missing metric must not abort the run).
    """
    if y_proba is None:
        return None
    try:
        if y_proba.ndim == 1:
            return None
        width = y_proba.shape[1]
        if width == 1:
            return None
        if width == 2:
            auc = roc_auc_score(y_true, y_proba[:, 1])
        else:
            auc = roc_auc_score(y_true, y_proba, multi_class="ovr", average="weighted")
        return float(auc)
    except Exception:
        return None


def evaluate_metrics(y_true: pd.Series, y_pred: np.ndarray, y_proba: Optional[np.ndarray]) -> Dict[str, Optional[float]]:
    """Compact, stable metric set.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_proba: predicted class probabilities, or ``None`` when unavailable.

    Returns:
        Dict with ``accuracy``, weighted ``precision``/``recall``/``f1``,
        ``roc_auc`` and ``log_loss``. The last two are ``None`` when they
        cannot be computed.
    """
    weighted = {"average": "weighted", "zero_division": 0}
    metrics: Dict[str, Optional[float]] = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision_weighted": float(precision_score(y_true, y_pred, **weighted)),
        "recall_weighted": float(recall_score(y_true, y_pred, **weighted)),
        "f1_weighted": float(f1_score(y_true, y_pred, **weighted)),
        "roc_auc": safe_roc_auc(y_true, y_proba),
        "log_loss": None,
    }
    if y_proba is None:
        return metrics
    try:
        metrics["log_loss"] = float(log_loss(y_true, y_proba, labels=np.unique(y_true)))
    except Exception as exc:
        # WHY: older sklearn/proba edge cases must not fail the run, but the
        # reason for a missing metric should be visible in the log.
        logging.warning("log_loss could not be computed: %s", exc)
    return metrics


def classification_report_df(y_true: pd.Series, y_pred: np.ndarray) -> pd.DataFrame:
    """Return sklearn's classification report as a DataFrame, ready for CSV export."""
    as_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    frame = pd.DataFrame(as_dict)
    return frame.T


# ---------------------------------------------------------------------------
# Persistence
# ---------------------------------------------------------------------------

def persist_artifacts(
    domain: str,
    model_name: str,
    model: Pipeline,
    metrics: Dict[str, Optional[float]],
    report_df: pd.DataFrame,
) -> None:
    """Write model, metrics and report to disk (WHY: reproducible artifacts).

    Files are named ``{domain}_{model_name}`` plus a type-specific suffix and
    go to ``MODELS_DIR`` (joblib), ``METRICS_DIR`` (JSON) and ``REPORTS_DIR`` (CSV).
    """
    stem = f"{domain}_{model_name}"
    model_path = MODELS_DIR / f"{stem}.joblib"
    metrics_path = METRICS_DIR / f"{stem}.json"
    report_path = REPORTS_DIR / f"{stem}_classification_report.csv"

    joblib.dump(model, model_path)
    metrics_json = json.dumps(metrics, indent=2)
    metrics_path.write_text(metrics_json, encoding="utf-8")
    report_df.to_csv(report_path, index=True)


# ---------------------------------------------------------------------------
# Training Loop
# ---------------------------------------------------------------------------

def train_on_dataframe(df: pd.DataFrame, domain: str, cfg: TrainConfig) -> List[Dict[str, object]]:
    """Train/evaluate both baselines on a single domain; return leaderboard rows.

    Steps: drop constant columns, split features/label, build the
    preprocessing + model pipelines, make a stratified train/test split, then
    fit, score and persist each model.

    Args:
        df: train-ready dataset including the label column.
        domain: domain name used in artifact file names and result rows.
        cfg: training settings.

    Returns:
        One result row (dict) per model.
    """
    # No defensive df.copy(): the helpers below only derive new frames
    # (DataFrame.drop), so copying a multi-million-row input just doubles
    # peak memory.
    df = drop_constant_columns(df)
    X, y = split_features(df, cfg.target, cfg.id_column)
    num_cols, cat_cols = infer_types(X)
    pre = build_preprocessor(num_cols, cat_cols, cfg.max_categories)
    models = build_models(pre, cfg)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y,
        test_size=cfg.test_size,
        random_state=cfg.random_state,
        stratify=y,
    )

    # Identical for every model, so computed once instead of per iteration
    # (np.unique scans the whole label column).
    dataset_info: Dict[str, object] = {
        "n_train": int(len(y_tr)),
        "n_test": int(len(y_te)),
        "n_features_raw": int(X.shape[1]),
        "n_num": int(len(num_cols)),
        "n_cat": int(len(cat_cols)),
        "classes": ",".join(map(str, np.unique(y))),
    }

    rows: List[Dict[str, object]] = []
    for name, pipe in models.items():
        pipe.fit(X_tr, y_tr)

        y_pred = pipe.predict(X_te)
        try:
            y_proba = pipe.predict_proba(X_te)
        except Exception:
            # Probabilities are optional; metrics that need them become None.
            y_proba = None

        met = evaluate_metrics(y_te, y_pred, y_proba)
        cm = confusion_matrix(y_te, y_pred)
        report_df = classification_report_df(y_te, y_pred)

        persist_artifacts(domain, name, pipe, {**met, "n_test": dataset_info["n_test"]}, report_df)

        rows.append(
            {
                "domain": domain,
                "model": name,
                **met,
                **dataset_info,
                # Matrix rows joined by ';', cells by ','.
                "confusion_matrix": ";".join(",".join(map(str, r)) for r in cm),
            }
        )

    return rows


# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------

def write_results_markdown(runs_df: pd.DataFrame, leaderboard: pd.DataFrame) -> None:
    """Write the Markdown summary to ``RESULTS_MD`` (WHY: readable without opening a notebook).

    Layout: leaderboard table first, then all runs restricted to a fixed set
    of columns and sorted by domain, then descending weighted F1 and accuracy.
    """
    leaderboard_table = leaderboard.to_markdown(index=False)

    keep = [
        "domain", "model", "f1_weighted", "accuracy",
        "precision_weighted", "recall_weighted", "roc_auc", "log_loss",
        "n_train", "n_test",
    ]
    ordered_runs = runs_df[keep].sort_values(
        ["domain", "f1_weighted", "accuracy"], ascending=[True, False, False]
    )
    runs_table = ordered_runs.to_markdown(index=False)

    sections: List[str] = [
        "# Week 4 Results\n",
        "## Leaderboard (best per domain)\n",
        leaderboard_table,
        "\n## All Runs\n",
        runs_table,
        "\n---\nNext: inspect per-model CSV reports under `week_4/outputs/reports/`.\n",
    ]
    RESULTS_MD.write_text("\n".join(sections), encoding="utf-8")


# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------

def run() -> None:
    """Run the whole Week-4 baseline end to end (WHY: one entry point keeps the flow obvious).

    A failure on one domain is logged with its traceback and skipped so the
    remaining domains still train.

    Raises:
        RuntimeError: if no domain produced any result row.
    """
    setup_logging()
    ensure_io()
    cfg = load_config()
    datasets = discover_training_sets()

    logging.info("Discovered %d dataset(s).", len(datasets))

    collected: List[Dict[str, object]] = []
    for dataset in datasets:
        logging.info("Training on domain '%s' …", dataset.domain)
        try:
            frame = pd.read_parquet(dataset.path)
            collected += train_on_dataframe(frame, dataset.domain, cfg)
            logging.info("Done: %s", dataset.domain)
        except Exception as exc:
            # WHY: one broken domain must not abort the others.
            logging.exception("Failed on domain '%s': %s", dataset.domain, exc)

    if not collected:
        raise RuntimeError("No successful runs; aborting.")

    runs_df = pd.DataFrame(collected)
    runs_df.to_csv(TABLES_DIR / "runs.csv", index=False)

    # Best run per domain: sort so the winner comes first, then keep the first row of each group.
    ranked = runs_df.sort_values(
        ["domain", "f1_weighted", "accuracy"], ascending=[True, False, False]
    )
    leaderboard = ranked.groupby("domain", as_index=False).first()
    leaderboard.to_csv(TABLES_DIR / "leaderboard.csv", index=False)

    write_results_markdown(runs_df, leaderboard)

    logging.info("Artifacts:")
    artifact_lines = (
        ("  models:   %s", MODELS_DIR),
        ("  metrics:  %s", METRICS_DIR),
        ("  reports:  %s", REPORTS_DIR),
        ("  tables:   %s", TABLES_DIR),
        ("  summary:  %s", RESULTS_MD),
    )
    for template, location in artifact_lines:
        logging.info(template, location)
    logging.info("Week 4 complete.")


# Run the pipeline only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    run()


In [5]:
# 2) Read the summary (results.md)
# Context manager so the file handle is closed as soon as the text has been printed.
with open("/content/drive/MyDrive/Colab Notebooks/New_cyber_project/week_4/outputs/results.md", "r", encoding="utf-8") as fh:
    print(fh.read())


# Week 4 Results

## Leaderboard (best per domain)

| domain   | model   |   accuracy |   precision_weighted |   recall_weighted |   f1_weighted |   roc_auc |    log_loss |   n_train |   n_test |   n_features_raw |   n_num |   n_cat | classes   | confusion_matrix    |
|:---------|:--------|-----------:|---------------------:|------------------:|--------------:|----------:|------------:|----------:|---------:|-----------------:|--------:|--------:|:----------|:--------------------|
| ALL      | logreg  |          1 |                    1 |                 1 |             1 |         1 | 2.44302e-06 |  17491860 |  4372966 |                2 |       0 |       2 | 0,1       | 1708807,0;0,2664159 |
| IoT      | logreg  |          1 |                    1 |                 1 |             1 |         1 | 1.3175e-05  |   2884907 |   721227 |                1 |       0 |       1 | 0,1       | 617395,0;0,103832   |
| Linux    | logreg  |          1 |                    1 |                 1 |  