In [3]:
#!/usr/bin/env python3
"""
svm_knn_trainer_fast.py

Lean & fast SVM + KNN trainer:
- Auto-derives labels from price if y_* look like dates
- Binary or 3-class (cut at the 25th/75th forward-return percentiles, computed on TRAIN only)
- SVM (RBF + Poly-3) and KNN with *randomized* hyperparameter search
- Train-only feature pruning (low-variance + high-corr)
- Optional time-series CV
- Optional SVM probabilities (off by default for speed)

The lighter randomized search + 3-fold CV is always used; --fast is accepted but currently has no effect.

Outputs -> artifacts/
  SVM_predictions.csv, SVM_report.txt, SVM_confusion_matrix.csv
  KNN_predictions.csv, KNN_report.txt, KNN_confusion_matrix.csv
  meta.json
"""

import argparse, json, os, sys
from typing import Optional, Tuple, List, Dict

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

# ---------------- CLI ----------------
def parse_args(argv=None):
    """Build the command-line parser and return the parsed namespace.

    Unknown arguments are ignored (``parse_known_args``) so the function also
    works inside a notebook kernel, whose ``sys.argv`` carries extra flags.

    Args:
        argv: Explicit argument list; ``None`` means "read ``sys.argv[1:]``".

    Returns:
        The ``argparse.Namespace`` holding every recognised option.
    """
    parser = argparse.ArgumentParser(description="Fast SVM & KNN trainer with randomized search.", add_help=True)

    # One (flag, keyword-arguments) row per option, grouped by purpose.
    option_table = [
        # input files and optional forced label columns
        ("--x-train", {"default": "X_train.csv"}),
        ("--x-test", {"default": "X_test.csv"}),
        ("--y-train", {"default": "y_train.csv"}),
        ("--y-test", {"default": "y_test.csv"}),
        ("--y-train-col", {"default": None, "help": "Force label col in y_train.csv (optional)"}),
        ("--y-test-col", {"default": None, "help": "Force label col in y_test.csv (optional)"}),
        # label derivation from price
        ("--derive-labels", {"action": "store_true", "help": "Derive labels from X price columns (ignores y_*)"}),
        ("--price-col", {"default": None}),
        ("--horizon", {"type": int, "default": 5}),
        ("--three-class", {"action": "store_true"}),
        # feature pruning
        ("--corr-threshold", {"type": float, "default": 0.98}),
        ("--min-std", {"type": float, "default": 0.0}),
        # cross-validation style and scoring
        ("--cv", {"choices": ["stratified", "timeseries"], "default": "stratified"}),
        ("--scoring", {"default": None}),
        # speed flags
        ("--fast", {"action": "store_true", "help": "Use randomized search with smaller spaces and 3-fold CV"}),
        ("--probabilities", {"action": "store_true", "help": "Enable SVM(probability=True) (slower)"}),
        # output location
        ("--outdir", {"default": "artifacts"}),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    # parse_known_args(None) falls back to sys.argv[1:], covering both call styles.
    known, _ignored = parser.parse_known_args(argv)
    return known

# ---------------- helpers ----------------
DATE_HINTS = ("date","time","timestamp","day")
PRICE_CANDIDATES = ["Adj Close","Adj_Close","adj_close","Close","close"]

def is_date_like(s: pd.Series) -> bool:
    """Heuristically decide whether a column holds dates rather than labels.

    Checks, in order:
      1. the column name contains one of ``DATE_HINTS`` -> date-like;
      2. the dtype is already a datetime dtype -> date-like;
      3. the dtype is numeric (ints, floats, bools) -> NOT date-like;
      4. more than 60% of a 50-value sample of an object column starts with
         ``YYYY-MM-DD`` / ``YYYY/MM/DD`` -> date-like;
      5. otherwise, more than 60% of values survive ``pd.to_datetime``.

    Args:
        s: The column to inspect.

    Returns:
        ``True`` when the column looks like dates, ``False`` otherwise.
    """
    # Column names are not guaranteed to be strings (e.g. integer headers).
    name = "" if s.name is None else str(s.name).lower()
    if any(k in name for k in DATE_HINTS):
        return True

    if pd.api.types.is_datetime64_any_dtype(s.dtype):
        return True

    # pd.to_datetime reads plain numbers as offsets from the Unix epoch, so
    # every numeric label column (0/1, -1/0/1, ...) would parse "successfully"
    # and be misclassified as dates. Numeric data is never treated as dates.
    if pd.api.types.is_numeric_dtype(s.dtype):
        return False

    if s.dtype == "O":
        sample = s.dropna().astype(str).head(50)
        if len(sample) and sample.str.match(r"^\d{4}[-/]\d{2}[-/]\d{2}").mean() > 0.6:
            return True
    try:
        parsed = pd.to_datetime(s, errors="coerce")
        # bool(...) so callers get a plain Python bool, not numpy.bool_.
        return bool(parsed.notna().mean() > 0.6)
    except Exception:
        # Unparseable content is simply "not a date" for this heuristic.
        return False

def pick_label_col(df: pd.DataFrame) -> Optional[str]:
    """Guess which column of a label file holds the class labels.

    Resolution order:
      1. a column whose lower-cased name is exactly one of
         y / label / class / target / signal (in that priority);
      2. among columns that are not date-like, the one with the fewest
         distinct values, provided it has between 2 and 10 of them
         (the first such column wins ties);
      3. the first non-date-like column.

    Returns:
        The chosen column name, or ``None`` when every column is date-like.
    """
    for wanted in ("y", "label", "class", "target", "signal"):
        hit = next((col for col in df.columns if col.lower() == wanted), None)
        if hit is not None:
            return hit

    non_dates = [col for col in df.columns if not is_date_like(df[col])]
    if not non_dates:
        return None

    distinct = {col: df[col].nunique(dropna=True) for col in non_dates}
    low_cardinality = [col for col in non_dates if 2 <= distinct[col] <= 10]
    # min() keeps the first column among equals, matching a strict "<" scan.
    chosen = min(low_cardinality, key=distinct.get) if low_cardinality else None
    return chosen or non_dates[0]

def select_numeric(Xtr, Xte):
    """Restrict both frames to the numeric columns they have in common.

    Args:
        Xtr: Training feature frame.
        Xte: Test feature frame.

    Returns:
        ``(train, test, columns)`` where both frames contain exactly
        ``columns``, ordered as they appear in the training frame.

    Raises:
        ValueError: If the frames share no numeric column.
    """
    train_numeric = Xtr.select_dtypes(include=["number"]).copy()
    test_numeric = Xte.select_dtypes(include=["number"]).copy()

    test_names = set(test_numeric.columns)
    shared = [name for name in train_numeric.columns if name in test_names]
    if len(shared) == 0:
        raise ValueError("No overlapping numeric feature columns between X_train and X_test.")

    return train_numeric[shared], test_numeric[shared], shared

def prune_features_train_only(Xtr_num: pd.DataFrame, Xte_num: pd.DataFrame, min_std: float, corr_thr: float):
    """Drop low-variance and highly correlated features using TRAIN statistics only.

    Step 1 keeps columns whose population standard deviation (``ddof=0``) on
    the training frame is strictly greater than ``min_std``. Step 2 computes
    absolute pairwise correlations of the survivors and drops every column
    that correlates at ``corr_thr`` or above with a column to its left.

    Args:
        Xtr_num: Numeric training features.
        Xte_num: Numeric test features with the same columns.
        min_std: Exclusive lower bound on the training standard deviation.
        corr_thr: Inclusive absolute-correlation threshold for dropping.

    Returns:
        ``(train, test, kept, info)`` where ``kept`` lists surviving columns
        and ``info`` has the keys ``dropped_low_variance`` and
        ``dropped_high_corr``.
    """
    train_std = Xtr_num.std(ddof=0)
    # NaN standard deviations fail the comparison and are therefore dropped.
    variable_cols = [name for name, sd in train_std.items() if sd > min_std]
    low_variance = [name for name in Xtr_num.columns if name not in variable_cols]

    train_var = Xtr_num[variable_cols].copy()
    test_var = Xte_num[variable_cols].copy()

    redundant = []
    if train_var.shape[1] > 1:
        abs_corr = train_var.corr().abs()
        # Keep only the strict upper triangle so each pair is seen once and a
        # column is compared against the columns that precede it.
        above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        masked = abs_corr.where(above_diagonal)
        redundant = [name for name in masked.columns if (masked[name] >= corr_thr).any()]

    survivors = [name for name in train_var.columns if name not in redundant]
    report = {"dropped_low_variance": low_variance, "dropped_high_corr": redundant}
    return train_var[survivors], test_var[survivors], survivors, report

def encode_labels(ytr, yte):
    """Return integer-encoded train/test labels plus the list of classes.

    Labels are passed through unchanged (only re-wrapped as fresh Series named
    ``"y"``) unless either side has object dtype, in which case a
    ``LabelEncoder`` fitted on the training labels encodes both.

    Returns:
        ``(train_labels, test_labels, classes)``.

    Raises:
        ValueError: If the test labels contain a value absent from training.
    """
    needs_encoding = ytr.dtype == "O" or yte.dtype == "O"
    if not needs_encoding:
        train_out = pd.Series(ytr.values, name="y")
        test_out = pd.Series(yte.values, name="y")
        return train_out, test_out, sorted(pd.unique(ytr))

    encoder = LabelEncoder()
    train_out = pd.Series(encoder.fit_transform(ytr), name="y")

    # Fail early with a readable message instead of LabelEncoder's own error.
    unseen = set(yte.unique()).difference(encoder.classes_)
    if unseen:
        raise ValueError(f"y_test has unseen labels vs y_train, e.g. {list(sorted(unseen))[:5]}")

    test_out = pd.Series(encoder.transform(yte), name="y")
    return train_out, test_out, encoder.classes_.tolist()

def find_price_col(df: pd.DataFrame, override: Optional[str]=None) -> str:
    """Locate the price column used to derive labels.

    Args:
        df: Feature frame to search.
        override: Column name requested by the user; when given it must exist.

    Returns:
        ``override`` if supplied, else the first entry of ``PRICE_CANDIDATES``
        present in ``df``, else the first column whose lower-cased name
        contains ``"close"``.

    Raises:
        ValueError: If ``override`` is missing from ``df`` or no price column
            can be found.
    """
    if override:
        if override in df.columns:
            return override
        raise ValueError(f"--price-col '{override}' not in X.")

    available = df.columns
    exact = next((name for name in PRICE_CANDIDATES if name in available), None)
    if exact is not None:
        return exact

    fuzzy = next((name for name in available if "close" in name.lower()), None)
    if fuzzy is not None:
        return fuzzy

    raise ValueError("No price column found.")

def derive_labels_from_price(Xtr_raw, Xte_raw, price_col, H=5, three_class=False, quantiles=(0.25, 0.75)):
    """Derive classification labels from H-row-ahead returns of a price column.

    The forward return of row ``t`` is ``price[t + H] / price[t] - 1``,
    computed separately inside the train and test frames. Rows whose forward
    return is undefined are removed from both the features and the labels:
    the last ``H`` rows of each frame, rows with a missing price, and rows
    where a zero price makes the ratio infinite.

    Args:
        Xtr_raw: Training frame containing ``price_col``.
        Xte_raw: Test frame containing ``price_col``.
        price_col: Name of the price column.
        H: Look-ahead horizon in rows; must be at least 1.
        three_class: If true, emit -1 / 0 / 1 using return quantiles computed
            on the training returns only; otherwise emit 1 for a strictly
            positive return and 0 otherwise.
        quantiles: ``(lower, upper)`` quantile levels for the 3-class split.
            Returns at or below the lower cut-off map to -1, at or above the
            upper cut-off map to 1, everything else to 0.

    Returns:
        ``(Xtr, Xte, ytr, yte)`` with the unusable rows removed.

    Raises:
        ValueError: If ``H`` is smaller than 1 or ``quantiles`` is not an
            ordered pair inside [0, 1].
    """
    # H <= 0 would give all-zero (H == 0) or backward-looking (H < 0) returns
    # and silently produce meaningless labels.
    if H < 1:
        raise ValueError(f"H must be >= 1 (rows to look ahead), got {H}.")
    lower_q, upper_q = quantiles
    if not 0.0 <= lower_q <= upper_q <= 1.0:
        raise ValueError(f"quantiles must satisfy 0 <= lower <= upper <= 1, got {quantiles}.")

    def fwd_ret(prices):
        ret = prices.shift(-H) / prices - 1.0
        # A zero price yields +/-inf; treat it as undefined rather than "up".
        return ret.replace([np.inf, -np.inf], np.nan)

    ytr_c = fwd_ret(Xtr_raw[price_col])
    yte_c = fwd_ret(Xte_raw[price_col])

    train_ok, test_ok = ytr_c.notna(), yte_c.notna()
    Xtr, Xte = Xtr_raw.loc[train_ok].copy(), Xte_raw.loc[test_ok].copy()
    ytr_c, yte_c = ytr_c.loc[train_ok], yte_c.loc[test_ok]

    if three_class:
        # Cut-offs come from TRAIN returns only so the test set leaks nothing.
        low_cut, high_cut = ytr_c.quantile([lower_q, upper_q])

        def bucket(returns):
            # np.select takes the first matching condition, so "<= low" wins
            # over ">= high" when the two cut-offs coincide.
            codes = np.select(
                [(returns <= low_cut).to_numpy(), (returns >= high_cut).to_numpy()],
                [-1, 1],
                default=0,
            )
            return pd.Series(codes, index=returns.index, name=returns.name).astype(int)

        ytr, yte = bucket(ytr_c), bucket(yte_c)
    else:
        ytr = (ytr_c > 0).astype(int)
        yte = (yte_c > 0).astype(int)
    return Xtr, Xte, ytr, yte

# ---------------- training ----------------
def fit_eval_fast(Xtr_num, Xte_num, ytr, yte, feature_cols, scoring, outdir, cv_type, probs):
    """Tune, fit and evaluate an SVM and a KNN classifier with randomized search.

    Both models share a preprocessing step (median imputation followed by
    standard scaling of ``feature_cols``; all other columns are dropped) and
    the same 3-split cross-validation scheme. Each search refits its best
    configuration on the full training data, which is then scored on the test
    set. Per-model predictions, classification report and confusion matrix
    are written to ``outdir``.

    Args:
        Xtr_num: Training features (must contain ``feature_cols``).
        Xte_num: Test features (must contain ``feature_cols``).
        ytr: Training labels.
        yte: Test labels.
        feature_cols: Columns fed to the preprocessing step.
        scoring: Scorer name for the searches; when falsy, ``"f1_macro"`` is
            used for more than two classes and ``"f1"`` otherwise.
        outdir: Directory for the output files (created if missing).
        cv_type: ``"timeseries"`` selects ``TimeSeriesSplit``; any other value
            selects a shuffled ``StratifiedKFold``.
        probs: Passed to ``SVC(probability=...)``.

    Returns:
        ``(n_classes, svm_metrics, knn_metrics, svm_best_params,
        knn_best_params)``. Each metrics dict has the keys ``best_params``,
        ``accuracy``, ``balanced_accuracy`` and ``f1_macro``; note that the
        ``f1_macro`` entry holds the *binary* F1 when there are at most two
        classes.
    """
    # Number of classes is taken from the training labels only.
    n_classes = len(np.unique(ytr))
    scoring = scoring or ("f1_macro" if n_classes>2 else "f1")

    # Shared preprocessing: impute missing values, then standardize.
    num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                         ("scaler", StandardScaler())])
    preprocess = ColumnTransformer([("num", num_pipe, feature_cols)], remainder="drop")

    # CV: 3 splits in both modes; only the stratified variant shuffles (seeded).
    cv = TimeSeriesSplit(n_splits=3) if cv_type=="timeseries" else StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # ---------- SVM ----------
    svm = Pipeline([("prep", preprocess), ("clf", SVC(probability=probs))])

    # Randomized param dists (compact, effective)
    # Imported here rather than at module level, so scipy is only needed once
    # this function actually runs.
    from scipy.stats import loguniform
    # A list of dicts: each sampled candidate comes from one of the two kernels.
    svm_search_space = [
        {   # RBF
            "clf__kernel": ["rbf"],
            "clf__C": loguniform(1e-1, 1e1),        # ~0.1..10
            "clf__gamma": ["scale", 1e-2, 5e-2],    # small set
            "clf__class_weight": [None, "balanced"]
        },
        {   # Poly-3
            "clf__kernel": ["poly"],
            "clf__degree": [3],
            "clf__C": loguniform(1e-1, 1e1),
            "clf__gamma": ["scale", 1e-2],
            "clf__coef0": [0.0, 1.0],
            "clf__class_weight": [None, "balanced"]
        }
    ]
    # 12 sampled candidates, seeded for reproducibility; refit=True so the
    # search object itself can predict with the best configuration.
    svm_rs = RandomizedSearchCV(
        svm, svm_search_space, n_iter=12, cv=cv, scoring=scoring,
        n_jobs=-1, refit=True, verbose=0, random_state=13
    )

    # ---------- KNN ----------
    knn = Pipeline([("prep", preprocess), ("clf", KNeighborsClassifier())])
    knn_search_space = {
        "clf__n_neighbors": [11, 31, 51, 101],
        "clf__weights": ["uniform", "distance"],
        "clf__p": [1, 2],              # Manhattan vs Euclidean
        "clf__leaf_size": [20, 30, 40] # minor speed/accuracy tweak
    }
    # 10 sampled candidates with the same CV splitter, scorer and seed.
    knn_rs = RandomizedSearchCV(
        knn, knn_search_space, n_iter=10, cv=cv, scoring=scoring,
        n_jobs=-1, refit=True, verbose=0, random_state=13
    )

    print(f"[INFO] RandomizedSearchCV (SVM) with {svm_rs.n_iter} samples; scoring={scoring}")
    svm_rs.fit(Xtr_num, ytr)
    print(f"[INFO] RandomizedSearchCV (KNN) with {knn_rs.n_iter} samples; scoring={scoring}")
    knn_rs.fit(Xtr_num, ytr)

    def evaluate(model, name):
        """Score a fitted search on the test set, write its files, print a summary."""
        yhat = model.predict(Xte_num)
        acc = accuracy_score(yte, yhat)
        bal = balanced_accuracy_score(yte, yhat)
        # Macro-averaged F1 for multi-class, positive-class F1 for binary.
        f1m = f1_score(yte, yhat, average="macro" if n_classes>2 else "binary")
        rep = classification_report(yte, yhat, digits=3)
        cm  = confusion_matrix(yte, yhat)
        os.makedirs(outdir, exist_ok=True)
        # Three files per model: <name>_predictions.csv, <name>_report.txt,
        # <name>_confusion_matrix.csv.
        pd.DataFrame({"y_true": yte, "y_pred": yhat}).to_csv(os.path.join(outdir, f"{name}_predictions.csv"), index=False)
        with open(os.path.join(outdir, f"{name}_report.txt"), "w") as f: f.write(rep)
        pd.DataFrame(cm).to_csv(os.path.join(outdir, f"{name}_confusion_matrix.csv"), index=False)
        print("\n" + "="*80)
        print(f"{name} best_params: {model.best_params_}")
        print(f"Accuracy={acc:.4f}  BalancedAcc={bal:.4f}  F1({'macro' if n_classes>2 else 'binary'})={f1m:.4f}")
        print("Confusion matrix:\n", cm)
        print("Classification report:\n", rep)
        # NOTE: the "f1_macro" key carries the binary F1 when n_classes <= 2.
        return {"best_params": model.best_params_, "accuracy": acc, "balanced_accuracy": bal, "f1_macro": f1m}

    svm_metrics = evaluate(svm_rs, "SVM")
    knn_metrics = evaluate(knn_rs, "KNN")
    return n_classes, svm_metrics, knn_metrics, svm_rs.best_params_, knn_rs.best_params_

# ---------------- main ----------------
def main(argv=None):
    """Run the whole pipeline: load data, obtain labels, prune, train, save.

    Labels come from the ``y_*`` CSV files unless ``--derive-labels`` is set,
    the files cannot be read, or the selected label columns look like dates;
    in those cases labels are derived from a price column of X.

    Args:
        argv: Optional argument list; ``None`` means "use ``sys.argv``".

    Returns:
        The metadata dictionary that is also written to ``<outdir>/meta.json``.

    Raises:
        ValueError: If a column forced with ``--y-train-col`` / ``--y-test-col``
            does not exist, if label and feature row counts differ, or if no
            usable price column is available for label derivation.
    """
    args = parse_args(argv)

    # load X
    Xtr_raw = pd.read_csv(args.x_train)
    Xte_raw = pd.read_csv(args.x_test)

    # labels
    use_derivation = args.derive_labels
    if not use_derivation:
        # Only the file reads are best-effort. Anything that goes wrong later
        # is a real error and must not be silently turned into a fallback.
        try:
            ytr_df = pd.read_csv(args.y_train)
            yte_df = pd.read_csv(args.y_test)
        except (OSError, ValueError) as exc:
            # OSError: missing/unreadable file. ValueError: pandas' empty-file,
            # parser and decoding errors all derive from it.
            print("[WARN] Failed to parse y_*; deriving labels from X.")
            print(f"[WARN] Cause: {exc!r}")
            use_derivation = True
        else:
            # A column the user explicitly asked for must exist; this mirrors
            # how an invalid --price-col is rejected.
            forced_columns = (
                ("--y-train-col", args.y_train_col, ytr_df),
                ("--y-test-col", args.y_test_col, yte_df),
            )
            for flag, forced, frame in forced_columns:
                if forced is not None and forced not in frame.columns:
                    raise ValueError(f"{flag} '{forced}' not found in the label file.")

            ytr_col = args.y_train_col or pick_label_col(ytr_df)
            yte_col = args.y_test_col or pick_label_col(yte_df)
            unusable = (
                ytr_col is None
                or yte_col is None
                or is_date_like(ytr_df[ytr_col])
                or is_date_like(yte_df[yte_col])
            )
            if unusable:
                print("[WARN] y_* unusable (likely dates). Deriving labels from X.")
                use_derivation = True
            else:
                ytr, yte = ytr_df[ytr_col], yte_df[yte_col]
                if len(ytr) != len(Xtr_raw) or len(yte) != len(Xte_raw):
                    raise ValueError(
                        "Row count mismatch between features and labels: "
                        f"train {len(Xtr_raw)} vs {len(ytr)}, test {len(Xte_raw)} vs {len(yte)}."
                    )
                XtrL, XteL = Xtr_raw.copy(), Xte_raw.copy()

    if use_derivation:
        price_col = find_price_col(Xtr_raw, args.price_col)
        if price_col not in Xte_raw.columns:
            raise ValueError(f"Price column '{price_col}' not found in X_test.")
        XtrL, XteL, ytr, yte = derive_labels_from_price(
            Xtr_raw, Xte_raw, price_col=price_col, H=args.horizon, three_class=args.three_class
        )

    # numeric + pruning (statistics come from the training frame only)
    Xtr_num, Xte_num, feature_cols = select_numeric(XtrL, XteL)
    Xtr_num, Xte_num, kept_cols, prune_meta = prune_features_train_only(
        Xtr_num, Xte_num, min_std=args.min_std, corr_thr=args.corr_threshold
    )

    # Label encoding via the shared helper: string labels become integer
    # codes, numeric labels pass through with a fresh positional index.
    ytr, yte, _classes = encode_labels(ytr, yte)

    # train & eval (FAST path always on in this script)
    n_classes, svm_metrics, knn_metrics, svm_bp, knn_bp = fit_eval_fast(
        Xtr_num, Xte_num, ytr, yte, kept_cols,
        scoring=args.scoring, outdir=args.outdir,
        cv_type=args.cv,
        probs=args.probabilities  # keep False unless needed
    )

    meta = {
        "derived_labels": use_derivation,
        "n_classes": int(n_classes),
        "horizon": args.horizon,
        "three_class": args.three_class,
        "cv": args.cv,
        "scoring": args.scoring or ("f1_macro" if n_classes>2 else "f1"),
        "feature_count_before": len(feature_cols),
        "feature_count_after": len(kept_cols),
        "dropped_low_variance": prune_meta["dropped_low_variance"],
        "dropped_high_corr": prune_meta["dropped_high_corr"],
        "feature_sample": kept_cols[:20],
        "svm": svm_metrics,
        "knn": knn_metrics,
        "svm_best_params": svm_bp,
        "knn_best_params": knn_bp,
    }
    os.makedirs(args.outdir, exist_ok=True)
    with open(os.path.join(args.outdir, "meta.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print("\n[DONE] Results saved in:", os.path.abspath(args.outdir))
    return meta

# Entry point when executed as a script: main() is called without an explicit
# argument list, so options are read from the process command line.
if __name__ == "__main__":
    main()


[WARN] y_* unusable (likely dates). Deriving labels from X.
[INFO] RandomizedSearchCV (SVM) with 12 samples; scoring=f1
[INFO] RandomizedSearchCV (KNN) with 10 samples; scoring=f1

SVM best_params: {'clf__C': np.float64(0.4352940519041148), 'clf__class_weight': 'balanced', 'clf__coef0': 0.0, 'clf__degree': 3, 'clf__gamma': 0.01, 'clf__kernel': 'poly'}
Accuracy=0.5383  BalancedAcc=0.5502  F1(binary)=0.6081
Confusion matrix:
 [[ 543 1049]
 [ 342 1079]]
Classification report:
               precision    recall  f1-score   support

           0      0.614     0.341     0.438      1592
           1      0.507     0.759     0.608      1421

    accuracy                          0.538      3013
   macro avg      0.560     0.550     0.523      3013
weighted avg      0.563     0.538     0.518      3013


KNN best_params: {'clf__weights': 'uniform', 'clf__p': 1, 'clf__n_neighbors': 11, 'clf__leaf_size': 20}
Accuracy=0.5270  BalancedAcc=0.5255  F1(binary)=0.4984
Confusion matrix:
 [[880 712]
 [71