## Imports & Random Seed

In [47]:
import random as r
r.seed(1)

import numpy as np
np.random.seed(1)

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeClassifier, LogisticRegression

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score as f1s

## Environment

In [26]:
# Print the installed scikit-learn version so the run environment is visible in the output.
import sklearn
print("sklearn version:", sklearn.__version__)


sklearn version: 1.6.1


## Load Data (train / test / sampleSubmission)

In [27]:
# Paths: all three input files live under one directory, so relocating the
# notebook only requires changing DATA_DIR.
DATA_DIR = "/content"  # presumably the Colab working directory; adjust when running elsewhere
train_path  = f"{DATA_DIR}/train.csv"
test_path   = f"{DATA_DIR}/test.csv"
sample_path = f"{DATA_DIR}/sampleSubmission.csv"

# Read CSVs (index_col=0 as suggested in the PDF)
train  = pd.read_csv(train_path, index_col=0)
test   = pd.read_csv(test_path, index_col=0)
sample = pd.read_csv(sample_path)

# Fail fast: the submission is later built by attaching one prediction per test
# row to the sample file, so both must have the same number of rows.
assert len(sample) == len(test), (
    f"sampleSubmission has {len(sample)} rows but test has {len(test)} rows"
)

print("train shape:", train.shape)
print("test shape:", test.shape)
print("sample shape:", sample.shape)
print("sample columns:", sample.columns.tolist())


train shape: (22496, 1795)
test shape: (10656, 1793)
sample shape: (10656, 2)
sample columns: ['ID', 'Predicted']


## Split Features / Label / Groups

In [28]:
# According to the project PDF:
# - first 1793 columns are features
# - column at index -2 is the label
# - column at index -1 is person_id (groups)
X = train.iloc[:, :1793]
y = train.iloc[:, -2].astype(int)
groups = train.iloc[:, -1]

# The slices above are purely positional. Guard against a schema change that
# would make the feature slice overlap the label / person_id columns (label
# leakage) or leave columns unaccounted for.
assert X.shape[1] + 2 == train.shape[1], (
    f"expected {X.shape[1]} feature columns + label + person_id, "
    f"but train has {train.shape[1]} columns"
)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("groups shape:", groups.shape)

print("y dtype:", y.dtype)
# .tolist() converts numpy scalars to plain ints so the output reads [0, 1, ...]
# instead of [np.int64(0), np.int64(1), ...].
print("y classes:", sorted(y.unique().tolist()))
print("example person_ids:", groups.head().tolist())


X shape: (22496, 1793)
y shape: (22496,)
groups shape: (22496,)
y dtype: int64
y classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]
example person_ids: [10, 10, 10, 10, 10]


## Model Factory (RidgeClassifier)

In [29]:
def make_ridge(alpha: float = 1.0) -> Pipeline:
    """Build an unfitted ridge-classification pipeline.

    Steps, in order: median imputation, standardization, then a
    RidgeClassifier whose regularization strength is ``alpha``.
    """
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", RidgeClassifier(alpha=alpha)),
    ]
    return Pipeline(steps=steps)


## Model Factory (LogisticRegression)

In [30]:
def make_logreg(C=1.0):
    """Build an unfitted logistic-regression pipeline.

    Steps, in order: median imputation, standardization, then a
    LogisticRegression (lbfgs solver, up to 1500 iterations, random_state=1)
    configured with the given ``C``.
    """
    classifier = LogisticRegression(
        C=C, solver="lbfgs", max_iter=1500, random_state=1
    )
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ]
    return Pipeline(steps=steps)


## Standard 5-Fold CV

In [48]:
def run_standard_cv(model_fn, X, y, n_splits=5, seed=1):
    """
    Score a model with shuffled KFold cross-validation (grouping is ignored).

    Args:
      - model_fn: zero-argument callable returning a fresh, unfitted model
      - X, y: features and labels; both are row-sliced positionally via .iloc
      - n_splits: number of folds
      - seed: random_state handed to KFold for the shuffle

    Returns a 4-tuple:
      - oof_pred: integer array with each sample's out-of-fold prediction
      - fold_scores: list holding the macro-F1 of every fold
      - mean_fold_f1: mean of fold_scores
      - oof_f1: macro-F1 computed once over all out-of-fold predictions
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    fold_scores = []
    oof_pred = np.zeros(len(y), dtype=int)

    fold_no = 0
    for train_rows, valid_rows in splitter.split(X):
        fold_no += 1

        X_fit, y_fit = X.iloc[train_rows], y.iloc[train_rows]
        X_holdout, y_holdout = X.iloc[valid_rows], y.iloc[valid_rows]

        # A brand-new estimator per fold, so nothing carries over between folds.
        estimator = model_fn()
        estimator.fit(X_fit, y_fit)
        holdout_pred = estimator.predict(X_holdout)
        oof_pred[valid_rows] = holdout_pred

        score = f1s(y_holdout, holdout_pred, average="macro")
        fold_scores.append(score)
        print(f"[Standard CV] Fold {fold_no} macro-F1: {score:.4f}")

    # Two aggregate views: the average of per-fold scores, and one score over
    # the pooled out-of-fold predictions.
    mean_fold_f1 = float(np.mean(fold_scores))
    oof_f1 = float(f1s(y, oof_pred, average="macro"))

    print(f"[Standard CV] Mean macro-F1 (avg over folds): {mean_fold_f1:.4f}")
    print(f"[Standard CV] OOF macro-F1 (all samples): {oof_f1:.4f}")

    return oof_pred, fold_scores, mean_fold_f1, oof_f1


## Group 5-Fold CV

In [49]:
def run_group_cv(model_fn, X, y, groups, n_splits=5):
    """
    Score a model with GroupKFold cross-validation, grouping rows by `groups`
    (person_id) so that no group is split across train and validation.

    Args:
      - model_fn: zero-argument callable returning a fresh, unfitted model
      - X, y, groups: features, labels and group ids; all are row-sliced via .iloc
      - n_splits: number of folds

    Returns a 4-tuple:
      - oof_pred: integer array with each sample's out-of-fold prediction
      - fold_scores: list holding the macro-F1 of every fold
      - mean_fold_f1: mean of fold_scores
      - oof_f1: macro-F1 computed once over all out-of-fold predictions
    """
    splitter = GroupKFold(n_splits=n_splits)

    fold_scores = []
    oof_pred = np.zeros(len(y), dtype=int)

    fold_no = 0
    for train_rows, valid_rows in splitter.split(X, y, groups=groups):
        fold_no += 1

        # Leakage guard: a person_id must never sit on both sides of the split.
        train_people = set(groups.iloc[train_rows])
        valid_people = set(groups.iloc[valid_rows])
        assert not (train_people & valid_people)

        X_fit, y_fit = X.iloc[train_rows], y.iloc[train_rows]
        X_holdout, y_holdout = X.iloc[valid_rows], y.iloc[valid_rows]

        # A brand-new estimator per fold, so nothing carries over between folds.
        estimator = model_fn()
        estimator.fit(X_fit, y_fit)
        holdout_pred = estimator.predict(X_holdout)
        oof_pred[valid_rows] = holdout_pred

        score = f1s(y_holdout, holdout_pred, average="macro")
        fold_scores.append(score)
        print(f"[Group CV] Fold {fold_no} macro-F1: {score:.4f}")

    # Two aggregate views: the average of per-fold scores, and one score over
    # the pooled out-of-fold predictions.
    mean_fold_f1 = float(np.mean(fold_scores))
    oof_f1 = float(f1s(y, oof_pred, average="macro"))

    print(f"[Group CV] Mean macro-F1 (avg over folds): {mean_fold_f1:.4f}")
    print(f"[Group CV] OOF macro-F1 (all samples): {oof_f1:.4f}")

    return oof_pred, fold_scores, mean_fold_f1, oof_f1


## OPTIONAL tuning functions (Ridge alpha + LogReg C)

### Ridge alpha tuning

In [50]:
def tune_ridge_alpha_groupkfold(X, y, groups, alphas, n_splits=5):
    """
    Grid-search the ridge `alpha` by mean macro-F1 under GroupKFold.

    Args:
      - X, y, groups: features, labels and group ids; all are row-sliced via .iloc
      - alphas: iterable of candidate alpha values (must not be empty)
      - n_splits: number of GroupKFold folds

    Returns:
      - best_alpha: candidate with the highest mean macro-F1 (first one wins ties)
      - best_score: that candidate's mean macro-F1

    Raises:
      - ValueError: if `alphas` contains no candidates
    """
    # Materialize first so generators work and an empty grid fails loudly
    # instead of returning (None, -1.0) and breaking a later cell.
    alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one candidate value")

    # The splitter is built with default settings only, so it yields the same
    # folds for every candidate: compute and leakage-check them once.
    gkf = GroupKFold(n_splits=n_splits)
    splits = list(gkf.split(X, y, groups=groups))
    for tr_idx, va_idx in splits:
        assert set(groups.iloc[tr_idx]).isdisjoint(set(groups.iloc[va_idx]))

    best_alpha, best_score = None, -1.0

    for a in alphas:
        fold_scores = []

        for tr_idx, va_idx in splits:
            X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

            model = make_ridge(alpha=a)
            model.fit(X_tr, y_tr)
            pred_va = model.predict(X_va)

            # float() keeps numpy scalar reprs out of the per-fold printout below.
            fold_scores.append(float(f1s(y_va, pred_va, average="macro")))

        mean_f1 = float(np.mean(fold_scores))
        print(f"alpha={a:<6} mean GroupKFold macro-F1={mean_f1:.4f} folds={[round(s,3) for s in fold_scores]}")

        # Strict '>' keeps the earliest candidate on ties.
        if mean_f1 > best_score:
            best_score, best_alpha = mean_f1, a

    print(f"\nBEST alpha = {best_alpha}  (mean GroupKFold macro-F1 = {best_score:.4f})")
    return best_alpha, best_score


### LogReg C tuning

In [51]:
def tune_logreg_C_groupkfold(X, y, groups, Cs, n_splits=5):
    """
    Grid-search the logistic-regression `C` by mean macro-F1 under GroupKFold.

    Args:
      - X, y, groups: features, labels and group ids; all are row-sliced via .iloc
      - Cs: iterable of candidate C values (must not be empty)
      - n_splits: number of GroupKFold folds

    Returns:
      - best_C: candidate with the highest mean macro-F1 (first one wins ties)
      - best_score: that candidate's mean macro-F1

    Raises:
      - ValueError: if `Cs` contains no candidates
    """
    # Materialize first so generators work and an empty grid fails loudly
    # instead of returning (None, -1.0) and breaking a later cell.
    Cs = list(Cs)
    if not Cs:
        raise ValueError("Cs must contain at least one candidate value")

    # The splitter is built with default settings only, so it yields the same
    # folds for every candidate: compute and leakage-check them once.
    gkf = GroupKFold(n_splits=n_splits)
    splits = list(gkf.split(X, y, groups=groups))
    for tr_idx, va_idx in splits:
        assert set(groups.iloc[tr_idx]).isdisjoint(set(groups.iloc[va_idx]))

    best_C, best_score = None, -1.0

    for C in Cs:
        scores = []
        for tr_idx, va_idx in splits:
            X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

            model = make_logreg(C=C)
            model.fit(X_tr, y_tr)
            pred_va = model.predict(X_va)

            scores.append(float(f1s(y_va, pred_va, average="macro")))

        mean_score = float(np.mean(scores))
        print(f"C={C:<6} mean GroupKFold macro-F1={mean_score:.4f}")

        # Strict '>' keeps the earliest candidate on ties.
        if mean_score > best_score:
            best_score, best_C = mean_score, C

    print(f"BEST C = {best_C} (mean GroupKFold macro-F1 = {best_score:.4f})")
    return best_C, best_score


## Tuning toggles (OFF by default)

In [35]:
# Ridge alpha search switch: leave False to skip the grid search and keep the preset.
RUN_TUNING = False

best_alpha = 10.0  # preset alpha; only replaced when the search below runs
if RUN_TUNING:
    alphas = [0.1, 1.0, 5.0, 10.0, 50.0]
    best_alpha, best_alpha_score = tune_ridge_alpha_groupkfold(X, y, groups, alphas)

print("RUN_TUNING =", RUN_TUNING, "| best_alpha =", best_alpha)


RUN_TUNING = False | best_alpha = 10.0


In [36]:
# LogReg C search switch: leave False to skip the grid search and keep the preset.
RUN_LOGREG_TUNING = False

best_C = 1.0  # preset C; only replaced when the search below runs
if RUN_LOGREG_TUNING:
    Cs = [0.3, 1.0, 3.0, 10.0]
    best_C, best_C_score = tune_logreg_C_groupkfold(X, y, groups, Cs)

print("RUN_LOGREG_TUNING =", RUN_LOGREG_TUNING, "| best_C =", best_C)


RUN_LOGREG_TUNING = False | best_C = 1.0


## Standard 5-Fold Cross-Validation — RidgeClassifier

In [37]:
# Ridge scored with the shuffled KFold routine; person_id plays no part in these folds.
oof_pred_std, fold_scores_std, mean_fold_f1_std, oof_f1_std = run_standard_cv(
    lambda: make_ridge(alpha=best_alpha), X, y, n_splits=5, seed=1
)

[Standard CV] Fold 1 macro-F1: 0.7647
[Standard CV] Fold 2 macro-F1: 0.7520
[Standard CV] Fold 3 macro-F1: 0.7458
[Standard CV] Fold 4 macro-F1: 0.7678
[Standard CV] Fold 5 macro-F1: 0.7447
[Standard CV] Mean macro-F1 (avg over folds): 0.7550
[Standard CV] OOF macro-F1 (all samples): 0.7551


## Group 5-Fold Cross-Validation (person_id grouped) — RidgeClassifier

In [38]:
# Same Ridge configuration, now scored with folds grouped by person_id.
oof_pred_grp, fold_scores_grp, mean_fold_f1_grp, oof_f1_grp = run_group_cv(
    lambda: make_ridge(alpha=best_alpha), X, y, groups=groups, n_splits=5
)

[Group CV] Fold 1 macro-F1: 0.2570
[Group CV] Fold 2 macro-F1: 0.2565
[Group CV] Fold 3 macro-F1: 0.2078
[Group CV] Fold 4 macro-F1: 0.2545
[Group CV] Fold 5 macro-F1: 0.3054
[Group CV] Mean macro-F1 (avg over folds): 0.2562
[Group CV] OOF macro-F1 (all samples): 0.2628


## Cross-Validation Summary — RidgeClassifier

In [39]:
# One print call emitting the same three lines: header, standard-CV row, group-CV row.
print(
    "\n=== Ridge Summary ===\n"
    f"Standard 5-Fold: mean macro-F1 = {mean_fold_f1_std:.4f} | OOF macro-F1 = {oof_f1_std:.4f}\n"
    f"Group 5-Fold:    mean macro-F1 = {mean_fold_f1_grp:.4f} | OOF macro-F1 = {oof_f1_grp:.4f}"
)


=== Ridge Summary ===
Standard 5-Fold: mean macro-F1 = 0.7550 | OOF macro-F1 = 0.7551
Group 5-Fold:    mean macro-F1 = 0.2562 | OOF macro-F1 = 0.2628


## Standard 5-Fold Cross-Validation — Logistic Regression

In [40]:
# Logistic regression scored with the shuffled KFold routine; person_id plays no part in these folds.
oof_pred_std_lr, fold_scores_std_lr, mean_std_lr, oof_f1_std_lr = run_standard_cv(
    lambda: make_logreg(C=best_C), X, y, n_splits=5, seed=1
)

[Standard CV] Fold 1 macro-F1: 0.7509
[Standard CV] Fold 2 macro-F1: 0.7593
[Standard CV] Fold 3 macro-F1: 0.7503
[Standard CV] Fold 4 macro-F1: 0.7698
[Standard CV] Fold 5 macro-F1: 0.7561
[Standard CV] Mean macro-F1 (avg over folds): 0.7573
[Standard CV] OOF macro-F1 (all samples): 0.7573


## Group 5-Fold Cross-Validation (person_id grouped) — Logistic Regression

In [41]:
# Same logistic-regression configuration, now scored with folds grouped by person_id.
oof_pred_grp_lr, fold_scores_grp_lr, mean_grp_lr, oof_f1_grp_lr = run_group_cv(
    lambda: make_logreg(C=best_C), X, y, groups=groups, n_splits=5
)

[Group CV] Fold 1 macro-F1: 0.2502
[Group CV] Fold 2 macro-F1: 0.2495
[Group CV] Fold 3 macro-F1: 0.1850
[Group CV] Fold 4 macro-F1: 0.2448
[Group CV] Fold 5 macro-F1: 0.2775
[Group CV] Mean macro-F1 (avg over folds): 0.2414
[Group CV] OOF macro-F1 (all samples): 0.2497


## Cross-Validation Summary — Logistic Regression

In [42]:
# One print call emitting the same three lines: header, standard-CV row, group-CV row.
print(
    "\n=== LogReg Summary ===\n"
    f"Standard 5-Fold: mean macro-F1 = {mean_std_lr:.4f} | OOF macro-F1 = {oof_f1_std_lr:.4f}\n"
    f"Group 5-Fold:    mean macro-F1 = {mean_grp_lr:.4f} | OOF macro-F1 = {oof_f1_grp_lr:.4f}"
)



=== LogReg Summary ===
Standard 5-Fold: mean macro-F1 = 0.7573 | OOF macro-F1 = 0.7573
Group 5-Fold:    mean macro-F1 = 0.2414 | OOF macro-F1 = 0.2497


## Model Comparison and Final Selection (Group 5-Fold CV)

In [43]:
# Put both models' group-CV means side by side, then keep the higher one.
print("\n=== Model Comparison (choose by Group 5-Fold mean macro-F1) ===")
print(f"Ridge  (alpha={best_alpha}) | Group mean = {mean_fold_f1_grp:.4f}")
print(f"LogReg (C={best_C})         | Group mean = {mean_grp_lr:.4f}")

# LogReg must be strictly better to be chosen; a tie keeps Ridge.
best_kind = "logreg" if mean_grp_lr > mean_fold_f1_grp else "ridge"
print("✅ Selected model: LogReg" if best_kind == "logreg" else "✅ Selected model: Ridge")



=== Model Comparison (choose by Group 5-Fold mean macro-F1) ===
Ridge  (alpha=10.0) | Group mean = 0.2562
LogReg (C=1.0)         | Group mean = 0.2414
✅ Selected model: Ridge


## Final Model Training and Submission (Selected Model)


In [44]:
# --- Choose final model based on comparison ---
if best_kind == "ridge":
    final_model = make_ridge(alpha=best_alpha)
    print(f"Using Ridge final model (alpha={best_alpha})")
elif best_kind == "logreg":
    final_model = make_logreg(C=best_C)
    print(f"Using LogReg final model (C={best_C})")
else:
    # Previously any unexpected value silently fell through to LogReg.
    raise ValueError(f"Unknown best_kind: {best_kind!r}")

# --- Prepare test features: take exactly as many leading columns as the training features ---
n_features = X.shape[1]
X_test = test.iloc[:, :n_features]
assert X_test.shape[1] == n_features, (
    f"test has {X_test.shape[1]} feature columns, expected {n_features}"
)

# --- Train on ALL training data ---
final_model.fit(X, y)
print("Final model trained on full training set ✅")

# --- Predict Kaggle test ---
final_test_pred = final_model.predict(X_test).astype(int)
print("Final test predictions created ✅")
print("Number of predictions:", len(final_test_pred))
# np.unique already returns sorted values; .tolist() prints plain ints
# instead of np.int64(...) reprs.
print("Unique predicted classes:", np.unique(final_test_pred).tolist())
print("First 10 predictions:", final_test_pred[:10])

# --- Build submission using sampleSubmission.csv to guarantee correct ID column ---
# NOTE(review): predictions are attached by row position, which assumes the rows
# of test.csv are in the same order as the IDs in sampleSubmission.csv — confirm.
assert len(final_test_pred) == len(sample), (
    f"{len(final_test_pred)} predictions for {len(sample)} submission rows"
)
final_submission = sample.copy()
final_submission["Predicted"] = final_test_pred

# --- Sanity checks (sizes derived from the data rather than hard-coded) ---
n_rows = len(test)
assert list(final_submission.columns) == ["ID", "Predicted"]
assert final_submission.shape == (n_rows, 2)
assert final_submission["ID"].iloc[0] == 0 and final_submission["ID"].iloc[-1] == n_rows - 1
assert final_submission["ID"].is_monotonic_increasing
assert final_submission["ID"].is_unique
# Every predicted label must be one of the classes seen in training.
assert final_submission["Predicted"].isin(y.unique()).all()

# --- Save file ---
final_submission.to_csv("predictions.csv", index=False)
print("predictions.csv saved ✅")
print(final_submission.head())


Using Ridge final model (alpha=10.0)
Final model trained on full training set ✅
Final test predictions created ✅
Number of predictions: 10656
Unique predicted classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]
First 10 predictions: [1 0 1 0 0 1 1 0 0 1]
predictions.csv saved ✅
   ID  Predicted
0   0          1
1   1          0
2   2          1
3   3          0
4   4          0


In [45]:
# Re-read the file that was just written and confirm its structure from disk.
check = pd.read_csv("predictions.csv")
print("Shape:", check.shape)
print("Columns:", check.columns.tolist())
print("ID min/max:", check["ID"].min(), check["ID"].max())
# .tolist() converts numpy scalars to plain ints so the output reads [0, 1, ...]
# instead of [np.int64(0), np.int64(1), ...].
print("Unique Predicted:", sorted(check["Predicted"].unique().tolist()))


Shape: (10656, 2)
Columns: ['ID', 'Predicted']
ID min/max: 0 10655
Unique Predicted: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]


In [46]:
# Final marker: reaching this cell means every cell above ran without raising.
print("✅ Notebook finished successfully end-to-end.")

✅ Notebook finished successfully end-to-end.
