In [4]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [5]:
data_dir = Path("../data")
proc_dir = data_dir / "processed"
raw_test = data_dir / "test.csv" # to get the passenger ids
sub_dir = Path("../submissions")
sub_dir.mkdir(parents=True, exist_ok=True)

In [6]:
feature_sets = ["naive", "basic", "enhanced"]
models = ["logreg", "rf", "lgbm"]
random_state = 42
n_splits = 5

In [7]:
def load_features(name: str):
    train = pd.read_csv(proc_dir / f"train_{name}.csv")
    test = pd.read_csv(proc_dir / f"test_{name}.csv")
    return train, test

In [8]:
def split_xy(df: pd.DataFrame):
    y = df["Transported"].astype(int).values
    X = df.drop(columns=["Transported"])
    return X, y

In [9]:
def get_model(name: str):
    if name == "logreg":
        # logistic regression will be scaled inside CV loop
        return "logreg"
    if name == "rf":
        return RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=random_state,
            n_jobs=-1
        )
    if name == "lgbm":
        return lgb.LGBMClassifier(
            n_estimators=1000,
            learning_rate=0.03,
            max_depth=-1,
            num_leaves=31,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=random_state,
            objective="binary",
            n_jobs=-1
        )
    raise ValueError(f"Unknown model: {name}")

In [10]:
def cv_score_model(X, y, model_name: str, n_splits=n_splits, seed=random_state):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    accs, aucs = [], []
    oof = np.zeros(len(X), dtype=float)
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        if model_name == "logreg":
            scaler = StandardScaler(with_mean=False)
            X_train_s = scaler.fit_transform(X_train)
            X_val_s = scaler.fit_transform(X_val)
            clf = LogisticRegression(max_iter=2000, n_jobs=None, random_state=seed)
            clf.fit(X_train_s, y_train)
            prob_val = clf.predict_proba(X_val_s)[:, 1]
        else:
            clf = get_model(model_name)
            clf.fit(X_train, y_train)
            prob_val = clf.predict_proba(X_val)[:, 1]
        oof[val_idx] = prob_val
        pred_val = (prob_val >= 0.5).astype(int)
        accs.append(accuracy_score(y_val, pred_val))
        aucs.append(roc_auc_score(y_val, prob_val))
    return {
        "acc_mean": float(np.mean(accs)),
        "acc_std": float(np.std(accs)),
        "auc_mean": float(np.mean(aucs)),
        "auc_std": float(np.std(aucs)),
        "oof": oof
    }

In [11]:
def fit_full_and_predict(X, y, X_test, model_name: str):
    if model_name == "logreg":
        scaler = StandardScaler(with_mean=False)
        X_full = scaler.fit_transform(X)
        Xt = scaler.transform(X_test)
        clf = LogisticRegression(max_iter=2000, n_jobs=None, random_state=random_state)
        clf.fit(X_full, y)
        prob_test = clf.predict_proba(Xt)[:, 1]
    else:
        clf = get_model(model_name)
        clf.fit(X, y)
        prob_test = clf.predict_proba(X_test)[:, 1]
    return prob_test

In [13]:
raw_test = pd.read_csv(raw_test) # for passenger id
sub_summary = []

for fs in feature_sets:
    print(f"\n=== Feature set: {fs} ===")
    train_df, test_df = load_features(fs)
    X, y = split_xy(train_df)
    X_test = test_df.copy() # keep a copy of the test feature frame
    # evaluate all models via CV
    for m in models:
        scores = cv_score_model(X, y, m)
        print(f"{m:>6} | ACC {scores["acc_mean"]:.4f} +/- {scores["acc_std"]:.4f} "
              f"| AUC {scores["auc_mean"]:.4f} +/- {scores["auc_std"]:.4f}")
        # train on full dataset and predict test for submission
        prob_test = fit_full_and_predict(X, y, X_test, m)
        pred_test = (prob_test >= 0.5)
        # submission file
        sub = pd.DataFrame({
            "PassengerId": raw_test["PassengerId"],
            "Transported": pred_test.astype(bool)
        })
        out_path = sub_dir / f"{fs}_{m}.csv"
        sub.to_csv(out_path, index=False)
        sub_summary.append({
            "featureset": fs,
            "model": m,
            "acc_mean": scores["acc_mean"],
            "auc_mean": scores["auc_mean"],
            "submission": str(out_path)
        })

results_df = pd.DataFrame(sub_summary).sort_values(["acc_mean", "auc_mean"], ascending=False)
results_df


=== Feature set: naive ===
logreg | ACC 0.7857 +/- 0.0121 | AUC 0.8706 +/- 0.0065
    rf | ACC 0.7850 +/- 0.0079 | AUC 0.8614 +/- 0.0062
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In

Unnamed: 0,featureset,model,acc_mean,auc_mean,submission
5,basic,lgbm,0.809502,0.901853,../submissions/basic_lgbm.csv
8,enhanced,lgbm,0.80214,0.894103,../submissions/enhanced_lgbm.csv
4,basic,rf,0.800989,0.88517,../submissions/basic_rf.csv
6,enhanced,logreg,0.792132,0.871954,../submissions/enhanced_logreg.csv
7,enhanced,rf,0.79098,0.877696,../submissions/enhanced_rf.csv
3,basic,logreg,0.789946,0.879032,../submissions/basic_logreg.csv
2,naive,lgbm,0.786723,0.868103,../submissions/naive_lgbm.csv
0,naive,logreg,0.785689,0.870605,../submissions/naive_logreg.csv
1,naive,rf,0.784999,0.861376,../submissions/naive_rf.csv
