In [23]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import GroupKFold, cross_val_score, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from scipy import sparse



In [None]:
from google.colab import drive
drive.mount('/content/drive')
%ls

In [None]:
%cd drive/MyDrive/

# Clean the data and do data engineering

In [24]:


class TSFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Leakage-safe feature engineering for panel time-series-in-a-row.

    Guarantees:
    - Cross-sectional / group-relative features use ONLY training fold data via fit().
    - All within-TS computations are within the same TS snapshot.
    - TS/ALLOCATION are not emitted as model features (avoids identity leakage).
    """

    def __init__(self,
                 ts_col="TS",
                 group_col="GROUP",
                 alloc_col="ALLOCATION",
                 turnover_col="MEDIAN_DAILY_TURNOVER",
                 ret_prefix="RET_",
                 sv_prefix="SIGNED_VOLUME_",
                 horizons=(3, 5, 10, 20)):
        # IMPORTANT: do NOT modify parameters here (sklearn clone requirement)
        self.ts_col = ts_col
        self.group_col = group_col
        self.alloc_col = alloc_col
        self.turnover_col = turnover_col
        self.ret_prefix = ret_prefix
        self.sv_prefix = sv_prefix
        self.horizons = horizons  # store exactly as provided

    # ---------- helpers ----------
    @staticmethod
    def _get_lag_cols(df, prefix):
        cols = [c for c in df.columns if c.startswith(prefix)]

        def _idx(c):
            try:
                return int(c.split("_")[-1])
            except Exception:
                return 10**9

        return sorted(cols, key=_idx)

    @staticmethod
    def _row_sign_flips(arr_1d):
        x = np.asarray(arr_1d, dtype=float)
        x = x[~np.isnan(x)]
        if x.size <= 1:
            return 0
        s = np.sign(x)

        # carry-forward for zeros
        for i in range(1, len(s)):
            if s[i] == 0:
                s[i] = s[i - 1]
        if len(s) > 0 and s[0] == 0:
            nz = s[s != 0]
            if nz.size > 0:
                s[0] = nz[0]
        s = s[s != 0]
        if s.size <= 1:
            return 0
        return int(np.sum(s[1:] != s[:-1]))

    @staticmethod
    def _row_longest_streak(arr_1d, sign=1):
        x = np.asarray(arr_1d, dtype=float)
        x = x[~np.isnan(x)]
        if x.size == 0:
            return 0
        s = np.sign(x)
        target = 1 if sign > 0 else -1
        best = cur = 0
        for v in s:
            if v == target:
                cur += 1
                best = max(best, cur)
            else:
                cur = 0
        return int(best)

    @staticmethod
    def _row_slope(arr_1d):
        x = np.asarray(arr_1d, dtype=float)
        mask = ~np.isnan(x)
        if mask.sum() < 2:
            return np.nan
        y = x[mask]
        t = np.arange(1, y.size + 1, dtype=float)
        t = t - t.mean()
        y = y - y.mean()
        denom = np.sum(t * t)
        if denom == 0:
            return 0.0
        return float(np.sum(t * y) / denom)

    @staticmethod
    def _row_corr(a_1d, b_1d):
        a = np.asarray(a_1d, dtype=float)
        b = np.asarray(b_1d, dtype=float)
        m = ~np.isnan(a) & ~np.isnan(b)
        if m.sum() < 3:
            return np.nan
        aa = a[m]
        bb = b[m]
        sa = np.std(aa)
        sb = np.std(bb)
        if sa == 0 or sb == 0:
            return 0.0
        return float(np.corrcoef(aa, bb)[0, 1])

    @staticmethod
    def _bucketize(x, q1, q2, labels=("low", "mid", "high")):
        out = np.full(x.shape, None, dtype=object)
        m = ~np.isnan(x)
        out[m & (x <= q1)] = labels[0]
        out[m & (x > q1) & (x <= q2)] = labels[1]
        out[m & (x > q2)] = labels[2]
        return out

    # ---------- fit/transform ----------
    def fit(self, X, y=None):
        X = X.copy()

        # normalize horizons here (OK to modify internal fitted attrs)
        self.horizons_ = tuple(sorted(set(self.horizons)))

        self.ret_cols_ = self._get_lag_cols(X, self.ret_prefix)
        self.sv_cols_  = self._get_lag_cols(X, self.sv_prefix)

        self.has_ts_ = self.ts_col in X.columns
        self.has_group_ = self.group_col in X.columns
        self.has_turnover_ = self.turnover_col in X.columns

        feats = self._build_row_features(X)

        # Learn volatility bucket thresholds on train fold
        vol = feats["ret_std_20"].to_numpy(dtype=float)
        vol_clean = vol[~np.isnan(vol)]
        if vol_clean.size >= 10:
            self.vol_q1_, self.vol_q2_ = np.quantile(vol_clean, [0.33, 0.66])
        else:
            self.vol_q1_, self.vol_q2_ = (np.nan, np.nan)

        # Learn turnover bucket thresholds on train fold
        if self.has_turnover_:
            to = feats["turnover"].to_numpy(dtype=float)
            to_clean = to[~np.isnan(to)]
            if to_clean.size >= 10:
                self.to_q1_, self.to_q2_ = np.quantile(to_clean, [0.33, 0.66])
            else:
                self.to_q1_, self.to_q2_ = (np.nan, np.nan)
        else:
            self.to_q1_, self.to_q2_ = (np.nan, np.nan)

        # Learn within-TS / within-(TS,GROUP) medians using ONLY train fold
        if self.has_ts_:
            feats["_TS_KEY"] = X[self.ts_col].astype(str)

            base_cols = ["ret_mean_5", "ret_mean_20", "sv_sum_5", "sv_sum_20", "ret_pos_frac_20"]
            self.ts_medians_ = feats.groupby("_TS_KEY", observed=True)[base_cols].median()

            if self.has_group_:
                feats["_GRP_KEY"] = X[self.group_col].astype(str)
                self.ts_grp_medians_ = feats.groupby(["_TS_KEY", "_GRP_KEY"], observed=True)[base_cols].median()
            else:
                self.ts_grp_medians_ = None
        else:
            self.ts_medians_ = None
            self.ts_grp_medians_ = None

        # global fallback medians from train fold
        self.global_medians_ = feats[["ret_mean_5", "ret_mean_20", "sv_sum_5", "sv_sum_20", "ret_pos_frac_20"]].median(numeric_only=True)

        return self

    def transform(self, X):
        X = X.copy()
        feats = self._build_row_features(X)

        # Buckets using train-fold thresholds
        vol = feats["ret_std_20"].to_numpy(dtype=float)
        if np.isnan(self.vol_q1_) or np.isnan(self.vol_q2_):
            feats["vol_bucket"] = np.where(np.isnan(vol), None, "mid")
        else:
            feats["vol_bucket"] = self._bucketize(vol, self.vol_q1_, self.vol_q2_)

        to = feats["turnover"].to_numpy(dtype=float)
        if np.isnan(self.to_q1_) or np.isnan(self.to_q2_):
            feats["turnover_bucket"] = np.where(np.isnan(to), None, "mid")
        else:
            feats["turnover_bucket"] = self._bucketize(to, self.to_q1_, self.to_q2_)

        # TS-relative and (TS,GROUP)-relative features using ONLY mappings from fit()
        if self.has_ts_ and self.ts_medians_ is not None:
            ts_key = X[self.ts_col].astype(str)

            ts_med = self.ts_medians_.reindex(ts_key).reset_index(drop=True)
            cols = ["ret_mean_5", "ret_mean_20", "sv_sum_5", "sv_sum_20", "ret_pos_frac_20"]
            for col in cols:
                base = ts_med[col].to_numpy(dtype=float)
                m = np.isnan(base)
                if m.any():
                    base[m] = float(self.global_medians_[col])
                feats[f"{col}_minus_ts_median"] = feats[col].to_numpy(dtype=float) - base

            if self.has_group_ and self.ts_grp_medians_ is not None:
                grp_key = X[self.group_col].astype(str)
                idx = pd.MultiIndex.from_arrays([ts_key, grp_key])

                ts_grp_med = self.ts_grp_medians_.reindex(idx).reset_index(drop=True)
                for col in cols:
                    base = ts_grp_med[col].to_numpy(dtype=float)
                    m = np.isnan(base)
                    if m.any():
                        base[m] = float(self.global_medians_[col])
                    feats[f"{col}_minus_ts_group_median"] = feats[col].to_numpy(dtype=float) - base

        # Keep GROUP as categorical (optional but useful)
        if self.has_group_ and self.group_col in X.columns:
            feats["GROUP"] = X[self.group_col].astype(str)

        # Do NOT emit TS or ALLOCATION
        return feats

    def _build_row_features(self, X):
        feats = pd.DataFrame(index=X.index)

        ret_cols = [c for c in getattr(self, "ret_cols_", []) if c in X.columns]
        if not ret_cols:
            ret_cols = self._get_lag_cols(X, self.ret_prefix)
        R = X[ret_cols].to_numpy(dtype=float) if ret_cols else np.empty((len(X), 0))

        sv_cols = [c for c in getattr(self, "sv_cols_", []) if c in X.columns]
        if not sv_cols:
            sv_cols = self._get_lag_cols(X, self.sv_prefix)
        V = X[sv_cols].to_numpy(dtype=float) if sv_cols else np.empty((len(X), 0))

        # 1) Directional stability (returns)
        if R.shape[1] > 0:
            feats["ret_pos_frac_20"] = np.nanmean((R > 0).astype(float), axis=1)
            feats["ret_neg_frac_20"] = np.nanmean((R < 0).astype(float), axis=1)
            feats["ret_flip_count_20"] = [self._row_sign_flips(row) for row in R]
            feats["ret_longest_pos_streak_20"] = [self._row_longest_streak(row, sign=1) for row in R]
            feats["ret_longest_neg_streak_20"] = [self._row_longest_streak(row, sign=-1) for row in R]
        else:
            feats["ret_pos_frac_20"] = np.nan
            feats["ret_neg_frac_20"] = np.nan
            feats["ret_flip_count_20"] = 0
            feats["ret_longest_pos_streak_20"] = 0
            feats["ret_longest_neg_streak_20"] = 0

        # 2) Window aggregates (returns)
        for h in self.horizons_ if hasattr(self, "horizons_") else tuple(sorted(set(self.horizons))):
            if R.shape[1] >= h:
                Rh = R[:, :h]
                feats[f"ret_mean_{h}"] = np.nanmean(Rh, axis=1)
                feats[f"ret_median_{h}"] = np.nanmedian(Rh, axis=1)
                feats[f"ret_std_{h}"] = np.nanstd(Rh, axis=1)
                feats[f"ret_min_{h}"] = np.nanmin(Rh, axis=1)
                feats[f"ret_max_{h}"] = np.nanmax(Rh, axis=1)
                feats[f"ret_slope_{h}"] = [self._row_slope(row) for row in Rh]
            else:
                feats[f"ret_mean_{h}"] = np.nan
                feats[f"ret_median_{h}"] = np.nan
                feats[f"ret_std_{h}"] = np.nan
                feats[f"ret_min_{h}"] = np.nan
                feats[f"ret_max_{h}"] = np.nan
                feats[f"ret_slope_{h}"] = np.nan

        # 3) Vol regime helpers
        feats["ret_mean5_minus_mean20"] = feats.get("ret_mean_5", np.nan) - feats.get("ret_mean_20", np.nan)
        feats["ret_std5_over_std20"] = feats.get("ret_std_5", np.nan) / (feats.get("ret_std_20", np.nan) + 1e-12)
        feats["vol_expansion_flag"] = (feats["ret_std5_over_std20"] > 1.0).astype(float)

        # 4) Signed volume consistency + disagreement
        if V.shape[1] > 0:
            feats["sv_pos_frac_20"] = np.nanmean((V > 0).astype(float), axis=1)
            feats["sv_neg_frac_20"] = np.nanmean((V < 0).astype(float), axis=1)
            feats["sv_flip_count_20"] = [self._row_sign_flips(row) for row in V]
        else:
            feats["sv_pos_frac_20"] = np.nan
            feats["sv_neg_frac_20"] = np.nan
            feats["sv_flip_count_20"] = 0

        for h in self.horizons_ if hasattr(self, "horizons_") else tuple(sorted(set(self.horizons))):
            if V.shape[1] >= h:
                Vh = V[:, :h]
                feats[f"sv_sum_{h}"] = np.nansum(Vh, axis=1)
                feats[f"sv_mean_{h}"] = np.nanmean(Vh, axis=1)
                feats[f"sv_std_{h}"] = np.nanstd(Vh, axis=1)
                feats[f"sv_slope_{h}"] = [self._row_slope(row) for row in Vh]
            else:
                feats[f"sv_sum_{h}"] = np.nan
                feats[f"sv_mean_{h}"] = np.nan
                feats[f"sv_std_{h}"] = np.nan
                feats[f"sv_slope_{h}"] = np.nan

        if R.shape[1] > 0 and V.shape[1] > 0:
            h = min(R.shape[1], V.shape[1], 20)
            feats["flow_return_corr_20"] = [self._row_corr(r, v) for r, v in zip(R[:, :h], V[:, :h])]
        else:
            feats["flow_return_corr_20"] = np.nan

        ret_dir_5 = np.sign(feats.get("ret_mean_5", np.nan).to_numpy(dtype=float))
        sv_dir_5 = np.sign(feats.get("sv_sum_5", np.nan).to_numpy(dtype=float))
        feats["flow_vs_return_disagree_5"] = ((ret_dir_5 * sv_dir_5) < 0).astype(float)

        # 5) Turnover regime (keep NaN + missing flag)
        if self.turnover_col in X.columns:
            to = X[self.turnover_col].to_numpy(dtype=float)
            feats["turnover"] = to
            feats["turnover_missing"] = np.isnan(to).astype(float)
            feats["log_turnover"] = np.log1p(np.where(np.isnan(to), 0.0, np.abs(to)))
        else:
            feats["turnover"] = np.nan
            feats["turnover_missing"] = 1.0
            feats["log_turnover"] = np.nan

        scale = feats["log_turnover"].to_numpy(dtype=float)
        denom = np.where(np.isnan(scale) | (scale <= 0), np.nan, scale)
        feats["sv_sum_20_over_log_turnover"] = feats.get("sv_sum_20", np.nan) / (denom + 1e-12)

        return feats

In [25]:
# -------------------------
# Robust data prep (ROW_ID-safe)
# -------------------------
X_train = pd.read_csv("data/X_train.csv", index_col="ROW_ID")
X_test  = pd.read_csv("data/X_test.csv",  index_col="ROW_ID")
y_train = pd.read_csv("data/y_train.csv", index_col="ROW_ID")

common_train_ids = X_train.index.intersection(y_train.index)
X_train = X_train.loc[common_train_ids].copy()
y_train = y_train.loc[common_train_ids].copy()
assert X_train.index.equals(y_train.index), "ROW_ID mismatch between X_train and y_train after alignment!"

# Binary sign target
y = (y_train.iloc[:, 0] > 0).astype(int)

# Groups by TS (kept ONLY for grouping / within-TS features inside fold)
groups = X_train["TS"].astype(str)

# We keep raw columns for engineering (including TS/GROUP/turnover), but we will NOT pass TS/ALLOCATION through
X_raw = X_train.copy()

# -------------------------
# Pipeline: engineer -> preprocess -> RF
# -------------------------
feat_eng = TSFeatureEngineer(
    ts_col="TS",
    group_col="GROUP",
    alloc_col="ALLOCATION",
    turnover_col="MEDIAN_DAILY_TURNOVER",
    ret_prefix="RET_",
    sv_prefix="SIGNED_VOLUME_",
    horizons=(3, 5, 10, 20)
)

In [26]:
!nvidia-smi
!cat /proc/meminfo | head -n 5
!cat /etc/issue

zsh:1: command not found: nvidia-smi
cat: /proc/meminfo: No such file or directory
cat: /etc/issue: No such file or directory


In [27]:
X_train.shape

(527073, 44)

In [28]:
X_test.shape


(31870, 44)

# Random forest and LightGbm

In [29]:
#BASELINE ACCURACY
majority_acc = max(y.mean(), 1 - y.mean())
print(f"Majority-class baseline accuracy: {majority_acc:.6f}")

Majority-class baseline accuracy: 0.507184


# Random Forest

In [None]:
from sklearn.compose import make_column_selector as selector

preprocess = ColumnTransformer(
    transformers=[
        ("num",
         Pipeline(steps=[
             # Tree models can't take NaNs in sklearn RF; keep imputation inside pipeline (no leakage).
             ("imp", SimpleImputer(strategy="median")),
         ]),
         selector(dtype_include=np.number)
        ),
        ("cat",
         Pipeline(steps=[
             ("imp", SimpleImputer(strategy="most_frequent")),
             ("oh", OneHotEncoder(handle_unknown="ignore"))
         ]),
         selector(dtype_exclude=np.number)
        ),
    ],
    remainder="drop"
)

rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=6,
    bootstrap=True,
    max_samples=200_000,   # keep if you have enough rows; otherwise set None
    max_depth=20,
    min_samples_leaf=20,
    max_features="sqrt",
    random_state=42,
)

pipe = Pipeline([
    ("feats", feat_eng),
    ("prep", preprocess),
    ("rf", rf),
])

# -------------------------
# Leakage-safe GroupKFold on TS
# -------------------------
gkf = GroupKFold(n_splits=5)
scores = cross_val_score(
    pipe,
    X_raw,
    y,
    cv=gkf,
    groups=groups,
    scoring="accuracy",
    n_jobs=1
)

print("Random Forest GroupKFold-TS accuracy (with engineered features):")
print("folds:", scores)
print("mean :", scores.mean())
print("std  :", scores.std(ddof=1))

# Now lightGBM

In [30]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

In [31]:
# -------------------------
# Preprocess
# -------------------------
preprocess = ColumnTransformer(
    transformers=[
        ("num",
         Pipeline(steps=[
             ("imp", SimpleImputer(strategy="median")),
         ]),
         selector(dtype_include=np.number)
        ),
        ("cat",
         Pipeline(steps=[
             ("imp", SimpleImputer(strategy="most_frequent")),
             ("oh", OneHotEncoder(handle_unknown="ignore")),
         ]),
         selector(dtype_exclude=np.number)
        ),
    ],
    remainder="drop"
)

In [34]:
def ts_cv_accuracy_lgbm(
    X_raw, y, groups,
    feat_eng, preprocess,
    lgbm_params,
    n_splits=3,
    early_stopping_rounds=100,
    verbose_eval=False,
    random_state=42
):
    gkf = GroupKFold(n_splits=n_splits)
    fold_scores = []

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_raw, y, groups=groups), start=1):
        X_tr_raw = X_raw.iloc[tr_idx]
        y_tr = y.iloc[tr_idx]
        X_va_raw = X_raw.iloc[va_idx]
        y_va = y.iloc[va_idx]

        # 1) Feature engineering (fit on train only)
        fe = clone(feat_eng)
        X_tr_fe = fe.fit_transform(X_tr_raw, y_tr)
        X_va_fe = fe.transform(X_va_raw)

        # 2) Preprocess (fit on train only)
        pp = clone(preprocess)
        X_tr_mat = pp.fit_transform(X_tr_fe, y_tr)
        X_va_mat = pp.transform(X_va_fe)

        # 3) Build identical feature names from fitted preprocess
        feat_names = pp.get_feature_names_out()
        if X_tr_mat.shape[1] != len(feat_names) or X_va_mat.shape[1] != len(feat_names):
            raise ValueError(
                f"Feature mismatch: X_tr {X_tr_mat.shape[1]}, X_va {X_va_mat.shape[1]}, names {len(feat_names)}"
            )

        # 4) Wrap matrices into DataFrames with identical columns (guaranteed alignment)
        # Note: if matrices are sparse, pandas stores them as object; convert to dense if needed.
        # For large data, it's usually better to keep sparse; instead, we can just satisfy the name check
        # by setting feature_names_in_ via fitting on DataFrame.
        X_tr = pd.DataFrame(X_tr_mat.toarray() if hasattr(X_tr_mat, "toarray") else X_tr_mat, columns=feat_names)
        X_va = pd.DataFrame(X_va_mat.toarray() if hasattr(X_va_mat, "toarray") else X_va_mat, columns=feat_names)

        model = LGBMClassifier(**lgbm_params, random_state=random_state, n_jobs=6)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="binary_error",
            callbacks=[
                early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
                log_evaluation(period=50 if verbose_eval else 0),
            ],
        )

        y_pred = model.predict(X_va)
        acc = accuracy_score(y_va, y_pred)
        fold_scores.append(acc)

        if verbose_eval:
            print(f"Fold {fold}: acc={acc:.6f}, best_iter={getattr(model, 'best_iteration_', None)}")

    return np.array(fold_scores, dtype=float)

# Coarse search (3-fold TS CV): num_leaves, min_data_in_leaf, max_depth

In [42]:
import json

base_params = dict(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=2000,          # coarse/confirm budget
    learning_rate=0.05,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    verbosity=-1,
    force_row_wise=True,
)

coarse_structures = [
    dict(num_leaves=63,  max_depth=8,  min_child_samples=50),
    dict(num_leaves=127, max_depth=10, min_child_samples=50),
    dict(num_leaves=255, max_depth=-1, min_child_samples=100),

    dict(num_leaves=63,  max_depth=8,  min_child_samples=200),
    dict(num_leaves=127, max_depth=10, min_child_samples=200),
    dict(num_leaves=127, max_depth=-1, min_child_samples=300),
    dict(num_leaves=255, max_depth=-1, min_child_samples=500),

    dict(num_leaves=31,  max_depth=6,  min_child_samples=1000),
    dict(num_leaves=31,  max_depth=6,  min_child_samples=2000),

    dict(num_leaves=63,  max_depth=8,  min_child_samples=500),
    dict(num_leaves=63,  max_depth=8,  min_child_samples=1000),

    dict(num_leaves=127, max_depth=10, min_child_samples=500),
    dict(num_leaves=127, max_depth=10, min_child_samples=1000),

    dict(num_leaves=255, max_depth=-1, min_child_samples=1000),
    dict(num_leaves=255, max_depth=-1, min_child_samples=2000),

    dict(num_leaves=63,  max_depth=-1, min_child_samples=1000),
    dict(num_leaves=127, max_depth=-1, min_child_samples=2000),
]

# ---- 1) Cheap coarse screen (2-fold TS) ----
coarse_scores = []
for cfg in coarse_structures:
    params = {**base_params, **cfg}
    scores = ts_cv_accuracy_lgbm(
        X_raw=X_raw, y=y, groups=groups,
        feat_eng=feat_eng, preprocess=preprocess,
        lgbm_params=params,
        n_splits=2,
        early_stopping_rounds=150,
        verbose_eval=False,
    )
    coarse_scores.append((float(scores.mean()), cfg, scores))

coarse_scores.sort(key=lambda x: x[0], reverse=True)
top2_coarse = coarse_scores[:2]

print("\n=== Coarse screen TOP-2 (2-fold TS) ===")
for rank, (m, cfg, sc) in enumerate(top2_coarse, start=1):
    print(f"\n#{rank} cfg={cfg}")
    print("folds:", sc, "mean:", sc.mean(), "std:", sc.std(ddof=1))

# ---- 2) Confirm top-2 with 3-fold TS ----
confirmed = []
for _, cfg, _ in top2_coarse:
    params = {**base_params, **cfg}
    scores = ts_cv_accuracy_lgbm(
        X_raw=X_raw, y=y, groups=groups,
        feat_eng=feat_eng, preprocess=preprocess,
        lgbm_params=params,
        n_splits=3,
        early_stopping_rounds=200,
        verbose_eval=False,
    )
    confirmed.append((float(scores.mean()), cfg, scores))

confirmed.sort(key=lambda x: x[0], reverse=True)
locked_structure = confirmed[0][1]

print("\n=== Confirmed best structure (3-fold TS) ===")
print("locked_structure:", locked_structure)
print("folds:", confirmed[0][2], "mean:", confirmed[0][2].mean(), "std:", confirmed[0][2].std(ddof=1))

# ---- 3) Save results (optional but recommended) ----
payload = {
    "top2_coarse": [
        {"rank": i+1, "mean": m, "cfg": cfg, "folds": sc.tolist()}
        for i, (m, cfg, sc) in enumerate(top2_coarse)
    ],
    "top2_confirmed": [
        {"rank": i+1, "mean": m, "cfg": cfg, "folds": sc.tolist()}
        for i, (m, cfg, sc) in enumerate(confirmed[:2])
    ],
    "locked_structure": locked_structure,
}

with open("structure_selection_results.json", "w") as f:
    json.dump(payload, f, indent=2)

print("\nSaved: structure_selection_results.json")


=== Coarse screen TOP-2 (2-fold TS) ===

#1 cfg={'num_leaves': 31, 'max_depth': 6, 'min_child_samples': 1000}
folds: [0.52114856 0.52199735] mean: 0.5215729573354335 std: 0.0006001834658850511

#2 cfg={'num_leaves': 63, 'max_depth': 8, 'min_child_samples': 50}
folds: [0.52030247 0.5209082 ] mean: 0.5206053367516714 std: 0.00042831935747832654

=== Confirmed best structure (3-fold TS) ===
locked_structure: {'num_leaves': 31, 'max_depth': 6, 'min_child_samples': 1000}
folds: [0.52167999 0.51710988 0.52336177] mean: 0.5207172117071036 std: 0.0032352360610674244

Saved: structure_selection_results.json


# Lock structure; fine-tune learning rate + regularization (3-fold TS CV)

In [45]:
locked_structure = confirmed[0][1]

base_params["n_estimators"] = 5000  #fine_tune budget
fine_base_params = dict(
    **base_params,
)

fine_grid = {
    "learning_rate": [0.02, 0.03, 0.05],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "reg_alpha": [0.0, 0.5, 1.0],
    "reg_lambda": [0.0, 1.0, 2.0, 4.0, 8.0],
}

best2 = None
best2_mean = -np.inf

for cfg in ParameterGrid(fine_grid):
    params = {**fine_base_params, **locked_structure, **cfg}
    scores = ts_cv_accuracy_lgbm(
        X_raw=X_raw, y=y, groups=groups,
        feat_eng=feat_eng, preprocess=preprocess,
        lgbm_params=params,
        n_splits=3,
        early_stopping_rounds=200,
        verbose_eval=False,
    )
    mean_acc = float(scores.mean())
    if mean_acc > best2_mean:
        best2_mean = mean_acc
        best2 = (cfg, scores)
    print(mean_acc)

best_params = {**fine_base_params, **locked_structure, **best2[0]}

print("\n=== Fine-tune best (3-fold TS) ===")
print("locked_structure:", locked_structure)
print("best_fine_cfg  :", best2[0])
print("folds          :", best2[1])
print("mean           :", best2[1].mean(), "std:", best2[1].std(ddof=1))


=== Fine-tune best (3-fold TS) ===
locked_structure: {'num_leaves': 31, 'max_depth': 6, 'min_child_samples': 1000}
best_fine_cfg  : {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'reg_alpha': 0.0, 'reg_lambda': 8.0, 'subsample': 0.9}
folds          : [0.52195888 0.51925003 0.52340162]
mean           : 0.5215368440196885 std: 0.002107722458236562


# Final evaluation: 5-fold TS CV with the tuned params

In [47]:
final_scores = ts_cv_accuracy_lgbm(
    X_raw=X_raw, y=y, groups=groups,
    feat_eng=feat_eng, preprocess=preprocess,
    lgbm_params=best_params,
    n_splits=5,
    early_stopping_rounds=200,
    verbose_eval=False,
)

print("\n=== FINAL 5-fold TS CV (LightGBM tuned) ===")
print("folds:", final_scores)
print("mean :", final_scores.mean())
print("std  :", final_scores.std(ddof=1))

print("\nBest LightGBM params used:")
print(best_params)


=== FINAL 5-fold TS CV (LightGBM tuned) ===
folds: [0.51780632 0.52572762 0.51934771 0.52607714 0.52287086]
mean : 0.5223659298074346
std  : 0.0037158003627439052

Best LightGBM params used:
{'objective': 'binary', 'boosting_type': 'gbdt', 'n_estimators': 5000, 'learning_rate': 0.03, 'subsample': 0.9, 'subsample_freq': 1, 'colsample_bytree': 0.9, 'reg_alpha': 0.0, 'reg_lambda': 8.0, 'verbosity': -1, 'force_row_wise': True, 'num_leaves': 31, 'max_depth': 6, 'min_child_samples': 1000}


# predictions with model of my choice

In [None]:

import os
# -------------------------
# 1) Fit FE on full train, transform train/test
# -------------------------
fe_final = clone(feat_eng)
X_tr_fe = fe_final.fit_transform(X_raw, y)     # fit only on train
X_te_fe = fe_final.transform(X_test)           # transform test

# -------------------------
# 2) Fit preprocess on full train, transform train/test
# -------------------------
pp_final = clone(preprocess)
X_tr_mat = pp_final.fit_transform(X_tr_fe, y)
X_te_mat = pp_final.transform(X_te_fe)

feat_names = pp_final.get_feature_names_out()

# Convert to DataFrames with consistent columns
X_tr = pd.DataFrame(
    X_tr_mat.toarray() if hasattr(X_tr_mat, "toarray") else X_tr_mat,
    columns=feat_names,
    index=X_raw.index
)
X_te = pd.DataFrame(
    X_te_mat.toarray() if hasattr(X_te_mat, "toarray") else X_te_mat,
    columns=feat_names,
    index=X_test.index
)

# -------------------------
# 3) Train final model on full training set
# -------------------------
final_model = LGBMClassifier(**best_params, random_state=42, n_jobs=6)

final_model.fit(X_tr, y)

# -------------------------
# 4) Predict on X_test
# -------------------------
proba_test = final_model.predict_proba(X_te)[:, 1]
pred_test  = (proba_test >= 0.5).astype(int)   # or use final_model.predict(X_te)

# Save with ROW_ID index
pred_df = pd.DataFrame(
    {"pred": pred_test},
    index=X_test.index
)
pred_df.index.name = "ROW_ID"

os.makedirs("submission", exist_ok=True)
pred_df.to_csv("submission/lgbm_test_predictions.csv")
print(pred_df.head())

        pred
ROW_ID      
527073     1
527074     0
527075     0
527076     1
527077     0
