In [None]:
import pandas as pd
import numpy as np
import gc, os, psutil
from typing import List, Tuple
import lightgbm as lgb
from scipy.stats import pearsonr
from sklearn.model_selection import TimeSeriesSplit
from joblib import dump
import warnings
from src.utils.light_preprocess import Preprocessor
warnings.filterwarnings('ignore', category=UserWarning)

# Directory (relative to this notebook) that receives feature_importance.csv
# and the submission CSV written further down.
output_folder = '../../outputs/'
# Random seed forwarded to LightGBM through `lgb_params` in train_and_pick_best.
seed = 42
# Number of TimeSeriesSplit folds passed to train_and_pick_best.
n_splits = 5

def mem_mb() -> float:
    """Report this process's resident set size (RSS) in megabytes (bytes / 1e6)."""
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1e6


In [None]:
# Load the raw train / test frames from parquet (paths are relative to the
# notebook). Later cells read from these names without mutating them.
df_train_raw = pd.read_parquet('../../data/train.parquet')
df_test_raw = pd.read_parquet('../../data/test.parquet')
%store df_train_raw
%store df_test_raw

In [None]:
def variance_filter(
    df: pd.DataFrame, target_col: str = "label", thresh_ratio: float = 0.1
) -> pd.DataFrame:
    """
    Remove feature columns whose variance is < `thresh_ratio` x var(label).

    Columns for which pandas cannot compute a numeric variance (strings,
    datetimes, ...) are dropped as well: they cannot be compared against the
    threshold and previously made the filter raise a TypeError.

    Args:
        df : DataFrame
            Training frame (label + features).
        target_col : str
            Name of the label column; it is always kept and placed first.
        thresh_ratio : float
            Keep features whose var >= thresh_ratio * var(label).

    Returns:
        DataFrame
            Copy of `df` restricted to `target_col` plus the surviving
            features, in their original relative order.

    Raises:
        KeyError: if `target_col` is not a column of `df`.
    """
    label_var = df[target_col].var()
    thresh = label_var * thresh_ratio

    # One vectorised pass over all features instead of a Python-level loop;
    # numeric_only=True skips columns that have no defined variance.
    feature_var = df.drop(columns=[target_col]).var(numeric_only=True)

    # NaN variances (e.g. all-NaN columns) and a NaN threshold compare False,
    # so such columns are dropped -- same outcome as the per-column comparison.
    passes = (feature_var >= thresh).to_numpy()
    keep_cols = [target_col] + feature_var.index[passes].tolist()

    pruned = df[keep_cols].copy()
    print(f"Variance filter kept {len(keep_cols)-1} of {df.shape[1]-1} columns")
    return pruned

In [None]:
# Drop low-variance features from the raw training frame using the function's
# defaults (target_col="label", thresh_ratio=0.1).
df_train = variance_filter(df_train_raw)

In [None]:
def corr_cluster_select(
    df: pd.DataFrame, target: str = "label", thresh: float = 0.90
) -> pd.DataFrame:
    """
    Target-aware correlation-clustering filter.

    Features are visited in column order. Each not-yet-clustered feature
    seeds a cluster made of itself plus every other not-yet-clustered feature
    whose |corr| with the seed is >= `thresh`. Exactly one feature survives
    per cluster: the member with the strongest absolute correlation to the
    target. Every feature therefore belongs to exactly one cluster.

    Clustering is greedy around the seed, not transitive: two survivors from
    different clusters may still be correlated with each other.

    Args:
        df : DataFrame
            Input frame that still contains `target`.
        target : str
            Name of the target column; it is kept as the first output column.
        thresh : float
            Absolute Pearson correlation threshold that triggers grouping
            (default 0.90).
    Returns:
        DataFrame
            Copy with the same rows, containing `target` followed by the
            surviving features in the order their clusters were formed.
    """
    feats = df.drop(columns=[target])
    y = df[target]

    corr = feats.astype("float32").corr().abs()
    # |corr| of every feature with the target, computed once up front rather
    # than once per cluster.
    target_corr = feats.corrwith(y).abs()

    survivors: List[str] = []
    processed = set()  # columns already assigned to a cluster

    for col in corr.columns:
        if col in processed:
            continue

        # Every earlier column is already in `processed`, so filtering on it
        # makes the cluster look forward only. (Reading a *column* of the
        # upper-triangle matrix looked backward, so the skip above never fired
        # and both members of a correlated pair could survive.)
        strong = corr.loc[col]
        neighbours = strong.index[(strong >= thresh).to_numpy()]
        cluster = [col] + [
            other for other in neighbours
            if other != col and other not in processed
        ]
        processed.update(cluster)

        # pick the member with the largest |corr| to the label; if that is
        # undefined for the whole cluster (e.g. constant columns), keep the seed
        scores = target_corr.loc[cluster]
        best = col if scores.isna().all() else scores.idxmax()
        survivors.append(best)

    # clusters are disjoint, so this is a no-op safeguard that preserves order
    survivors = list(dict.fromkeys(survivors))

    kept_df = df[[target] + survivors].copy()
    print(
        f"Correlation filter kept {len(survivors)} "
        f"of {feats.shape[1]} columns (thresh={thresh})"
    )
    return kept_df

In [None]:
# Prune highly correlated features (default |corr| threshold 0.90).
# NOTE(review): this rebinds df_train from its own previous value, so
# re-running the cell applies the filter to already-filtered data.
df_train = corr_cluster_select(df_train)

In [None]:
%store df_train

In [4]:
def compute_feature_importance(
    df: pd.DataFrame,
    target_col: str = "label",
    n_splits: int = 5,
    seed: int = 42,
    importance_type: str = "gain",  # "gain" or "split",
    var_ratio: float = 0.1,
) -> pd.DataFrame:
    """
    Train LightGBM on each fold, average feature importances, and
    return a ranked table.

    Args:
        df : DataFrame
            Training frame (features + label). Folds are built by row
            position, so rows are expected to be in chronological order.
        target_col : str
            Column name of the regression target.
        n_splits : int
            TimeSeriesSplit folds.
        seed : int
            Random seed passed to LightGBM.
        importance_type : str
            "gain" - total gain of splits (default, more robust)
            "split" - number of times the feature is used in splits.
        var_ratio : float
            Currently unused; accepted only so existing callers that pass it
            keep working.

    Returns:
        DataFrame with columns [feature, importance] sorted descending.
    """
    y = df[target_col]
    x = df.drop(columns=[target_col])

    tscv = TimeSeriesSplit(n_splits=n_splits)
    imp_accum = pd.Series(0.0, index=x.columns)

    params = dict(
        objective="regression",
        learning_rate=0.1,
        num_leaves=256,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        seed=seed,
        verbose=-1,
    )

    for fold, (tr_idx, val_idx) in enumerate(tscv.split(x), 1):
        print(f"Training fold {fold}/{n_splits} …", end="\r")

        x_tr = x.iloc[tr_idx].astype("float32")
        y_tr = y.iloc[tr_idx]
        x_val = x.iloc[val_idx].astype("float32")
        y_val = y.iloc[val_idx]

        tr_ds = lgb.Dataset(x_tr, y_tr, free_raw_data=False)
        val_ds = lgb.Dataset(x_val, y_val, reference=tr_ds, free_raw_data=False)

        model = lgb.train(
            params,
            tr_ds,
            num_boost_round=1000,
            valid_sets=[val_ds],
            callbacks=[lgb.early_stopping(200)],
        )

        imp_accum += pd.Series(
            model.feature_importance(importance_type=importance_type),
            index=x.columns,
            dtype="float64",
        )

        # release every per-fold object (labels included) before the next fold
        del model, tr_ds, val_ds, x_tr, x_val, y_tr, y_val
        gc.collect()

    # Name the axis and the values explicitly. Renaming the default
    # "index" / 0 columns after reset_index silently mislabels the output
    # when df.columns carries a name.
    imp_df = (
        imp_accum.div(n_splits)
        .sort_values(ascending=False)
        .rename_axis("feature")
        .reset_index(name="importance")
    )

    return imp_df

In [6]:
# Run this cell only when the variance / correlation filters above are NOT
# wanted: it rebinds df_train to a copy of the raw training frame.
# NOTE(review): under "Restart & Run All" this cell always executes, so the
# output of variance_filter / corr_cluster_select is discarded -- confirm
# which of the two paths is intended.
df_train = df_train_raw.copy()
# NOTE(review): df_test is not referenced by any later cell visible here;
# the prediction cell reads df_test_raw directly.
df_test = df_test_raw.copy()

In [None]:
# No need to run this if you already have the feature importance
# from a previous run, just load it instead.
# imp_df = pd.read_csv(f"{output_folder}feature_importance.csv")
# NOTE(review): `var_ratio` is accepted by compute_feature_importance but is
# not used inside it, and n_splits is hard-coded to 5 here rather than taken
# from the module-level `n_splits`.
imp_df = compute_feature_importance(
    df_train, target_col="label", n_splits=5, var_ratio=0.1
)
# Persist the ranking so later runs can load it instead of retraining.
imp_df.to_csv(f"{output_folder}feature_importance.csv", index=False)
top_n = 48  # determined by optimize_topn_features with optuna method previously
print(f"Top {top_n} features:\n", imp_df.head(int(top_n)))

Training until validation scores don't improve for 200 rounds


In [None]:
# Top-N features by averaged LightGBM importance.
cols_to_expand = imp_df.head(top_n)["feature"].tolist()
# Always include the three raw volume columns, without selecting any column
# twice when one of them is already among the top-N features (duplicate
# column names would otherwise end up in the training frame).
feature_cols = list(dict.fromkeys([*cols_to_expand, "volume", "sell_qty", "buy_qty"]))
train_reduced = df_train[["label", *feature_cols]].copy()
x_train = train_reduced.drop(columns=["label"])
y_train = train_reduced["label"]
%store train_reduced

In [None]:
# pre = Preprocessor(
#         lag_steps=[1, 5, 30],
#         rolling_windows=[5, 30, 60, 120, 720, 1440],
#         clip_quantiles=(0.001, 0.999),
#         expand_cols=cols_to_expand,
#         aggregate=None,
#     )

# print("Fitting preprocessor …")
# train_feat = pre.fit_transform(train_reduced)
# print(f"After train preprocess: {mem_mb():,.0f} MB")
# y_train = train_feat["label"]
# x_train = train_feat.drop(columns=["label"])
# del train_feat, train_reduced
# gc.collect()

In [None]:
# print("Transforming test …")
# test_reduced = df_test_raw[cols_to_expand].copy()
# test = pre.transform(test_reduced)
# print(f"After test  preprocess: {mem_mb():,.0f} MB")
# gc.collect()

In [None]:
def lgb_pearson(preds, train_data):
    """
    Custom LightGBM evaluation function: Pearson correlation.

    Args:
        preds (np.ndarray): Predictions for the dataset being evaluated.
        train_data: Dataset object exposing `get_label()`.
    Returns:
        tuple: ('pearson', r, True) where the final True marks the metric as
        higher-is-better. `r` is 0.0 when the correlation is undefined (for
        example constant predictions), so early stopping never has to compare
        against NaN.
    """
    labels = train_data.get_label()
    # Zero variance on either side makes corrcoef divide 0 by 0; silence the
    # warning and map the resulting NaN to a neutral score instead.
    with np.errstate(divide="ignore", invalid="ignore"):
        r = np.corrcoef(labels, preds)[0, 1]
    if not np.isfinite(r):
        r = 0.0
    return 'pearson', float(r), True              # True ⇒ higher is better


def train_and_pick_best(x, y, n_splits=5):
    """
    Cross-validate LightGBM with TimeSeriesSplit, then refit on all rows.

    Each fold is trained with early stopping on the validation Pearson
    correlation (custom metric `lgb_pearson`). The fold with the highest
    validation Pearson supplies the number of boosting rounds used to train
    the final model on the full data. The per-fold models are not returned.

    Args:
        x (DataFrame): Feature DataFrame.
        y (Series): Target variable.
        n_splits (int): Number of splits for Time Series Cross-Validation.
    Returns:
        tuple: (final_model, best_iter, oof_corr)
            final_model - LightGBM model trained on all rows of `x`, `y`
                          for `best_iter` rounds.
            best_iter   - best iteration of the best-scoring fold.
            oof_corr    - Pearson correlation between `y` and the
                          out-of-fold predictions, over validated rows only.
    Raises:
        RuntimeError: if no fold produced a usable Pearson score, so there is
            no iteration count to refit with.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    best_corr = -9
    best_fold_id = None
    best_iter = None

    # NOTE(review): drop_rate / skip_drop / max_drop / uniform_drop /
    # xgboost_dart_mode are documented by LightGBM as DART options; with
    # boosting_type "gbdt" they presumably have no effect -- confirm intent.
    # NOTE(review): max_depth=1 limits every tree to a single split, so
    # num_leaves=256 cannot be reached -- confirm intent.
    # `seed` is read from the notebook-level configuration cell.
    lgb_params = {
        "objective": "regression",
        "metric": "None",
        "boosting_type": "gbdt",
        "learning_rate": 0.01,
        "num_leaves": 256,
        "max_depth": 1,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "drop_rate": 0.15,
        "skip_drop": 0.5,
        "max_drop": 70,
        "uniform_drop": True,
        "xgboost_dart_mode": True,
        "seed": seed,
        "verbose": -1,
        "num_threads": 4,
    }

    # NaN marks rows that never receive a validation prediction
    oof_pred = np.full(len(y), np.nan, dtype=np.float32)

    for fold, (train_idx, val_idx) in enumerate(tscv.split(x), 1):
        x_tr = x.iloc[train_idx].astype("float32")
        x_val = x.iloc[val_idx].astype("float32")
        y_tr = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        tr_ds = lgb.Dataset(x_tr, y_tr)
        val_ds = lgb.Dataset(x_val, y_val, reference=tr_ds)

        model = lgb.train(
            lgb_params,
            tr_ds,
            num_boost_round=6000,
            valid_sets=[val_ds],
            feval=lgb_pearson,      # returns ("pearson", r, True)
            callbacks=[
                lgb.early_stopping(stopping_rounds=500, first_metric_only=True),
                lgb.log_evaluation(500),
            ],
        )

        val_pred = model.predict(x_val, num_iteration=model.best_iteration)
        oof_pred[val_idx] = val_pred
        corr, _ = pearsonr(y_val, val_pred)
        print(f"Fold {fold}/{n_splits} - Pearson = {corr:.5f}")

        # Only the iteration count is needed later; holding on to the fold's
        # model would keep an unused booster alive.
        if corr > best_corr:
            best_corr = corr
            best_iter, best_fold_id = model.best_iteration, fold

        del x_tr, y_tr, x_val, y_val, tr_ds, val_ds
        gc.collect()

    if best_iter is None:
        # Happens when every fold's Pearson is NaN (comparisons with NaN are
        # False); fail here with a clear message instead of inside lgb.train.
        raise RuntimeError(
            "No fold produced a finite validation Pearson; cannot choose the "
            "number of boosting rounds for the final model."
        )

    print(f"Best fold: {best_fold_id}  (Pearson {best_corr:.5f}, "
          f"best_iter {best_iter})")

    # Rows that were never part of a validation fold are still NaN in
    # oof_pred; leave them out of the out-of-fold score.
    mask = ~np.isnan(oof_pred)
    oof_corr = np.corrcoef(y.iloc[mask], oof_pred[mask])[0, 1]
    print(f"Out-of-fold Pearson = {oof_corr:.5f}")

    # Refit on all rows for the chosen number of rounds (no validation set,
    # hence no early stopping).
    full_ds = lgb.Dataset(x.astype('float32'), y)
    final_model = lgb.train(
        lgb_params,
        full_ds,
        num_boost_round=best_iter,
        feval=lgb_pearson,
    )

    return final_model, best_iter, oof_corr

In [None]:
print("Training model...")
# Returns the model refit on all rows, the boosting-round count taken from
# the best fold, and the out-of-fold Pearson correlation.
model, best_iter, oof_corr = train_and_pick_best(x_train, y_train, n_splits=n_splits)

In [None]:
def save_submission(test_index, preds, out_path):
    """
    Write a two-column submission CSV ("ID", "prediction"), ordered by ID.

    Args:
        test_index (Index): Identifiers of the test rows; written as "ID".
        preds (np.ndarray): One prediction per test row.
        out_path (str): Destination path of the CSV file.
    """
    submission = pd.DataFrame(
        {"ID": test_index, "prediction": preds}
    ).sort_values(by="ID")
    # index=False: only the two named columns are written
    submission.to_csv(out_path, index=False)
    print(f"Saved predictions to {out_path}")

In [None]:
print("Making predictions...")
# Use exactly the columns (and column order) the model was trained on.
# x_train holds the top-N features plus "volume", "sell_qty" and "buy_qty",
# so selecting only `cols_to_expand` gives the model fewer features than it
# was fitted with.
test = df_test_raw[x_train.columns.tolist()].copy()
preds = model.predict(test, num_iteration=best_iter)

save_submission(test.index, preds, out_path=f"{output_folder}submission_lgbm.csv")

In [None]:
# Check the test index against the ID column of the sample submission.
sample = pd.read_csv("../../data/sample_submission.csv")
# np.array_equal also covers arrays of different length, where an
# element-wise `==` has no per-element result to call `.all()` on.
assert np.array_equal(sample.iloc[:, 0].values, test.index.values), \
       "💥  New test_feat index is NOT identical to sample IDs!"

In [None]:
# Guard against label leakage in the frame that is actually used for
# prediction. (`test_reduced` is only created in a commented-out cell, so
# referencing it raises NameError on a fresh kernel.)
assert "label" not in test.columns, "💥  test file still holds labels!"

In [None]:
# Sanity check: the raw training rows must be in ascending index order.
# NOTE(review): the TimeSeriesSplit folds above are taken by row position,
# so this presumably belongs right after the data is loaded rather than
# after training -- confirm.
assert df_train_raw.index.is_monotonic_increasing