In [16]:
import pandas as pd
import numpy as np
import gc, os, psutil
from typing import List, Tuple
import lightgbm as lgb
from scipy.stats import pearsonr
from sklearn.model_selection import TimeSeriesSplit
from catboost import CatBoostRegressor, Pool
from joblib import dump
import warnings
from src.utils.light_preprocess import Preprocessor
warnings.filterwarnings('ignore', category=UserWarning)

input_folder = '../../data/'
output_folder = '../../outputs/'
seed = 42
n_splits = 5

def mem_mb() -> float:
    """Return current RSS in megabytes."""
    return psutil.Process(os.getpid()).memory_info().rss / 1e6


In [2]:
df_train_raw = pd.read_parquet(f'{input_folder}train.parquet')
df_test_raw = pd.read_parquet(f'{input_folder}test.parquet')
%store df_train_raw
%store df_test_raw

Stored 'df_train_raw' (DataFrame)
Stored 'df_test_raw' (DataFrame)


In [3]:
def variance_filter(
    df: pd.DataFrame, target_col: str = "label", thresh_ratio: float = 0.1
) -> pd.DataFrame:
    """
    Remove feature columns whose variance is < `thresh_ratio` x var(label).

    Args:
        df : DataFrame
            Training frame (label + features).
        thresh_ratio : float
            Keep features whose var >= thresh_ratio * var(label).

    Returns:
        DataFrame
    """
    label_var = df[target_col].var()
    thresh = label_var * thresh_ratio

    keep_cols = [target_col] + [
        c for c in df.columns if c != target_col and df[c].var() >= thresh
    ]
    pruned = df[keep_cols].copy()
    print(f"Variance filter kept {len(keep_cols)-1} of {df.shape[1]-1} columns")
    return pruned

In [4]:
df_train = variance_filter(df_train_raw)

  sqr = _ensure_numeric((avg - values) ** 2)


Variance filter kept 855 of 895 columns


In [5]:
def corr_cluster_select(
    df: pd.DataFrame, target: str = "label", thresh: float = 0.90
) -> pd.DataFrame:
    """
    Target-aware correlation-clustering filter.

    Keeps at most **one** feature from every group of highly-correlated
    columns (|corr| ≥ `thresh`).  For each group the survivor is the
    feature with the strongest absolute correlation to the target.
    Args:
        df : DataFrame
            Input frame that still contains `target`.
        thresh : float
            Absolute Pearson correlation threshold that triggers grouping
            (default 0.90).
    Returns:
        DataFrame
            Same rows, but with duplicates pruned.
    """
    feats = df.drop(columns=[target])
    y = df[target]

    corr = feats.astype("float32").corr().abs()
    mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
    upper = corr.where(mask)

    survivors: List[str] = []
    processed = set()  # columns already clustered

    for col in upper.columns:
        if col in processed:
            continue

        # all features strongly correlated with `col`
        cluster = upper.index[upper[col] >= thresh].tolist()
        cluster.append(col)

        processed.update(cluster)

        # pick the column with largest |corr| to the label
        best = feats[cluster].corrwith(y).abs().idxmax()
        survivors.append(best)

    # deduplicate while preserving order
    survivors = list(dict.fromkeys(survivors))

    kept_df = df[[target] + survivors].copy()
    print(
        f"Correlation filter kept {len(survivors)} "
        f"of {feats.shape[1]} columns (thresh={thresh})"
    )
    return kept_df

In [6]:
df_train = corr_cluster_select(df_train)

Correlation filter kept 630 of 855 columns (thresh=0.9)


In [7]:
%store df_train

Stored 'df_train' (DataFrame)


In [8]:
def compute_feature_importance(
    df: pd.DataFrame,
    target_col: str = "label",
    n_splits: int = 5,
    seed: int = 42,
    importance_type: str = "gain",  # "gain" or "split",
    var_ratio: float = 0.1,
) -> pd.DataFrame:
    """
    Train LightGBM on each fold, average feature importances, and
    return a ranked table.

    Args:
        df : DataFrame
            Raw training frame (timestamp index, features + label).
        target_col : str
            Column name of the regression target.
        n_splits : int
            TimeSeriesSplit folds.
        importance_type : str
            "gain" - total gain of splits (default, more robust)
            "split" - number of times the feature is used in splits.

    Returns:
        DataFrame with columns [feature, importance] sorted descending.
    """
    y = df[target_col]
    x = df.drop(columns=[target_col])

    tscv = TimeSeriesSplit(n_splits=n_splits)
    imp_accum = pd.Series(0.0, index=x.columns)

    params = dict(
        objective="regression",
        learning_rate=0.1,
        num_leaves=256,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        seed=seed,
        verbose=-1,
    )

    for fold, (tr_idx, val_idx) in enumerate(tscv.split(x), 1):
        print(f"Training fold {fold}/{n_splits} …", end="\r")

        x_tr = x.iloc[tr_idx].astype("float32")
        y_tr = y.iloc[tr_idx]
        x_val = x.iloc[val_idx].astype("float32")
        y_val = y.iloc[val_idx]

        tr_ds = lgb.Dataset(x_tr, y_tr, free_raw_data=False)
        val_ds = lgb.Dataset(x_val, y_val, reference=tr_ds, free_raw_data=False)

        model = lgb.train(
            params,
            tr_ds,
            num_boost_round=1000,
            valid_sets=[val_ds],
            callbacks=[lgb.early_stopping(200)],
        )

        imp_accum += pd.Series(
            model.feature_importance(importance_type=importance_type),
            index=x.columns,
            dtype="float64",
        )

        del model, tr_ds, val_ds, x_tr, x_val  # <── release memory now
        gc.collect()

    imp_df = (
        imp_accum.div(n_splits)
        .sort_values(ascending=False)
        .reset_index()
        .rename(columns={"index": "feature", 0: "importance"})
    )

    return imp_df

In [9]:
# # Open this if variance and feature correlation are not used
# df_train = df_train_raw.copy()
# df_test = df_test_raw.copy()

In [10]:
# No need to run this if you already have the feature importance
# from a previous run, just load it instead.
# imp_df = pd.read_csv(f"{output_folder}feature_importance.csv")
imp_df = compute_feature_importance(
    df_train, target_col="label", n_splits=5, var_ratio=0.1
)
imp_df.to_csv(f"{output_folder}feature_importance.csv", index=False)
top_n = 48  # determined by optimize_topn_features with optuna method previously
print(f"Top {top_n} features:\n", imp_df.head(int(top_n)))

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 0.960421
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 1.02821
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 1.04097
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 0.943593
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 1.14101
Top 48 features:
    feature   importance
0     X686  2562.972731
1     X748  2480.580946
2     X466  2360.977019
3     X862  2200.282824
4     X885  2184.692510
5     X873  1933.185632
6     X344  1620.960222
7     X537  1586.908585
8      X86  1512.748166
9     X752  1469.943217
10    X786  1427.386214
11    X272  1411.428012
12    X465  1407.638287
13    X198  1362.122412
14   

In [11]:
cols_to_expand = imp_df.head(top_n)["feature"].tolist()
train_reduced = df_train_raw[["label", *cols_to_expand, "volume", "sell_qty", "buy_qty"]].copy()
test_reduced = df_test_raw[[*cols_to_expand, "volume", "sell_qty", "buy_qty"]].copy()
# x_train = train_reduced.drop(columns=["label"])
# y_train = df_train["label"]
%store train_reduced
%store test_reduced

Stored 'train_reduced' (DataFrame)
Stored 'test_reduced' (DataFrame)


In [12]:
def save_submission(preds, sample_path=f"{input_folder}sample_submission.csv",
                    out_path=f"{output_folder}submission_cat.csv"):
    sub = pd.read_csv(sample_path)
    assert len(sub) == len(preds)
    sub[sub.columns[-1]] = preds.astype(np.float32)
    sub.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")

In [18]:
feature_cols = [c for c in train_reduced.columns if c not in ["label"]]
cat_cols = []      # or [] if not present

x_train = train_reduced[feature_cols]
y_train = train_reduced["label"]
x_test  = test_reduced.copy()

print("Train shape:", x_train.shape, "Test shape:", x_test.shape,
      "RAM:", mem_mb(), "MB")

Train shape: (525887, 51) Test shape: (538150, 51) RAM: 2151.890944 MB


In [20]:
tscv = TimeSeriesSplit(n_splits=5)
oof_pred = np.full(len(y_train), np.nan, dtype=np.float32)
models   = []

for fold, (tr_idx, val_idx) in enumerate(tscv.split(x_train), 1):
    x_tr, y_tr = x_train.iloc[tr_idx], y_train.iloc[tr_idx]
    x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]

    train_pool = Pool(x_tr, y_tr)       # cat_cols empty → fine
    val_pool   = Pool(x_val, y_val)

    model = CatBoostRegressor(
        iterations=8000,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=3,
        loss_function="RMSE",           # ← built-in
        eval_metric="RMSE",
        random_seed=42 + fold,
        od_type="Iter",
        od_wait=300,                    # early-stop patience
        verbose=400,
        task_type="CPU"
    )
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    pred_val = model.predict(val_pool)
    oof_pred[val_idx] = pred_val

    r_fold = pearsonr(y_val, pred_val)[0]
    print(f"fold {fold}: Pearson = {r_fold:.5f}")

    models.append(model)
    del train_pool, val_pool, x_tr, x_val
    gc.collect()

mask = ~np.isnan(oof_pred)
oof_r = pearsonr(y_train.iloc[mask], oof_pred[mask])[0]
print(f"\nOOF Pearson (whole series) = {oof_r:.5f}")


0:	learn: 1.0121026	test: 0.9796283	best: 0.9796283 (0)	total: 83.9ms	remaining: 11m 11s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.9796282856
bestIteration = 0

Shrink model to first 1 iterations.
fold 1: Pearson = 0.00236
0:	learn: 0.9955341	test: 1.0054654	best: 1.0054654 (0)	total: 22.5ms	remaining: 2m 59s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 1.005465418
bestIteration = 0

Shrink model to first 1 iterations.
fold 2: Pearson = 0.02352
0:	learn: 0.9973528	test: 1.0098668	best: 1.0098668 (0)	total: 22.9ms	remaining: 3m 3s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 1.008785554
bestIteration = 3

Shrink model to first 4 iterations.
fold 3: Pearson = 0.03324
0:	learn: 0.9996072	test: 0.9753651	best: 0.9753651 (0)	total: 27.2ms	remaining: 3m 37s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.9739470073
bestIteration = 8

Shrink model to first 9 iterations.
fold 4: Pearson = 0.08467
0:	lea

In [21]:
best_iters = int(np.mean([m.get_best_iteration() for m in models]))
full_pool = Pool(x_train, y_train)

full_model = CatBoostRegressor(
    iterations=best_iters,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    loss_function="RMSE",
    random_seed=999,
    verbose=False,
    task_type="CPU"
)
full_model.fit(full_pool)

preds = full_model.predict(Pool(x_test))


In [22]:
sample = pd.read_csv(f"{input_folder}sample_submission.csv")
sample[sample.columns[-1]] = preds.astype(np.float32)
sample.to_csv(f"{output_folder}submission_cat.csv", index=False)
print("Saved submission_cat.csv")

Saved submission_cat.csv
