In [17]:
import pandas as pd
import numpy as np
import gc, os, psutil
from typing import List, Tuple
import lightgbm as lgb
from scipy.stats import pearsonr
from sklearn.model_selection import TimeSeriesSplit
from joblib import dump
import warnings
from src.utils.light_preprocess import Preprocessor
warnings.filterwarnings('ignore', category=UserWarning)

input_folder = '../../data/'
output_folder = '../../outputs/'
seed = 42
n_splits = 5

def mem_mb() -> float:
    """Return current RSS in megabytes."""
    return psutil.Process(os.getpid()).memory_info().rss / 1e6


In [None]:
df_train_raw = pd.read_parquet(f'{input_folder}train.parquet')
df_test_raw = pd.read_parquet(f'{input_folder}test.parquet')
%store df_train_raw
%store df_test_raw

Stored 'df_train_raw' (DataFrame)
Stored 'df_test_raw' (DataFrame)


In [3]:
def variance_filter(
    df: pd.DataFrame, target_col: str = "label", thresh_ratio: float = 0.1
) -> pd.DataFrame:
    """
    Remove feature columns whose variance is < `thresh_ratio` x var(label).

    Args:
        df : DataFrame
            Training frame (label + features).
        thresh_ratio : float
            Keep features whose var >= thresh_ratio * var(label).

    Returns:
        DataFrame
    """
    label_var = df[target_col].var()
    thresh = label_var * thresh_ratio

    keep_cols = [target_col] + [
        c for c in df.columns if c != target_col and df[c].var() >= thresh
    ]
    pruned = df[keep_cols].copy()
    print(f"Variance filter kept {len(keep_cols)-1} of {df.shape[1]-1} columns")
    return pruned

In [4]:
df_train = variance_filter(df_train_raw)

  sqr = _ensure_numeric((avg - values) ** 2)


Variance filter kept 855 of 895 columns


In [5]:
def corr_cluster_select(
    df: pd.DataFrame, target: str = "label", thresh: float = 0.90
) -> pd.DataFrame:
    """
    Target-aware correlation-clustering filter.

    Keeps at most **one** feature from every group of highly-correlated
    columns (|corr| ≥ `thresh`).  For each group the survivor is the
    feature with the strongest absolute correlation to the target.
    Args:
        df : DataFrame
            Input frame that still contains `target`.
        thresh : float
            Absolute Pearson correlation threshold that triggers grouping
            (default 0.90).
    Returns:
        DataFrame
            Same rows, but with duplicates pruned.
    """
    feats = df.drop(columns=[target])
    y = df[target]

    corr = feats.astype("float32").corr().abs()
    mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
    upper = corr.where(mask)

    survivors: List[str] = []
    processed = set()  # columns already clustered

    for col in upper.columns:
        if col in processed:
            continue

        # all features strongly correlated with `col`
        cluster = upper.index[upper[col] >= thresh].tolist()
        cluster.append(col)

        processed.update(cluster)

        # pick the column with largest |corr| to the label
        best = feats[cluster].corrwith(y).abs().idxmax()
        survivors.append(best)

    # deduplicate while preserving order
    survivors = list(dict.fromkeys(survivors))

    kept_df = df[[target] + survivors].copy()
    print(
        f"Correlation filter kept {len(survivors)} "
        f"of {feats.shape[1]} columns (thresh={thresh})"
    )
    return kept_df

In [6]:
df_train = corr_cluster_select(df_train)

Correlation filter kept 630 of 855 columns (thresh=0.9)


In [7]:
%store df_train

Stored 'df_train' (DataFrame)


In [8]:
def compute_feature_importance(
    df: pd.DataFrame,
    target_col: str = "label",
    n_splits: int = 5,
    seed: int = 42,
    importance_type: str = "gain",  # "gain" or "split",
    var_ratio: float = 0.1,
) -> pd.DataFrame:
    """
    Train LightGBM on each fold, average feature importances, and
    return a ranked table.

    Args:
        df : DataFrame
            Raw training frame (timestamp index, features + label).
        target_col : str
            Column name of the regression target.
        n_splits : int
            TimeSeriesSplit folds.
        importance_type : str
            "gain" - total gain of splits (default, more robust)
            "split" - number of times the feature is used in splits.

    Returns:
        DataFrame with columns [feature, importance] sorted descending.
    """
    y = df[target_col]
    x = df.drop(columns=[target_col])

    tscv = TimeSeriesSplit(n_splits=n_splits)
    imp_accum = pd.Series(0.0, index=x.columns)

    params = dict(
        objective="regression",
        learning_rate=0.1,
        num_leaves=256,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        seed=seed,
        verbose=-1,
    )

    for fold, (tr_idx, val_idx) in enumerate(tscv.split(x), 1):
        print(f"Training fold {fold}/{n_splits} …", end="\r")

        x_tr = x.iloc[tr_idx].astype("float32")
        y_tr = y.iloc[tr_idx]
        x_val = x.iloc[val_idx].astype("float32")
        y_val = y.iloc[val_idx]

        tr_ds = lgb.Dataset(x_tr, y_tr, free_raw_data=False)
        val_ds = lgb.Dataset(x_val, y_val, reference=tr_ds, free_raw_data=False)

        model = lgb.train(
            params,
            tr_ds,
            num_boost_round=1000,
            valid_sets=[val_ds],
            callbacks=[lgb.early_stopping(200)],
        )

        imp_accum += pd.Series(
            model.feature_importance(importance_type=importance_type),
            index=x.columns,
            dtype="float64",
        )

        del model, tr_ds, val_ds, x_tr, x_val  # <── release memory now
        gc.collect()

    imp_df = (
        imp_accum.div(n_splits)
        .sort_values(ascending=False)
        .reset_index()
        .rename(columns={"index": "feature", 0: "importance"})
    )

    return imp_df

In [None]:
# # Open this if variance and feature correlation are not used
# df_train = df_train_raw.copy()
# df_test = df_test_raw.copy()

In [9]:
# No need to run this if you already have the feature importance
# from a previous run, just load it instead.
# imp_df = pd.read_csv(f"{output_folder}feature_importance.csv")
imp_df = compute_feature_importance(
    df_train, target_col="label", n_splits=5, var_ratio=0.1
)
imp_df.to_csv(f"{output_folder}feature_importance.csv", index=False)
top_n = 48  # determined by optimize_topn_features with optuna method previously
print(f"Top {top_n} features:\n", imp_df.head(int(top_n)))

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 0.960421
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 1.02821
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 1.04097
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 0.943593
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 1.14101
Top 48 features:
    feature   importance
0     X686  2562.972731
1     X748  2480.580946
2     X466  2360.977019
3     X862  2200.282824
4     X885  2184.692510
5     X873  1933.185632
6     X344  1620.960222
7     X537  1586.908585
8      X86  1512.748166
9     X752  1469.943217
10    X786  1427.386214
11    X272  1411.428012
12    X465  1407.638287
13    X198  1362.122412
14   

In [13]:
cols_to_expand = imp_df.head(top_n)["feature"].tolist()
train_reduced = df_train_raw[["label", *cols_to_expand, "volume", "sell_qty", "buy_qty"]].copy()
test_reduced = df_test_raw[[*cols_to_expand, "volume", "sell_qty", "buy_qty"]].copy()
# x_train = train_reduced.drop(columns=["label"])
# y_train = df_train["label"]
%store train_reduced
%store test_reduced

Stored 'train_reduced' (DataFrame)
Stored 'test_reduced' (DataFrame)


In [15]:
# Build a one-step-ahead LOG-RETURN target
def make_return_target(df: pd.DataFrame, label_col: str = "label") -> pd.Series:
    """
    y_t = log(price_{t+1}) - log(price_t)
    (shift(-1) aligns *current* features with *next* return)
    """
    return np.log(df[label_col]).diff().shift(-1)


# Cross-validation that returns OoF predictions *and* fold models
def cv_lightgbm_returns(x: pd.DataFrame,
                        y_ret: pd.Series,
                        n_splits: int = 5,
                        seed: int = 42):

    tscv = TimeSeriesSplit(n_splits=n_splits)
    oof_pred = np.full(len(y_ret), np.nan, dtype=np.float32)
    models = []

    lgb_params = dict(
        objective="regression",
        metric="None",               # we supply Pearson ourselves
        boosting_type="gbdt",
        learning_rate=0.05,
        num_leaves=256,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        seed=seed,
        verbose=-1,
        num_threads=4,
    )

    def lgb_pearson(preds, train_data):
        r = np.corrcoef(train_data.get_label(), preds)[0, 1]
        return "pearson", r, True

    for fold, (tr_idx, val_idx) in enumerate(tscv.split(x), 1):
        x_tr, y_tr = x.iloc[tr_idx], y_ret.iloc[tr_idx]
        x_val, y_val = x.iloc[val_idx], y_ret.iloc[val_idx]

        m = lgb.train(
            lgb_params,
            lgb.Dataset(x_tr, y_tr),
            num_boost_round=4000,
            valid_sets=[lgb.Dataset(x_val, y_val)],
            feval=lgb_pearson,
            callbacks=[
                lgb.early_stopping(400, first_metric_only=True),
                lgb.log_evaluation(400),
            ],
        )
        models.append(m)

        oof_pred[val_idx] = m.predict(x_val, num_iteration=m.best_iteration)
        print(f"fold {fold}: Pearson = {pearsonr(y_val, oof_pred[val_idx])[0]:.5f}")

        del x_tr, y_tr, x_val, y_val
        gc.collect()

    mask = ~np.isnan(oof_pred)
    oof_r = pearsonr(y_ret.iloc[mask], oof_pred[mask])[0]
    print(f"OOF Pearson (whole series) = {oof_r:.5f}")

    best_iter = int(np.mean([m.best_iteration for m in models]))
    return oof_pred, models, best_iter, oof_r


# Re-train one final booster on *all* data, predict returns on test,
# then reconstruct price-level forecasts
def train_full_and_predict(models, best_iter,
                           x_train, y_ret, x_test,
                           last_train_price: float) -> np.ndarray:
    """
    last_train_price  - price at row T (the final row of the training period);
                        needed to reconstruct price_t+1 from predicted return_t.
    """
    full_ds = lgb.Dataset(x_train, y_ret)
    full_model = lgb.train(models[0].params, full_ds,
                           num_boost_round=best_iter)

    ret_pred = full_model.predict(x_test)

    # price_{t+1} = price_t * exp(return_t)
    price_pred = np.empty_like(ret_pred)
    p_prev = last_train_price
    for i, r in enumerate(ret_pred):
        p_next = p_prev * np.exp(r)
        price_pred[i] = p_next
        p_prev = p_next            # chain for multi-step test horizon

    return price_pred


# End-to-end helper you call from your notebook / script
def run_return_lgbm(train_df: pd.DataFrame,
                    test_df: pd.DataFrame,
                    feature_cols: list | None = None):
    """
    train_df  - your already-filtered frame that *still contains* 'label'
    test_df   - same features (no label)
    """
    print("building return target")
    y_ret = make_return_target(train_df).dropna()
    train_df = train_df.loc[y_ret.index]          # drop first row (NaN return)

    x_train = train_df.drop(columns=["label"]) if feature_cols is None \
              else train_df[feature_cols]
    x_test  = test_df[x_train.columns]            # keep same order

    print("CV training")
    oof_pred, models, best_iter, _ = cv_lightgbm_returns(x_train, y_ret)

    last_price = train_df["label"].iloc[-1]
    print(f"full-data training (best_iter={best_iter}) and test prediction")
    price_pred = train_full_and_predict(models, best_iter,
                                        x_train, y_ret, x_test,
                                        last_price)

    return price_pred


In [None]:
price_pred = run_return_lgbm(train_reduced, test_reduced)

sample = pd.read_csv(f"{input_folder}sample_submission.csv")
sample[sample.columns[-1]] = price_pred.astype(np.float32)
sample.to_csv(f"{output_folder}submission_return_lgbm.csv", index=False)

building return target
CV training


  result = getattr(ufunc, method)(*inputs, **kwargs)


Training until validation scores don't improve for 400 rounds
[400]	valid_0's pearson: 0.0164427
Early stopping, best iteration is:
[11]	valid_0's pearson: 0.0197526
Evaluated only: pearson
fold 1: Pearson = 0.01975
Training until validation scores don't improve for 400 rounds
[400]	valid_0's pearson: 0.0149598
Early stopping, best iteration is:
[19]	valid_0's pearson: 0.0189252
Evaluated only: pearson
fold 2: Pearson = 0.01893
Training until validation scores don't improve for 400 rounds
[400]	valid_0's pearson: 0.0143096
Early stopping, best iteration is:
[126]	valid_0's pearson: 0.0161513
Evaluated only: pearson
fold 3: Pearson = 0.01615
Training until validation scores don't improve for 400 rounds
[400]	valid_0's pearson: 0.00664923
[800]	valid_0's pearson: 0.00550347
Early stopping, best iteration is:
[552]	valid_0's pearson: 0.00720419
Evaluated only: pearson
fold 4: Pearson = 0.00720
Training until validation scores don't improve for 400 rounds
[400]	valid_0's pearson: 0.0048040

  p_next = p_prev * np.exp(r)
  sample[sample.columns[-1]] = price_pred.astype(np.float32)


: 