In [2]:
import qlib
import optuna
import pickle
import pandas as pd
from utils import prints
from qlib.data.dataset import DatasetH
from datetime import datetime, timedelta
from qlib.data import D
import numpy as np
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandler
import lightgbm as lgb


In [3]:
# ----------------------------
# Config
# ----------------------------
START_DATE = "2018-01-01"
END_DATE = (datetime.today() - timedelta(days=30)).strftime("%Y-%m-%d")    #END_DATE = "2030-10-18" Adjust this if it caused error

INSTRUMENTS = "all"
MAX_TRIALS = 50
MODEL_PATH = "trained_model_2.pkl"



In [4]:
class LoaderWrapper(DataHandler):
    def __init__(self, loader):
        # Defensive extraction
        feature_df = loader._config.get("feature")
        label_df = loader._config.get("label")

        if not isinstance(feature_df, pd.DataFrame) or not isinstance(label_df, pd.DataFrame):
            raise TypeError("Expected DataFrames for 'feature' and 'label'")

        self.data_loader = loader
        self._data = pd.concat({"feature": feature_df, "label": label_df}, axis=1)

        # Required attributes for DatasetH
        prints(feature_df.index.names)
        prints(feature_df.head())
        self.instruments = sorted(set(feature_df.index.get_level_values("instrument")))
        self.start_time = str(feature_df.index.get_level_values("datetime").min().date())
        self.end_time = str(feature_df.index.get_level_values("datetime").max().date())
        self.fetch_orig = True

    def fetch(self, instruments=None, start_time=None, end_time=None, freq="day", col_set="__all", data_key=None):
        if col_set == "__all":
            return self._data
        return self._data.xs(col_set, axis=1, level=0)


In [5]:

qlib.init(provider_uri="C:/Users/harve/.qlib/qlib_data/us_data", region="us")



[45516:MainThread](2025-11-30 01:38:51,518) INFO - qlib.Initialization - [config.py:452] - default_conf: client.


[45516:MainThread](2025-11-30 01:38:51,526) INFO - qlib.Initialization - [__init__.py:79] - qlib successfully initialized based on client settings.
[45516:MainThread](2025-11-30 01:38:51,529) INFO - qlib.Initialization - [__init__.py:81] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/harve/.qlib/qlib_data/us_data')}


In [6]:
instrument_path = r"C:\Users\harve\.qlib\qlib_data\us_data\instruments\all.txt"

with open(instrument_path, "r") as f:
    instrumentx = [line.strip().split("\t")[0] for line in f if line.strip()]

fields = ["$open", "$high", "$low", "$close", "$volume",
    "$vol_5d", "$rank_vol_5d",
    # "$ret_5d", "$rank_ret_5d",
    # "$ret_10d", "$vol_10d", "$rank_ret_10d", "$rank_vol_10d",
    # "$ret_20d", "$vol_20d", "$rank_ret_20d", "$rank_vol_20d",
    ]
features = D.features(
    instruments=instrumentx,
    fields=fields,
    start_time=START_DATE,
    end_time=END_DATE
)

labels = D.features(
    instruments=instrumentx,
    fields=["$ensemble_label"],
    start_time=START_DATE,
    end_time=END_DATE
)
from qlib.data.dataset.loader import StaticDataLoader

loader = StaticDataLoader(config={
    "feature": features,
    "label": labels
})


In [16]:
handler = LoaderWrapper(loader)

END_TRAIN_DATE = (datetime.today() - timedelta(days=95)).strftime("%Y-%m-%d")
START_VALID_DATE = (datetime.today() - timedelta(days=90)).strftime("%Y-%m-%d")
dataset = DatasetH(
    handler=handler,
    segments={
        "train": (START_DATE, END_TRAIN_DATE),
        "valid": (START_VALID_DATE, END_DATE)
    }
)

df = dataset.prepare("train")

X = df.xs("feature", axis=1, level=0)
y = df.xs("label", axis=1, level=0)

X["$volume_log"] = np.log1p(X["$volume"])
X.drop(columns=["$volume"], inplace=True)

y_flat = y.squeeze()
y_flat.index = X.index
y_flat = y_flat.loc[X.index]

# Drop rows with NaN labels
mask = ~y_flat.isna()
X = X.loc[mask]
y_flat = y_flat.loc[mask]

# Optionally also drop rows with NaN in critical features
X = X.dropna(subset=["$vol_5d", "$rank_vol_5d"])
y_flat = y_flat.loc[X.index]

prints(f"Training rows after cleaning: {len(X)}")
prints(f"Remaining NaN labels: {y_flat.isna().sum()}")
prints(f"Remaining NaN features: {X.isna().sum().sum()}")

# ðŸ“Š Correlation diagnostics
for col in ["$ret_5d", "$ret_10d", "$ret_20d"]:
    if col in X.columns:
        corr = X[col].corr(y_flat)
        prints(f"Correlation with ensemble_label: {col:<10} â†’ {corr:.4f}")


['instrument', 'datetime']
                          $open     $high      $low    $close       $volume  \
instrument datetime                                                           
AAPL       2018-01-02  0.987809  1.000232  0.982584  1.000000  4.402259e+09   
           2018-01-03  1.001567  1.013294  0.998258  0.999826  5.084753e+09   
           2018-01-04  1.001626  1.007024  0.998955  1.004470  3.864584e+09   
           2018-01-05  1.006850  1.018054  1.004586  1.015906  4.075671e+09   
           2018-01-08  1.012133  1.019448  1.009695  1.012133  3.543008e+09   

                       $vol_5d  $rank_vol_5d  
instrument datetime                           
AAPL       2018-01-02      NaN           NaN  
           2018-01-03      NaN           NaN  
           2018-01-04      NaN           NaN  
           2018-01-05      NaN           NaN  
           2018-01-08      NaN           NaN  


Training rows after cleaning: 68418
Remaining NaN labels: 0
Remaining NaN features: 0


In [17]:
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "mse",
        "num_leaves": trial.suggest_int("num_leaves", 32, 256),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 24),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X, y_flat)

    preds = model.predict(X)
    mse = np.mean((preds - y_flat.values.flatten())**2)
    return mse


In [19]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2)

prints(f"Best value: {study.best_trial.value}")
prints(f"  MSE: {study.best_value:.6f}")
prints(study.best_trial.params)

best_params = study.best_trial.params
model = lgb.LGBMRegressor(**best_params)
model.fit(X, y_flat)

with open(MODEL_PATH, "wb") as f:
    pickle.dump({"model": model, "columns": X.columns.tolist()}, f)

prints(f"\nðŸ“¦ Tuned model saved to {MODEL_PATH}")


[I 2025-12-04 16:46:27,984] A new study created in memory with name: no-name-38262df6-b1ee-44fe-887b-4eded0d3aa61


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1570
[LightGBM] [Info] Number of data points in the train set: 68418, number of used features: 7
[LightGBM] [Info] Start training from score 0.010857


[I 2025-12-04 16:46:28,896] Trial 0 finished with value: 0.003882224248752172 and parameters: {'num_leaves': 76, 'learning_rate': 0.06791558413490847, 'n_estimators': 222, 'max_depth': 18, 'min_child_samples': 49, 'subsample': 0.5536796022769427, 'colsample_bytree': 0.8074305243693431}. Best is trial 0 with value: 0.003882224248752172.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1570
[LightGBM] [Info] Number of data points in the train set: 68418, number of used features: 7
[LightGBM] [Info] Start training from score 0.010857


[I 2025-12-04 16:46:30,441] Trial 1 finished with value: 0.0034493901058296164 and parameters: {'num_leaves': 235, 'learning_rate': 0.09785861179451893, 'n_estimators': 356, 'max_depth': 11, 'min_child_samples': 67, 'subsample': 0.9205607333846189, 'colsample_bytree': 0.9675086770974517}. Best is trial 1 with value: 0.0034493901058296164.


Best value: 0.0034493901058296164
  MSE: 0.003449
{'num_leaves': 235, 'learning_rate': 0.09785861179451893, 'n_estimators': 356, 'max_depth': 11, 'min_child_samples': 67, 'subsample': 0.9205607333846189, 'colsample_bytree': 0.9675086770974517}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1570
[LightGBM] [Info] Number of data points in the train set: 68418, number of used features: 7
[LightGBM] [Info] Start training from score 0.010857

ðŸ“¦ Tuned model saved to trained_model_2.pkl


In [20]:
importances = model.feature_importances_
features = model.feature_name_
for name, score in sorted(zip(features, importances), key=lambda x: x[1], reverse=True):
    prints(f"Feature: {name:<20} Importance: {score}")

# Step 1: Prepare validation features
X_valid = dataset.prepare("valid", col_set="feature")
y_valid = dataset.prepare("valid", col_set="label")

# Step 2: Apply same feature engineering
X_valid["$volume_log"] = np.log1p(X_valid["$volume"])
X_valid.drop(columns=["$volume"], inplace=True)

# Step 3: Drop rows with NaN labels
y_valid_flat = y_valid.squeeze()
mask = ~y_valid_flat.isna()
X_valid = X_valid.loc[mask]
y_valid_flat = y_valid_flat.loc[mask]

# Step 4: Predict
preds_valid = model.predict(X_valid)

# Step 5: Evaluate MSE
from sklearn.metrics import mean_squared_error
mse_valid = mean_squared_error(y_valid_flat, preds_valid)
prints(f"Validation MSE: {mse_valid}")

# Step 6: Optional â€” log IC
from scipy.stats import spearmanr
ic = spearmanr(preds_valid, y_valid_flat.values).correlation
prints(f"Validation IC: {ic}")


Feature: $volume_log          Importance: 6630
Feature: $vol_5d              Importance: 6517
Feature: $rank_vol_5d         Importance: 3844
Feature: $open                Importance: 2671
Feature: $high                Importance: 2643
Feature: $low                 Importance: 2609
Feature: $close               Importance: 2422
Validation MSE: 0.0034613076095945557
Validation IC: 0.5110513687298898
