In [1]:
from ml_market.data import fetch_ohlcv, load_sector_data, load_macro_data
from ml_market.features import compute_all_features

# Ticker universe and the date window shared by every loader call below.
TICKERS = ["AAPL"]
START, END = "2010-01-01", "2025-01-01"

# Load the stock, sector and macro inputs over the same START..END window.
stocks_df = fetch_ohlcv(TICKERS, start=START, end=END)
sector_df = load_sector_data(start=START, end=END)
macro_df = load_macro_data(start=START, end=END)

# Build the feature table, order rows by (date, ticker) and renumber the
# index 0..n-1 so positional (iloc) slicing follows that order.
df = (
    compute_all_features(stocks_df, sector_df, macro_df)
    .sort_values(["date", "ticker"])
    .reset_index(drop=True)
)
df.head()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,date,ticker,open,high,low,close_x,volume,ret1d,ret5d,ret10d,...,vix_close,vix_ret_1d,vix_vol_20,vix_mom_10,stock_vs_sector,stock_vs_spy,qqq_vs_spy,regime_vol,regime_trend,regime_momentum
0,2010-10-18,AAPL,9.551249,9.567143,9.425886,9.537153,1093010800,0.010358,0.076652,0.141258,...,19.09,0.003153,0.045021,-0.188695,0.003716,0.00543,-0.008618,low,up,strong
1,2010-10-19,AAPL,9.099282,9.410288,8.997912,9.281927,1232784000,-0.026761,0.036679,0.071122,...,20.629999,0.08067,0.048001,-0.05193,-0.009029,-0.013657,0.003748,low,neutral,neutral
2,2010-10-20,AAPL,9.26723,9.424683,9.20335,9.313116,721624400,0.00336,0.034617,0.073792,...,19.790001,-0.040717,0.048663,-0.079107,-0.005037,-0.006406,-0.002486,low,neutral,strong
3,2010-10-21,AAPL,9.368003,9.439382,9.201253,9.282829,551460000,-0.003252,0.02385,0.070189,...,19.27,-0.026276,0.046307,-0.106215,-0.004084,-0.005458,-0.000252,low,up,strong
4,2010-10-22,AAPL,9.269329,9.298421,9.186255,9.221344,372778000,-0.006624,-0.023099,0.045567,...,18.780001,-0.025428,0.042451,-0.093192,-0.009535,-0.008486,0.004961,low,neutral,strong


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# =========================
# WALK-FORWARD SPLIT MAKER
# =========================
def walk_forward_splits(df_length, test_size=200, step=200):
    """Build expanding-window walk-forward (train, test) positional splits.

    Every split trains on positions ``[0, start)`` and tests on the next
    ``test_size`` positions ``[start, start + test_size)``. ``start`` begins
    at ``test_size`` and advances by ``step`` after each split. Only complete
    test windows are produced; a trailing partial window is dropped.

    Parameters
    ----------
    df_length : int
        Total number of rows available.
    test_size : int, default 200
        Number of rows in each test window; also the size of the first
        training window.
    step : int, default 200
        How far the train/test boundary moves between consecutive splits.

    Returns
    -------
    list of (list of int, list of int)
        ``(train_idx, test_idx)`` pairs of positional indices, in
        chronological order. Empty when no complete test window fits.

    Raises
    ------
    ValueError
        If ``test_size`` or ``step`` is not positive (a non-positive ``step``
        would otherwise never terminate).
    """
    if test_size <= 0:
        raise ValueError(f"test_size must be positive, got {test_size}")
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")

    splits = []
    start = test_size

    # `<=` (not `<`): a test window ending exactly on the last row is still a
    # complete window and must be kept.
    while start + test_size <= df_length:
        train_idx = list(range(start))
        test_idx = list(range(start, start + test_size))
        splits.append((train_idx, test_idx))
        start += step

    return splits


# =========================
# EVALUATION FUNCTION
# =========================
def evaluate_walk_forward(model, X, y, splits):
    """Fit and score ``model`` on each walk-forward split.

    For every ``(train_idx, test_idx)`` pair in ``splits`` the rows are
    selected positionally from ``X`` and ``y``, ``model.fit`` is called on the
    training rows (the same ``model`` object is reused for every split) and
    ``model.predict`` on the test rows. Three metrics are printed and
    recorded per split:

    * ``rmse`` - square root of the mean squared error,
    * ``corr`` - correlation coefficient between targets and predictions,
    * ``direction_acc`` - fraction of predictions whose sign equals the
      sign of the target.

    After the loop the per-column means of the recorded metrics are printed.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X, y : objects supporting ``.iloc`` positional selection
    splits : iterable of (train_idx, test_idx)

    Returns
    -------
    pandas.DataFrame
        One row per split with columns ``split``, ``rmse``, ``corr`` and
        ``direction_acc``.
    """
    records = []

    for fold_no, (fit_rows, eval_rows) in enumerate(splits, start=1):
        X_fit, X_eval = X.iloc[fit_rows], X.iloc[eval_rows]
        y_fit, y_eval = y.iloc[fit_rows], y.iloc[eval_rows]

        model.fit(X_fit, y_fit)
        y_hat = model.predict(X_eval)

        fold_rmse = np.sqrt(mean_squared_error(y_eval, y_hat))
        fold_corr = np.corrcoef(y_eval, y_hat)[0, 1]
        # Share of test rows where prediction and target have the same sign.
        same_sign = np.sign(y_hat) == np.sign(y_eval)
        fold_dir_acc = same_sign.mean()

        print(
            f"Split {fold_no}: RMSE={fold_rmse:.5f}, "
            f"Corr={fold_corr:.5f}, DirAcc={fold_dir_acc:.3f}"
        )

        record = {
            "split": fold_no,
            "rmse": float(fold_rmse),
            "corr": float(fold_corr),
            "direction_acc": float(fold_dir_acc),
        }
        records.append(record)

    # Aggregate view: mean of each numeric metric across all splits.
    summary = pd.DataFrame(records)
    print("\n===== SUMMARY =====")
    print(summary.mean(numeric_only=True))

    return summary


Num splits: 16


In [6]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Candidate tree-ensemble regressors keyed by a short label.
# Every estimator uses random_state=42 so repeated runs are comparable.
models = {
    "RF": RandomForestRegressor(
        n_estimators=400,
        max_depth=10,
        max_features="sqrt",
        n_jobs=-1,
        random_state=42
    ),
    "XGB": XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",
        random_state=42
    ),
    "LGBM": LGBMRegressor(
        n_estimators=600,
        learning_rate=0.05,
        subsample=0.8,
        # LightGBM applies `subsample` only when bagging is switched on via
        # subsample_freq > 0 (its default of 0 disables bagging), so without
        # this setting the 0.8 row-subsampling above is silently ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42
    ),
}