# Simple RandomForestRegressor - Stock Prediction

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder

from ml_market.data import fetch_ohlcv, load_sector_data, load_macro_data
from ml_market.features import compute_all_features

In [2]:
TICKERS = [
    'NVDA',
    'MSFT',
    'AAPL',
    'AVGO',
    'AMZN',
    'TSLA',
    'META',
    'GOOGL',
    'GOOG',
    'NFLX',
]
START = "2015-01-01"
END = "2025-01-01"

# Load data
stocks_df = fetch_ohlcv(TICKERS, START, END)
sector_df = load_sector_data(start=START, end=END)
macro_df = load_macro_data(start=START, end=END)

df = compute_all_features(stocks_df, sector_df, macro_df)
print(f"Shape: {df.shape}")

# map tickers -> integer codes
unique_tickers = sorted(df['ticker'].unique())
ticker_to_code = {t: i for i, t in enumerate(unique_tickers)}
df['ticker_code'] = df['ticker'].map(ticker_to_code)
# drop ticker string
df = df.drop(columns=['ticker'])
# move ticker_code to first col
cols = ["ticker_code"] + [c for c in df.columns if c != "ticker_code"]
df = df[cols]

print("Ticker mapping:", ticker_to_code)
print(f"Final shape with ticker_code: {df.shape}")

df = df.sort_values(['date', 'ticker_code'])
df.head(len(ticker_to_code))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Shape: (23140, 112)
Ticker mapping: {'AAPL': 0, 'AMZN': 1, 'AVGO': 2, 'GOOG': 3, 'GOOGL': 4, 'META': 5, 'MSFT': 6, 'NFLX': 7, 'NVDA': 8, 'TSLA': 9}
Final shape with ticker_code: (23140, 112)


Unnamed: 0,ticker_code,date,open,high,low,close,volume,lag0_return_1d,lag1_return_1d,lag5_return_1d,...,dxy_ret_1d,dxy_vol_20,dxy_mom_10,vix_close,vix_ret_1d,vix_vol_20,vix_mom_10,spread_stock_vs_sector,spread_stock_vs_spy,spread_qqq_vs_spy
0,0,2015-10-16,25.095493,25.144885,24.814858,24.929358,156930400,-0.007331,0.014971,0.023927,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,-0.009241,-0.011877,-0.000367
1,1,2015-10-16,28.2635,28.547001,28.015499,28.538,86316000,0.014793,0.032322,0.012454,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,0.012883,0.010246,-0.000367
2,2,2015-10-16,9.307852,9.31396,9.155139,9.244476,19866000,-0.006564,0.037545,0.001886,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,-0.008474,-0.01111,-0.000367
3,3,2015-10-16,32.979812,33.022519,32.636659,32.88496,32222000,0.000695,0.016248,0.006962,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,-0.001215,-0.003851,-0.000367
4,4,2015-10-16,34.531829,34.572549,34.218988,34.527855,36316000,0.003319,0.018533,0.006357,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,0.001409,-0.001228,-0.000367
5,5,2015-10-16,95.590715,96.992072,94.765798,96.942383,25412900,0.016465,0.020091,0.008327,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,0.014555,0.011919,-0.000367
6,6,2015-10-16,40.895576,41.347845,40.791207,41.321751,26450300,0.010636,0.00707,-0.007165,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,0.008726,0.00609,-0.000367
7,7,2015-10-16,10.021,10.165,9.841,9.899,213405000,-0.020774,-0.082917,-0.013922,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,-0.022684,-0.02532,-0.000367
8,8,2015-10-16,0.668782,0.683363,0.667081,0.677044,448396000,0.015677,0.002558,-0.003821,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,0.013767,0.01113,-0.000367
9,9,2015-10-16,14.869333,15.365333,14.858,15.134,65017500,0.025756,0.020426,-0.026597,...,0.001801,0.004504,-0.013461,15.05,-0.062305,0.075163,-0.28128,0.023846,0.021209,-0.000367


In [3]:
df.columns.tolist()

['ticker_code',
 'date',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'lag0_return_1d',
 'lag1_return_1d',
 'lag5_return_1d',
 'lag10_return_1d',
 'lag20_return_1d',
 'lag0_return_5d',
 'lag1_return_5d',
 'lag5_return_5d',
 'lag10_return_5d',
 'lag20_return_5d',
 'lag0_return_10d',
 'lag1_return_10d',
 'lag5_return_10d',
 'lag10_return_10d',
 'lag20_return_10d',
 'lag0_return_20d',
 'lag1_return_20d',
 'lag5_return_20d',
 'lag10_return_20d',
 'lag20_return_20d',
 'return_rolling_mean_5d',
 'return_rolling_std_5d',
 'return_rolling_std_20d',
 'return_volatility_ratio_5d_20d',
 'return_autocorr_5d',
 'trend_sma_5d',
 'trend_sma_10d',
 'trend_sma_20d',
 'trend_sma_50d',
 'trend_sma_200d',
 'trend_ema_10d',
 'trend_ema_20d',
 'trend_ema_50d',
 'trend_sma_diff_5d_20d',
 'trend_sma_diff_10d_50d',
 'trend_macd',
 'trend_macd_signal',
 'trend_macd_histogram',
 'trend_price_to_sma_20d',
 'trend_price_to_sma_50d',
 'trend_roc_10d',
 'trend_roc_20d',
 'trend_adx_14d',
 'momentum_rsi_14d',
 'mo

In [4]:
drop_cols = [
    "date",
    "target_return_1d",
    "target_return_5d",
    "target_return_10d",
    "target_direction_up_1d",
    "target_direction_3class_1d",
]
suspect_cols = [
'xlk_close',
'xlk_ret_1d',
'xlk_vol_20',
'xlk_mom_10',
'spy_close',
'spy_ret_1d',
'spy_vol_20',
'spy_mom_10',
'qqq_close',
'qqq_ret_1d',
'qqq_vol_20',
'qqq_mom_10',
'tlt_close',
'tlt_ret_1d',
'tlt_vol_20',
'tlt_mom_10',
'dxy_close',
'dxy_ret_1d',
'dxy_vol_20',
'dxy_mom_10',
'vix_close',
'vix_ret_1d',
'vix_vol_20',
'vix_mom_10',
'spread_stock_vs_sector',
'spread_stock_vs_spy',
'spread_qqq_vs_spy'
]
drop_cols_complete = drop_cols + suspect_cols

feature_cols = [c for c in df.columns if c not in drop_cols_complete]
target = "target_return_1d"
dates = df['date']

X = df[feature_cols]
y = df[target]

In [5]:
import numpy as np

# ---- Walk-forward settings ----
TRAIN_FRAC = 0.70
TEST_FRAC  = 0.05
EMBARGO    = 10     # because your max lookahead = 10d

# ---- Prepare date universe ----
unique_dates = np.array(sorted(dates.unique()))
N = len(unique_dates)

train_size = int(N * TRAIN_FRAC)
test_size  = int(N * TEST_FRAC)

splits = []
i = 0

# ---- Build walk-forward splits ----
while i + train_size + EMBARGO + test_size <= N:
    train_dates = unique_dates[i : i + train_size]
    test_dates  = unique_dates[i + train_size + EMBARGO :
                               i + train_size + EMBARGO + test_size]
    
    train_mask = dates.isin(train_dates)
    test_mask  = dates.isin(test_dates)

    X_train, y_train = X[train_mask], y[train_mask]
    X_test,  y_test  = X[test_mask],  y[test_mask]

    splits.append((X_train, y_train, X_test, y_test))
    
    i += test_size   # slide window by size of test period


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

rf_metrics = {
    "rmse": [],
    "mae": [],
    "corr": [],
    "dir_acc": [],
    "signed_error": []
}

for X_train, y_train, X_test, y_test in splits:
    model = RandomForestRegressor(
        n_estimators=400,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # core metrics
    mse = mean_squared_error(y_test, preds)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, preds)

    # correlation (nan-safe)
    corr = np.corrcoef(y_test, preds)[0, 1] if len(y_test) > 2 else np.nan

    # directional accuracy
    dir_acc = np.mean(np.sign(preds) == np.sign(y_test))

    # signed error (bias detection)
    signed_err = np.mean(preds - y_test)

    rf_metrics["rmse"].append(rmse)
    rf_metrics["mae"].append(mae)
    rf_metrics["corr"].append(corr)
    rf_metrics["dir_acc"].append(dir_acc)
    rf_metrics["signed_error"].append(signed_err)

print("RandomForest Metrics:")
for k, v in rf_metrics.items():
    print(f"{k}: {np.nanmean(v):.6f}")


RandomForest Metrics:
rmse: 0.022843
mae: 0.015209
corr: 0.474568
dir_acc: 0.687826
signed_error: 0.000315


In [7]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

xgb_metrics = {
    "rmse": [],
    "mae": [],
    "corr": [],
    "dir_acc": [],
    "signed_error": []
}

for X_train, y_train, X_test, y_test in splits:
    model = XGBRegressor(
        n_estimators=700,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        tree_method="hist"
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mse = mean_squared_error(y_test, preds)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, preds)

    corr = np.corrcoef(y_test, preds)[0, 1] if len(y_test) > 2 else np.nan

    dir_acc = np.mean(np.sign(preds) == np.sign(y_test))
    signed_err = np.mean(preds - y_test)

    xgb_metrics["rmse"].append(rmse)
    xgb_metrics["mae"].append(mae)
    xgb_metrics["corr"].append(corr)
    xgb_metrics["dir_acc"].append(dir_acc)
    xgb_metrics["signed_error"].append(signed_err)

print("XGBoost Metrics:")
for k, v in xgb_metrics.items():
    print(f"{k}: {np.nanmean(v):.6f}")


XGBoost Metrics:
rmse: 0.022706
mae: 0.015093
corr: 0.476626
dir_acc: 0.684000
signed_error: 0.000107


In [8]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

lgb_metrics = {
    "rmse": [],
    "mae": [],
    "corr": [],
    "dir_acc": [],
    "signed_error": []
}

for X_train, y_train, X_test, y_test in splits:
    model = lgb.LGBMRegressor(
        n_estimators=700,
        learning_rate=0.02,
        subsample=0.8,
        colsample_bytree=0.8
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mse = mean_squared_error(y_test, preds)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, preds)

    corr = np.corrcoef(y_test, preds)[0, 1] if len(y_test) > 2 else np.nan

    dir_acc = np.mean(np.sign(preds) == np.sign(y_test))
    signed_err = np.mean(preds - y_test)

    lgb_metrics["rmse"].append(rmse)
    lgb_metrics["mae"].append(mae)
    lgb_metrics["corr"].append(corr)
    lgb_metrics["dir_acc"].append(dir_acc)
    lgb_metrics["signed_error"].append(signed_err)

print("LightGBM Metrics:")
for k, v in lgb_metrics.items():
    print(f"{k}: {np.nanmean(v):.6f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001000 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19432
[LightGBM] [Info] Number of data points in the train set: 16190, number of used features: 79
[LightGBM] [Info] Start training from score 0.001447
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19432
[LightGBM] [Info] Number of data points in the train set: 16190, number of used features: 79
[LightGBM] [Info] Start training from score 0.001208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19432
[LightGBM] [Info] Number of data points in the train set: 16190, number of used features: 79
[LightGBM] [Info] Start 

In [9]:
import pandas as pd
import numpy as np

summary_df = pd.DataFrame({
    "Model": ["RandomForest", "XGBoost", "LightGBM"],
    "RMSE": [
        np.nanmean(rf_metrics["rmse"]),
        np.nanmean(xgb_metrics["rmse"]),
        np.nanmean(lgb_metrics["rmse"])
    ],
    "MAE": [
        np.nanmean(rf_metrics["mae"]),
        np.nanmean(xgb_metrics["mae"]),
        np.nanmean(lgb_metrics["mae"])
    ],
    "Correlation": [
        np.nanmean(rf_metrics["corr"]),
        np.nanmean(xgb_metrics["corr"]),
        np.nanmean(lgb_metrics["corr"])
    ],
    "Directional_Acc": [
        np.nanmean(rf_metrics["dir_acc"]),
        np.nanmean(xgb_metrics["dir_acc"]),
        np.nanmean(lgb_metrics["dir_acc"])
    ],
    "Signed_Error": [
        np.nanmean(rf_metrics["signed_error"]),
        np.nanmean(xgb_metrics["signed_error"]),
        np.nanmean(lgb_metrics["signed_error"])
    ]
})

summary_df


Unnamed: 0,Model,RMSE,MAE,Correlation,Directional_Acc,Signed_Error
0,RandomForest,0.022843,0.015209,0.474568,0.687826,0.000315
1,XGBoost,0.022706,0.015093,0.476626,0.684,0.000107
2,LightGBM,0.022449,0.014875,0.491498,0.691826,-6.7e-05


In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Train one LightGBM model on the full training window (not walk-forward)
# This is okay for feature-importance only
model_full = lgb.LGBMRegressor(
    n_estimators=700,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8
)

model_full.fit(X, y)

# Extract importances
gain_importance = model_full.booster_.feature_importance(importance_type='gain')
split_importance = model_full.booster_.feature_importance(importance_type='split')

feature_names = model_full.booster_.feature_name()

fi_df = pd.DataFrame({
    "feature": feature_names,
    "gain": gain_importance,
    "split": split_importance
}).sort_values("gain", ascending=False)

fi_df.head(20)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19432
[LightGBM] [Info] Number of data points in the train set: 23140, number of used features: 79
[LightGBM] [Info] Start training from score 0.001408


Unnamed: 0,feature,gain,split
6,lag0_return_1d,67.195173,906
0,ticker_code,12.345509,589
71,stat_return_zscore_20d,8.308432,538
74,stat_return_percentile_20d,5.62352,174
8,lag5_return_1d,3.229267,565
64,volume_ratio_to_sma,2.906633,475
7,lag1_return_1d,2.881649,601
68,volume_obv,2.82503,513
69,volume_obv_sma_20d,2.650381,470
5,volume,2.600474,448
