In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle
import joblib

# Optional: import XGBoost and LightGBM if installed
try:
    from xgboost import XGBRegressor
    xgb_installed = True
except ImportError:
    xgb_installed = False

try:
    from lightgbm import LGBMRegressor
    lgbm_installed = True
except ImportError:
    lgbm_installed = False

In [5]:
# Load master features and importances
df = pd.read_csv("../../sp500_master_features.csv")
# Feature importance order (excluding ticker_encoded and volatility_10d)
feature_importance_order = [
    'return_21d',
    'return_5d',
    'return_1d',
    'rsi_14',
    'volume_avg_21d',
    'macd_hist',
    'macd_signal',
    'volume_avg_10d',
    'momentum_10d',
    'macd',
    'ma_50',
    'momentum_21d',
    'ma_20',
    'ma_5'
]

target = 'target_volatility_10d'
ticker_col = 'ticker'

In [6]:
# Experiment settings
N_values = [5, 10, 13]  # Try top 5, 10, all features
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "LinearRegression": LinearRegression()
}
if xgb_installed:
    models["XGBoost"] = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbosity=0)
if lgbm_installed:
    models["LightGBM"] = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbose=-1)

results = []

In [7]:
# Set up for experiments
N_values = [5, 10, 13]  # Try top 5, 10, all features

# Prepare encoders (run this once)
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [8]:
results_rf = []
for N in N_values:
    top_features = feature_importance_order[:N]
    df_exp = df.dropna(subset=top_features + [target, ticker_col])
    df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])
    X = pd.concat([df_exp[top_features], df_exp[['ticker_encoded']]], axis=1)
    y = df_exp[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results_rf.append({"N_features": N, "model": "RandomForest", "RMSE": rmse, "R2": r2})
    print(f"RandomForest N={N}: RMSE={rmse:.4f}, R2={r2:.4f}")
results_rf = pd.DataFrame(results_rf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


RandomForest N=5: RMSE=0.0093, R2=0.3303


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


RandomForest N=10: RMSE=0.0089, R2=0.3835


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


RandomForest N=13: RMSE=0.0086, R2=0.4169


In [9]:
results_lr = []
for N in N_values:
    top_features = feature_importance_order[:N]
    df_exp = df.dropna(subset=top_features + [target, ticker_col])
    df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])
    X = pd.concat([df_exp[top_features], df_exp[['ticker_encoded']]], axis=1)
    y = df_exp[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results_lr.append({"N_features": N, "model": "LinearRegression", "RMSE": rmse, "R2": r2})
    print(f"LinearRegression N={N}: RMSE={rmse:.4f}, R2={r2:.4f}")
results_lr = pd.DataFrame(results_lr)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


LinearRegression N=5: RMSE=0.0111, R2=0.0396


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


LinearRegression N=10: RMSE=0.0111, R2=0.0435


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


LinearRegression N=13: RMSE=0.0110, R2=0.0439


In [10]:
results_xgb = []
if xgb_installed:
    for N in N_values:
        top_features = feature_importance_order[:N]
        df_exp = df.dropna(subset=top_features + [target, ticker_col])
        df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])
        X = pd.concat([df_exp[top_features], df_exp[['ticker_encoded']]], axis=1)
        y = df_exp[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbosity=0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)  # RMSE
        r2 = r2_score(y_test, y_pred)
        results_xgb.append({"N_features": N, "model": "XGBoost", "RMSE": rmse, "R2": r2})
        print(f"XGBoost N={N}: RMSE={rmse:.4f}, R2={r2:.4f}")
    results_xgb = pd.DataFrame(results_xgb)
else:
    print("XGBoost is not installed.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


XGBoost N=5: RMSE=0.0090, R2=0.3717


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


XGBoost N=10: RMSE=0.0088, R2=0.3914


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


XGBoost N=13: RMSE=0.0086, R2=0.4153


In [11]:
results_lgbm = []
if lgbm_installed:
    for N in N_values:
        top_features = feature_importance_order[:N]
        df_exp = df.dropna(subset=top_features + [target, ticker_col])
        df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])
        X = pd.concat([df_exp[top_features], df_exp[['ticker_encoded']]], axis=1)
        y = df_exp[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbose=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
        r2 = r2_score(y_test, y_pred)
        results_lgbm.append({"N_features": N, "model": "LightGBM", "RMSE": rmse, "R2": r2})
        print(f"LightGBM N={N}: RMSE={rmse:.4f}, R2={r2:.4f}")
    results_lgbm = pd.DataFrame(results_lgbm)
else:
    print("LightGBM is not installed.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


LightGBM N=5: RMSE=0.0091, R2=0.3477


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


LightGBM N=10: RMSE=0.0090, R2=0.3609


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['ticker_encoded'] = le.fit_transform(df_exp[ticker_col])


LightGBM N=13: RMSE=0.0088, R2=0.3775


In [12]:
# Combine all results into one DataFrame
all_results = pd.concat(
    [
        results_rf,
        results_lr,
        results_xgb if 'results_xgb' in locals() else pd.DataFrame(),
        results_lgbm if 'results_lgbm' in locals() else pd.DataFrame()
    ],
    ignore_index=True
)

all_results.to_csv("model_experiment_results.csv", index=False)
print("\nSummary of all experiments:")
print(all_results)


Summary of all experiments:
    N_features             model      RMSE        R2
0            5      RandomForest  0.009262  0.330291
1           10      RandomForest  0.008886  0.383528
2           13      RandomForest  0.008558  0.416936
3            5  LinearRegression  0.011091  0.039634
4           10  LinearRegression  0.011068  0.043494
5           13  LinearRegression  0.010959  0.043874
6            5           XGBoost  0.008971  0.371725
7           10           XGBoost  0.008829  0.391450
8           13           XGBoost  0.008570  0.415262
9            5          LightGBM  0.009141  0.347669
10          10          LightGBM  0.009047  0.360916
11          13          LightGBM  0.008842  0.377512


In [13]:
# Save the best model
best_row = all_results.sort_values(by="RMSE").iloc[0]
best_N = best_row["N_features"]
best_model_name = best_row["model"]

print(f"\nBest model: {best_model_name} with top {best_N} features (RMSE={best_row['RMSE']:.6f}, R2={best_row['R2']:.4f})")

# Retrain best model on full data
top_features = feature_importance_order[:int(best_N)]
df_best = df.dropna(subset=top_features + [target, ticker_col]).copy()
le = LabelEncoder()
df_best['ticker_encoded'] = le.fit_transform(df_best[ticker_col])
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ticker_ohe = ohe.fit_transform(df_best[[ticker_col]])
ticker_ohe_df = pd.DataFrame(ticker_ohe, columns=[f"ticker_{cat}" for cat in ohe.categories_[0]], index=df_best.index)

if best_model_name == "LinearRegression":
    X_best = pd.concat([df_best[top_features], ticker_ohe_df], axis=1)
    best_model = LinearRegression()
else:
    X_best = pd.concat([df_best[top_features], df_best[['ticker_encoded']]], axis=1)
    if best_model_name == "RandomForest":
        best_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    elif best_model_name == "XGBoost" and xgb_installed:
        best_model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbosity=0)
    elif best_model_name == "LightGBM" and lgbm_installed:
        best_model = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbose=-1)
    else:
        raise ValueError("Unknown model type for retraining.")

best_model.fit(X_best, df_best[target])




Best model: RandomForest with top 13 features (RMSE=0.008558, R2=0.4169)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Save the model using joblib (more efficient for large models)
joblib.dump(best_model, "best_model.pkl")
print("Best model saved as best_model.pkl")

Best model saved as best_model.pkl


In [15]:
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [None]:
import joblib
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load model and encoder
model = joblib.load("best_model.pkl")
le = joblib.load("label_encoder.pkl")

# Load your data (adjust path if needed)
df = pd.read_csv("../../sp500_master_features.csv")

# Use the same features as training
top_features = feature_importance_order[:int(best_N)]  # best_N from your previous cell
df_eval = df.dropna(subset=top_features + [target, ticker_col]).copy()
df_eval['ticker_encoded'] = le.transform(df_eval[ticker_col])

# Prepare test set (20% split)
from sklearn.model_selection import train_test_split
X = pd.concat([df_eval[top_features], df_eval[['ticker_encoded']]], axis=1)
y = df_eval[target]
_, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict and evaluate
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²: {r2:.4f}")

MemoryError: could not allocate 132071616 bytes