In [None]:
#########################################################################
#         Additional Modeling (Random Forest + Plot, LSTM)       #
#########################################################################

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# For the LSTM portion
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

def bonus_random_forest_plot(df, target_stock="AAPL", extra_features=False, out_dir="../results"):
    """
    Train a RandomForest regression of lagged OFI features on the returns of 'target_stock'
    and save a scatter-plot of predicted vs actual returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'log_ret_{target_stock}'. Every column whose name starts
        with 'OFI_PCA_' is used as a feature after being shifted by one row.
    target_stock : str
        Ticker used to build the target column name and the output file name.
    extra_features : bool
        If True, additionally use a one-row-lagged mid price. The column
        'mid_price_{target_stock}' is preferred; plain 'mid_price' is used as a fallback.
        If neither exists a notice is printed and only the OFI features are used.
    out_dir : str
        Directory for the PNG file; created if it does not exist.

    Returns
    -------
    None. The function prints the test R2 and feature importances and writes one PNG.
    It prints a message and returns early when there are no features or fewer than
    50 complete rows.
    """
    ret_col  = f"log_ret_{target_stock}"
    ofi_cols = [c for c in df.columns if c.startswith("OFI_PCA_")]

    # Shift each OFI column by one row so that row t only carries information from row t-1.
    X = pd.DataFrame({f"{c}_lag1": df[c].shift(1) for c in ofi_cols}, index=df.index)

    if extra_features:
        # Prefer the stock-specific mid price and fall back to a generic column.
        # Previously only "mid_price" was checked, so a missing column silently
        # turned extra_features=True into a no-op.
        price_candidates = (f"mid_price_{target_stock}", "mid_price")
        price_col = next((c for c in price_candidates if c in df.columns), None)
        if price_col is None:
            print(f"[RandomForest] extra_features=True but none of {list(price_candidates)} "
                  f"found in df; using OFI features only.")
        else:
            X[f"{price_col}_lag1"] = df[price_col].shift(1)

    if X.shape[1] == 0:
        # Nothing to regress on; fitting a forest on zero columns would only raise.
        print("[RandomForest] No feature columns found. Skipping.")
        return

    # Build target
    y = df[ret_col]

    # Drop rows with any NaN (at least the first row, because of the shift)
    tmp = pd.concat([X, y], axis=1).dropna()
    if len(tmp) < 50:
        print("[RandomForest] Not enough data after dropna. Skipping.")
        return

    X_rf = tmp.drop(columns=[ret_col])
    y_rf = tmp[ret_col]

    # Chronological 80/20 split (no shuffling, so the test set is the most recent 20% of rows)
    split_idx = int(len(tmp) * 0.8)
    X_train, X_test = X_rf.iloc[:split_idx], X_rf.iloc[split_idx:]
    y_train, y_test = y_rf.iloc[:split_idx], y_rf.iloc[split_idx:]

    # Fit a random forest (fixed seed for reproducible output)
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Predict & Evaluate
    y_pred = rf.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"[RandomForest] {target_stock}, extra_features={extra_features}, R2={r2:.4f}")

    print("Feature Importances:")
    for col, imp in zip(X_rf.columns, rf.feature_importances_):
        print(f"  {col}: {imp:.4f}")

    # Make sure the output directory exists before saving the figure.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Plot predicted vs actual
    plt.figure(figsize=(6, 5))
    plt.scatter(y_test, y_pred, alpha=0.3, label="RF Predictions")
    plt.axhline(0, color="black", lw=1)
    plt.axvline(0, color="black", lw=1)
    plt.xlabel("Actual Returns")
    plt.ylabel("Predicted Returns")
    plt.title(f"RF Pred vs Actual, {target_stock}, R2={r2:.4f}")
    plt.legend()
    out_png = os.path.join(out_dir, f"rf_scatter_{target_stock}_features={extra_features}.png")
    plt.savefig(out_png)
    plt.close()
    print(f"Saved RandomForest scatter plot to {out_png}")


#############################
# LSTM Example with SIGN-based accuracy
#############################

# 1) Define a custom sign-accuracy metric:
def sign_accuracy_metr(y_true, y_pred):
    """
    Returns the fraction of matching signs between y_true and y_pred.
    For regression, we treat sign(y) in { -1, 0, +1 } (0 is for exact zero),
    so a target that is exactly zero only counts as a match when the prediction
    is exactly zero as well.

    Both inputs are flattened before comparison. Without this, a y_true of shape
    (batch,) and a y_pred of shape (batch, 1) would be broadcast by tf.equal to a
    (batch, batch) matrix, comparing every target with every prediction.
    NOTE(review): whether Keras already aligns the ranks before calling the metric
    depends on the Keras version - flattening makes the result independent of that.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so tf.equal does not fail on e.g. float64 targets vs float32 predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    true_sign = tf.sign(tf.reshape(y_true, [-1]))  # -1.0, 0.0, +1.0
    pred_sign = tf.sign(tf.reshape(y_pred, [-1]))
    matches = tf.cast(tf.equal(true_sign, pred_sign), tf.float32)
    return tf.reduce_mean(matches)

def bonus_lstm_example(df, target_stock="AAPL", out_dir="../results"):
    """
    Minimal LSTM example: predict log_ret_{target_stock}(t) from a rolling window of
    one-row-lagged features ending at [OFI_PCA_x(t-1), mid price(t-1)].

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'log_ret_{target_stock}'. Every column starting with 'OFI_PCA_' is
        used as a feature (NaNs filled with 0). If 'mid_price_{target_stock}' or, failing
        that, 'mid_price' exists it is forward-filled and added as a feature too.
    target_stock : str
        Ticker used to build the target column name and the output file names.
    out_dir : str
        Directory for the PNG files; created if it does not exist.

    Returns
    -------
    None. Prints the test R2 and writes up to three PNGs (prediction scatter, loss curve,
    sign-accuracy curve). Prints a message and returns early when there are no feature
    columns or fewer than 50 windows.
    """
    ret_col  = f"log_ret_{target_stock}"
    ofi_cols = [c for c in df.columns if c.startswith("OFI_PCA_")]

    TIME_STEPS = 5

    # Build one-row-lagged features so row t only carries information from row t-1.
    lagged = pd.DataFrame(index=df.index)
    for c in ofi_cols:
        lagged[c + "_lag1"] = df[c].fillna(0).shift(1)

    # Prefer the stock-specific mid price, fall back to a generic "mid_price" column.
    price_col = next(
        (c for c in (f"mid_price_{target_stock}", "mid_price") if c in df.columns), None
    )
    if price_col is not None:
        # .ffill() replaces the deprecated fillna(method="ffill").
        lagged[price_col + "_lag1"] = df[price_col].ffill().fillna(0).shift(1)

    feature_cols = list(lagged.columns)
    if not feature_cols:
        print("[LSTM] No feature columns found. Skipping.")
        return

    # Drop NaNs (at least the first row, because of the shift)
    tmp = pd.concat([lagged, df[ret_col]], axis=1).dropna()

    # 2) Rolling window of length TIME_STEPS.
    # The window for the target at row i covers rows i-TIME_STEPS+1 .. i of the lagged
    # features, i.e. it ends with the t-1 observation. (Ending the window at row i-1
    # would lag the inputs twice and discard the most recent observation.)
    n_windows = len(tmp) - TIME_STEPS + 1
    if n_windows < 50:
        print("[LSTM] Not enough data. Skipping.")
        return

    feat_values = tmp[feature_cols].to_numpy()
    ret_values = tmp[ret_col].to_numpy()
    X_lstm = np.stack(
        [feat_values[end - TIME_STEPS:end] for end in range(TIME_STEPS, len(tmp) + 1)]
    )  # (samples, TIME_STEPS, n_feats)
    y_lstm = ret_values[TIME_STEPS - 1:]

    # 3) Chronological 80/20 train/test split
    split_idx = int(len(X_lstm) * 0.8)
    X_train, X_test = X_lstm[:split_idx], X_lstm[split_idx:]
    y_train, y_test = y_lstm[:split_idx], y_lstm[split_idx:]

    # 4) Scale: StandardScaler works on 2D input, so collapse (samples, time) into one
    # axis, fit on the training rows only, then restore the 3D shape.
    _, ts, n_feats = X_train.shape
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.reshape(-1, n_feats)).reshape(X_train.shape)
    X_test_scaled  = scaler.transform(X_test.reshape(-1, n_feats)).reshape(X_test.shape)

    # Give the targets the same rank as the model output, (n, 1), so element-wise
    # metrics cannot broadcast a (batch,) target against a (batch, 1) prediction.
    y_train_2d = y_train.reshape(-1, 1)
    y_test_2d  = y_test.reshape(-1, 1)

    # 5) Build LSTM
    tf.keras.backend.clear_session()
    model = Sequential([
        # Explicit Input object instead of input_shape= on the first layer.
        keras.Input(shape=(ts, n_feats)),
        LSTM(16, activation="tanh"),
        Dense(1),
    ])
    # Compile with MSE loss, plus a custom sign_accuracy_metr
    model.compile(optimizer="adam", loss="mse", metrics=[sign_accuracy_metr])

    # NOTE: the test split doubles as validation data here, so the val_* curves are
    # test-set curves; nothing is tuned on them in this function.
    history = model.fit(
        X_train_scaled, y_train_2d,
        validation_data=(X_test_scaled, y_test_2d),
        epochs=10, batch_size=32, verbose=1
    )

    # Evaluate
    y_pred = model.predict(X_test_scaled).flatten()
    r2 = r2_score(y_test, y_pred)
    print(f"[LSTM] {target_stock} R2={r2:.4f}")

    # Make sure the output directory exists before saving any figure.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # 6) Plot predicted vs. actual
    plt.figure(figsize=(7, 5))
    plt.scatter(y_test, y_pred, alpha=0.3, label="LSTM Predictions")
    plt.axhline(0, color="black", lw=1)
    plt.axvline(0, color="black", lw=1)
    plt.xlabel("Actual Returns")
    plt.ylabel("Predicted Returns")
    plt.title(f"LSTM Pred vs Actual, {target_stock}, R2={r2:.4f}")
    plt.legend()
    out_png = os.path.join(out_dir, f"lstm_scatter_{target_stock}.png")
    plt.savefig(out_png)
    plt.close()
    print(f"Saved LSTM scatter plot to {out_png}")

    # 7) Plot training curve for LOSS
    plt.figure()
    plt.plot(history.history["loss"], label="train_loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.title(f"LSTM Training Curve (Loss) {target_stock}")
    plt.legend()
    out_png2 = os.path.join(out_dir, f"lstm_training_loss_{target_stock}.png")
    plt.savefig(out_png2)
    plt.close()
    print(f"Saved LSTM training loss curve to {out_png2}")

    # 8) Plot sign-based ACCURACY
    # Keras stores a function metric under the function's name, plus a "val_" variant.
    train_acc_key = "sign_accuracy_metr"
    val_acc_key   = "val_sign_accuracy_metr"
    if train_acc_key in history.history and val_acc_key in history.history:
        plt.figure()
        plt.plot(history.history[train_acc_key], label="train_sign_acc")
        plt.plot(history.history[val_acc_key],   label="val_sign_acc")
        plt.title(f"LSTM Training Curve (Sign Accuracy) {target_stock}")
        plt.legend()
        out_png3 = os.path.join(out_dir, f"lstm_training_acc_{target_stock}.png")
        plt.savefig(out_png3)
        plt.close()
        print(f"Saved LSTM training sign-accuracy curve to {out_png3}")
    else:
        print("Sign accuracy keys not found in history; skipping accuracy plot.")


def run_bonus_models(df_cross, out_dir="../results", target_stock="AAPL"):
    """
    Run the RandomForest and LSTM bonus models for one stock.

    Parameters
    ----------
    df_cross : pandas.DataFrame
        Frame passed unchanged to bonus_random_forest_plot and bonus_lstm_example.
    out_dir : str
        Directory handed to both model functions for their PNG output.
    target_stock : str
        Ticker to model. Defaults to "AAPL", which was previously hard-coded.
    """
    # 1) Random Forest, first without and then with the extra features
    for use_extra in (False, True):
        bonus_random_forest_plot(
            df_cross, target_stock=target_stock, extra_features=use_extra, out_dir=out_dir
        )

    # 2) LSTM example
    bonus_lstm_example(df_cross, target_stock=target_stock, out_dir=out_dir)

# Run the bonus models. NOTE(review): `df_cross` and `RESULTS_PATH` are not defined in
# this cell - presumably they come from earlier notebook cells; confirm before running standalone.
run_bonus_models(df_cross, out_dir=RESULTS_PATH)

  and should_run_async(code)


[RandomForest] AAPL, extra_features=False, R2=-0.1635
Feature Importances:
  OFI_PCA_AAPL_lag1: 0.2634
  OFI_PCA_AMGN_lag1: 0.1471
  OFI_PCA_TSLA_lag1: 0.1950
  OFI_PCA_JPM_lag1: 0.1850
  OFI_PCA_XOM_lag1: 0.2095
Saved RandomForest scatter plot to ../results/rf_scatter_AAPL_features=False.png
[RandomForest] AAPL, extra_features=True, R2=-0.1635
Feature Importances:
  OFI_PCA_AAPL_lag1: 0.2634
  OFI_PCA_AMGN_lag1: 0.1471
  OFI_PCA_TSLA_lag1: 0.1950
  OFI_PCA_JPM_lag1: 0.1850
  OFI_PCA_XOM_lag1: 0.2095
Saved RandomForest scatter plot to ../results/rf_scatter_AAPL_features=True.png


  super().__init__(**kwargs)


Epoch 1/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 0.0225 - sign_accuracy_metr: 0.2787 - val_loss: 0.0073 - val_sign_accuracy_metr: 0.3850
Epoch 2/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0014 - sign_accuracy_metr: 0.2800 - val_loss: 0.0034 - val_sign_accuracy_metr: 0.3867
Epoch 3/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 7.8907e-04 - sign_accuracy_metr: 0.2753 - val_loss: 0.0023 - val_sign_accuracy_metr: 0.3825
Epoch 4/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 5.5621e-04 - sign_accuracy_metr: 0.2806 - val_loss: 0.0018 - val_sign_accuracy_metr: 0.3841
Epoch 5/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 4.1474e-04 - sign_accuracy_metr: 0.2875 - val_loss: 0.0015 - val_sign_accuracy_metr: 0.3802
Epoch 6/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5