In [62]:
#Parameters
# --- Imports ---
import yfinance as yf
import pandas as pd
import yfinance as yf
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Parameters ---
ticker = "SPY"
period = "max"
n = 14 # days ahead to predict
train_ratio = 0.8



In [63]:
#Functions
# --- Label Generator ---
def future_positive_return_labels(close_prices, n=1):
    """
    Generate binary labels for whether the price is higher n days later,
    but mark NaN where future data is unavailable.
    """
    # Price n days ahead
    future_close = close_prices.shift(-n)
    
    # % return from today to n days later
    future_return = (future_close - close_prices) / close_prices
    
    # 1 if positive return, 0 if not
    labels = (future_return > 0).astype("float")  # float so we can have NaN
    
    # Explicitly set last n rows to NaN (no future data)
    labels.iloc[-n:] = float("nan")
    
    return labels


def create_lagged_returns(return_series, lags):
    """
    Create lagged return features from a return series.
    
    Parameters:
    - return_series: pd.Series of returns (e.g., daily returns)
    - lags: list of integers representing lag days
    
    Returns:
    - pd.DataFrame with lagged return columns named 'return_lag_{lag}'
    """
    df = pd.DataFrame(index=return_series.index)
    for lag in lags:
        df[f'return_lag_{lag}'] = return_series.shift(lag)
    return df

def moving_average_regime(close, short_window=50, long_window=200):
    """
    Detect regime based on moving average crossover.
    
    Returns a binary Series: 1 = uptrend (short SMA > long SMA), 0 = downtrend.
    """
    sma_short = close.rolling(window=short_window).mean()
    sma_long = close.rolling(window=long_window).mean()
    regime = (sma_short > sma_long).astype(int)
    return regime

def rsi(close, window=14):
    """
    Compute the Relative Strength Index (RSI) for a given window.
    """
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def create_features_improved(open_, high, low, close, volume):
    df = pd.DataFrame(index=close.index)
    
    periods = range(1,50) # 1 for daily returns, plus others

    for period in periods:
        if period == 1:
            df['return'] = close.pct_change()
        else:
            df[f'return_{period}'] = close.pct_change(period)
        
    # Daily price ranges
    df['high_low_range'] = (high - low) / close
    df['close_open_diff'] = (close - open_) / open_
    
    
    # Volume related features
    df['volume'] = volume
    df['volume_change'] = volume.pct_change()
    df['volume_change_5'] = volume.pct_change(5)
    df['volume_ma_5'] = volume.rolling(window=5).mean()
    df['volume_ma_10'] = volume.rolling(window=10).mean()
    df['volume_ma_20'] = volume.rolling(window=20).mean()

    
    
    # Lagged returns
    lags = range(1, n)
    lagged_returns_df = create_lagged_returns(df['return'], lags)
    df = pd.concat([df, lagged_returns_df], axis=1)
    
    # Regime features
    df['trend_regime_5_21_crossover'] = moving_average_regime(close, short_window=5, long_window=21)
    df['trend_regime_21_50_crossover'] = moving_average_regime(close, short_window=21, long_window=50)
    df['trend_regime_50_200_crossover'] = moving_average_regime(close)
    
    # Time features
    df['day_of_week'] = close.index.dayofweek
    df['day_of_month'] = close.index.day
    df['week_of_year'] = close.index.isocalendar().week
    df['year'] = close.index.year
    df['quarter'] = close.index.quarter
    df['week_of_month'] = (close.index.day - 1) // 7 + 1
    df['month'] = close.index.month
    for window in range(1, 51):
        df[f'rsi_{window}'] = rsi(close, window=window)
        
    # Shift all features forward by n days so features at time t predict returns at t + n
    df = df.shift(n)
    
    df = df.dropna()
    return df

# Baseline features: lagged returns only
def create_features_baseline(close_prices):
    df = pd.DataFrame({'close': close_prices})
    for lag in range(1, 4):
        df[f'return_lag_{lag}'] = close_prices.pct_change(lag)
    df = df.dropna()
    return df

In [64]:
#Feature Engineering and Label Creation

data = yf.download(ticker, period=period, auto_adjust=True)

# Extract OHLCV
open_prices = data['Open']
close_prices = data['Close']
high = data['High']
low = data['Low']
volume = data['Volume']

# Business day reindex with forward fill
full_range = pd.date_range(start=close_prices.index.min(), end=close_prices.index.max(), freq='B')
open_prices_bdays = open_prices.reindex(full_range).ffill()
close_prices_bdays = close_prices.reindex(full_range).ffill()
high_bdays = high.reindex(full_range).ffill()
low_bdays = low.reindex(full_range).ffill()
volume_bdays = volume.reindex(full_range).ffill()

def ensure_series(x):
    if isinstance(x, pd.DataFrame):
        return x.iloc[:, 0]
    else:
        return x

close_prices_series = ensure_series(close_prices_bdays)
high_series = ensure_series(high_bdays)
low_series = ensure_series(low_bdays)
open_series = ensure_series(open_prices_bdays)
volume_series = ensure_series(volume_bdays)

# Create features
features_improved = create_features_improved(close_prices_series, high_series, low_series, open_series, volume_series)
features_baseline = create_features_baseline(close_prices_series)

# 1. Create labels (with NaNs for missing future data)
labels_n_days = future_positive_return_labels(close_prices_series, n=n)

# 2. Drop NaN labels upfront (removes last n rows with no future price)
labels_n_days = labels_n_days.dropna()
print(labels_n_days.value_counts(normalize=True))
# ---- FIX FOR IMPROVED FEATURES ----
# Instead of dropna(), slice from first valid index to keep valid rows after lag windows
first_valid_idx = features_improved.first_valid_index()
features_improved_cleaned = features_improved.loc[first_valid_idx:]

# Now align features and labels for improved features
common_index_improved = features_improved_cleaned.index.intersection(labels_n_days.index)
features_improved_aligned = features_improved_cleaned.loc[common_index_improved]
labels_improved_aligned = labels_n_days.loc[common_index_improved]
#make sure its a pandas series


# For baseline features, just drop NaNs as usual (likely few NaNs)
features_baseline_clean = features_baseline.dropna()
common_index_baseline = features_baseline_clean.index.intersection(labels_n_days.index)
features_baseline_aligned = features_baseline_clean.loc[common_index_baseline]
labels_baseline_aligned = labels_n_days.loc[common_index_baseline]

[*********************100%***********************]  1 of 1 completed


SPY
1.0    0.634325
0.0    0.365675
Name: proportion, dtype: float64


In [65]:
#Split data
split_idx_improved = int(len(labels_improved_aligned) * train_ratio)
X_train_improved = features_improved_aligned.iloc[:split_idx_improved]
X_test_improved = features_improved_aligned.iloc[split_idx_improved:]
y_train_improved = labels_improved_aligned.iloc[:split_idx_improved]
y_test_improved = labels_improved_aligned.iloc[split_idx_improved:]

split_idx_baseline = int(len(labels_baseline_aligned) * train_ratio)
X_train_baseline = features_baseline_aligned.iloc[:split_idx_baseline]
X_test_baseline = features_baseline_aligned.iloc[split_idx_baseline:]
y_train_baseline = labels_baseline_aligned.iloc[:split_idx_baseline]
y_test_baseline = labels_baseline_aligned.iloc[split_idx_baseline:]

print(f"Training samples improved: {len(X_train_improved)}")
print(f"Training samples baseline: {len(X_train_baseline)}")

print(f"NaNs in improved features after slicing from first valid index: {features_improved_aligned.isna().sum().sum()}")
print(f"Shape of improved features after slicing and alignment: {features_improved_aligned.shape}")

print(f"Length of labels_n_days after dropna: {len(labels_n_days)}")
print(f"Length of features_improved before cleaning: {len(features_improved)}")
print(f"Length of features_improved after cleaning: {len(features_improved_cleaned)}")
print(f"Length of features_improved after alignment: {len(features_improved_aligned)}")

print(f"First 5 indices of labels: {labels_n_days.index[:5]}")
print(f"First 5 indices of features_improved: {features_improved.index[:5]}")
print(f"First 5 indices of features_improved_aligned: {features_improved_aligned.index[:5]}")

display(features_improved.head(10))
display(features_improved.tail(10))  

Training samples improved: 6489
Training samples baseline: 6775
NaNs in improved features after slicing from first valid index: 0
Shape of improved features after slicing and alignment: (8112, 130)
Length of labels_n_days after dropna: 8472
Length of features_improved before cleaning: 8125
Length of features_improved after cleaning: 8125
Length of features_improved after alignment: 8112
First 5 indices of labels: DatetimeIndex(['1993-01-29', '1993-02-01', '1993-02-02', '1993-02-03',
               '1993-02-04'],
              dtype='datetime64[ns]', freq='B')
First 5 indices of features_improved: DatetimeIndex(['1993-04-28', '1993-04-30', '1993-05-03', '1993-05-04',
               '1993-05-05'],
              dtype='datetime64[ns]', freq=None)
First 5 indices of features_improved_aligned: DatetimeIndex(['1993-04-28', '1993-04-30', '1993-05-03', '1993-05-04',
               '1993-05-05'],
              dtype='datetime64[ns]', freq=None)


Unnamed: 0,return,return_2,return_3,return_4,return_5,return_6,return_7,return_8,return_9,return_10,...,rsi_41,rsi_42,rsi_43,rsi_44,rsi_45,rsi_46,rsi_47,rsi_48,rsi_49,rsi_50
1993-04-28,0.006357,0.002815,0.002111,-0.002798,-0.015883,-0.017918,-0.01384,-0.009039,-0.013157,-0.008351,...,50.382237,49.686952,49.010738,49.010738,49.010743,51.392388,52.137623,53.096385,53.096385,53.096385
1993-04-30,0.00421,0.00421,0.010593,0.007037,0.006329,0.0014,-0.01174,-0.013784,-0.009688,-0.004867,...,50.24671,50.662082,51.205246,50.510166,49.833845,49.833845,49.83385,52.140963,52.863591,53.793772
1993-05-03,0.003494,0.007718,0.007718,0.014124,0.010555,0.009845,0.004898,-0.008287,-0.010338,-0.006228,...,50.938231,50.938231,51.342184,51.870557,51.175918,50.499779,50.499779,50.499783,52.747423,53.451988
1993-05-04,0.003482,0.006988,0.011228,0.011228,0.017656,0.014075,0.013362,0.008398,-0.004834,-0.006892,...,53.509325,51.610927,51.610927,52.003922,52.518098,51.824124,51.148392,51.148392,51.148397,53.338823
1993-05-05,-0.002776,0.000697,0.004193,0.008421,0.008421,0.014831,0.01126,0.010549,0.005599,-0.007596,...,58.109945,52.907708,51.051017,51.051017,51.444282,51.958914,51.279544,50.617849,50.617849,50.617853
1993-05-06,0.001392,-0.001388,0.00209,0.005591,0.009824,0.009824,0.016243,0.012667,0.011955,0.006998,...,56.368309,58.367002,53.170963,51.3151,51.3151,51.70415,52.213317,51.534189,50.872639,50.872639
1993-05-07,-0.000695,0.000696,-0.002082,0.001393,0.004892,0.009122,0.009122,0.015537,0.011963,0.011252,...,59.409557,56.187664,58.188495,53.022783,51.177071,51.177071,51.566182,52.075456,51.399886,50.741758
1993-05-10,-0.005564,-0.006255,-0.004872,-0.007634,-0.004178,-0.000699,0.003508,0.003508,0.009887,0.006333,...,56.557252,57.84124,54.782829,56.798488,51.866166,50.098756,50.098756,50.488147,50.997985,50.349908
1993-05-11,-0.001399,-0.006955,-0.007645,-0.006264,-0.009022,-0.005571,-0.002097,0.002104,0.002104,0.008474,...,55.426103,56.175135,57.461923,54.442447,56.46122,51.584785,49.836177,49.836177,50.225581,50.735485
1993-05-12,-0.007002,-0.008392,-0.013909,-0.014594,-0.013222,-0.015962,-0.012535,-0.009085,-0.004913,-0.004913,...,54.303483,53.585633,54.340114,55.638233,52.802647,54.833801,50.22295,48.563965,48.563965,48.953129


Unnamed: 0,return,return_2,return_3,return_4,return_5,return_6,return_7,return_8,return_9,return_10,...,rsi_41,rsi_42,rsi_43,rsi_44,rsi_45,rsi_46,rsi_47,rsi_48,rsi_49,rsi_50
2025-07-28,-0.003224,-0.001767,-0.001767,0.006659,0.008096,0.00643,0.01382,0.020296,0.022109,0.028163,...,66.631529,70.194039,70.454957,71.47983,71.90489,69.7327,68.765665,69.616179,71.865333,71.312726
2025-07-29,0.002285,-0.000946,0.000514,0.000514,0.008959,0.0104,0.00873,0.016137,0.022628,0.024444,...,66.493738,67.006214,70.493349,70.749075,71.753992,72.170978,70.010782,69.048688,69.884064,72.095176
2025-07-30,0.002296,0.004587,0.001348,0.002811,0.002811,0.011276,0.01272,0.011047,0.01847,0.024976,...,65.678757,66.874123,67.375116,70.788742,71.039395,72.024799,72.433886,70.285695,69.328557,70.149095
2025-07-31,-0.002339,-4.8e-05,0.002237,-0.000995,0.000466,0.000466,0.008911,0.010351,0.008682,0.016088,...,66.104707,64.899359,66.107863,66.61466,70.07251,70.326733,71.326598,71.74189,69.633951,68.694362
2025-08-01,0.000674,-0.001666,0.000626,0.002913,-0.000321,0.001141,0.001141,0.009591,0.011032,0.009362,...,64.563565,66.222151,65.01877,66.219205,66.722702,70.15936,70.412116,71.406332,71.819333,69.714733
2025-08-04,0.006997,0.007676,0.005319,0.007627,0.00993,0.006674,0.008145,0.008145,0.016655,0.018106,...,67.623604,65.852203,67.39498,66.212064,67.333309,67.804374,71.032083,71.270329,72.20862,72.598929
2025-08-05,-0.006024,0.000931,0.001606,-0.000737,0.001558,0.003846,0.00061,0.002072,0.002072,0.010531,...,64.017698,65.502931,63.839524,65.42549,64.310128,65.461566,65.945971,69.275566,69.522061,70.493811
2025-08-06,0.001058,-0.004972,0.00199,0.002666,0.00032,0.002617,0.004909,0.001668,0.003133,0.003133,...,66.890462,64.222046,65.690801,64.031478,65.601016,64.488244,65.628401,66.108171,69.40766,69.652048
2025-08-07,0.007847,0.008914,0.002837,0.009853,0.010534,0.00817,0.010485,0.012795,0.009529,0.011005,...,71.8373,68.28322,65.669441,67.024013,65.395203,66.850389,65.756935,66.818299,67.265642,70.35388
2025-08-08,-0.000842,0.006999,0.008064,0.001992,0.009003,0.009683,0.007321,0.009634,0.011942,0.008679,...,76.047838,71.495091,67.973961,65.383356,66.743499,65.12813,66.588809,65.503826,66.569047,67.018091


In [66]:
#Feature filtering
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance

# --- Check training data non-empty ---
if len(X_train_improved) == 0 or len(y_train_improved) == 0:
    raise ValueError("Training data for improved features is empty! Please check your feature creation or data alignment.")

if len(X_train_baseline) == 0 or len(y_train_baseline) == 0:
    raise ValueError("Training data for baseline features is empty! Please check your feature creation or data alignment.")

# --- Train models ---
model_improved = RandomForestClassifier(n_estimators=100, random_state=42,class_weight='balanced')
model_improved.fit(X_train_improved, y_train_improved)

model_baseline = RandomForestClassifier(n_estimators=100, random_state=42,class_weight='balanced')
model_baseline.fit(X_train_baseline, y_train_baseline)

# --- Predict on test sets ---
y_pred_improved = model_improved.predict(X_test_improved)
y_pred_baseline = model_baseline.predict(X_test_baseline)

# (Optional) Evaluate accuracy
print("Test accuracy with improved features:", accuracy_score(y_test_improved, y_pred_improved))
print("Test accuracy with baseline features:", accuracy_score(y_test_baseline, y_pred_baseline))

# --- Permutation importance for improved features ---
result_improved = permutation_importance(model_improved, X_test_improved, y_test_improved, n_repeats=10, random_state=42)
perm_importance_improved_df = pd.DataFrame({
    'feature': X_test_improved.columns,
    'importance_mean': result_improved.importances_mean,
    'importance_std': result_improved.importances_std
}).sort_values(by='importance_mean', ascending=False)

# --- Step 1: Tree-based feature importance ---
rf_importances = pd.Series(model_improved.feature_importances_, index=X_train_improved.columns)
tree_selected_features = rf_importances[rf_importances > 0.01].index.tolist()
print(f"Tree-based selected features: {len(tree_selected_features)}")

# --- Step 2: Permutation importance ---
perm_selected_features = perm_importance_improved_df[
    perm_importance_improved_df['importance_mean'] > 0
]['feature'].tolist()
print(f"Permutation importance selected features: {len(perm_selected_features)}")

# --- Step 3: Univariate feature selection ---
selector_uni = SelectKBest(mutual_info_classif, k=30)  # adjust k as needed
selector_uni.fit(X_train_improved.fillna(0), y_train_improved)  # Impute missing values
uni_selected_features = X_train_improved.columns[selector_uni.get_support()].tolist()
print(f"Univariate selected features: {len(uni_selected_features)}")

# --- Step 4: Combine all selected features ---
combined_features = list(set(tree_selected_features) | set(perm_selected_features) | set(uni_selected_features))
print(f"Combined features before correlation filtering: {len(combined_features)}")

# --- Step 5: Correlation filtering ---
def correlation_filter(df, features, threshold=0.9):
    selected = []
    corr_matrix = df[features].corr().abs()
    to_drop = set()
    for i in range(len(corr_matrix.columns)):
        feature = corr_matrix.columns[i]
        if feature in to_drop:
            continue
        selected.append(feature)
        correlated = corr_matrix.columns[(corr_matrix.iloc[i] > threshold)].tolist()
        if feature in correlated:
            correlated.remove(feature)
        to_drop.update(correlated)
    return selected

filtered_features = correlation_filter(X_train_improved, combined_features, threshold=0.9)
print(f"Features after correlation filtering: {len(filtered_features)}")

# --- Optional Step 6: RFE refinement ---
from sklearn.feature_selection import RFE
estimator = RandomForestClassifier(n_estimators=100, random_state=42)
selector = RFE(estimator, n_features_to_select=20, step=1)
selector.fit(X_train_improved[filtered_features].fillna(0), y_train_improved)
final_features = list(X_train_improved[filtered_features].columns[selector.support_])
print(f"Features after RFE selection: {len(final_features)}")

# --- Final feature set ---
final_features = filtered_features  # or replace with RFE features if used
print("Final selected features:", final_features)


Test accuracy with improved features: 0.678373382624769
Test accuracy with baseline features: 0.34297520661157027
Tree-based selected features: 8
Permutation importance selected features: 100
Univariate selected features: 30
Combined features before correlation filtering: 108
Features after correlation filtering: 46
Features after RFE selection: 20
Final selected features: ['volume', 'return_31', 'year', 'rsi_17', 'return_8', 'return_15', 'rsi_8', 'return_3', 'rsi_44', 'month', 'return_18', 'return_lag_8', 'return_lag_9', 'return_25', 'return_49', 'rsi_2', 'rsi_21', 'rsi_5', 'volume_change_5', 'rsi_27', 'rsi_14', 'high_low_range', 'rsi_3', 'return_41', 'week_of_month', 'return_lag_10', 'trend_regime_5_21_crossover', 'rsi_35', 'return_lag_4', 'trend_regime_50_200_crossover', 'return', 'return_11', 'return_6', 'return_lag_6', 'rsi_1', 'return_4', 'rsi_6', 'return_lag_3', 'rsi_10', 'volume_change', 'return_lag_12', 'day_of_week', 'return_lag_13', 'rsi_4', 'return_2', 'return_lag_2']


In [67]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Prepare training and test data with final features
X_train_final = X_train_improved[final_features].fillna(0)
X_test_final = X_test_improved[final_features].fillna(0)

# Initialize and train the model
model_final = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)
model_final.fit(X_train_final, y_train_improved)

# Predict probabilities instead of class labels
y_proba_final = model_final.predict_proba(X_test_final)[:, 1]

# Sweep thresholds
thresholds = np.arange(0.5, 0.91, 0.05)

# Initialize lists for plotting
accuracies = []
precisions = []
recalls = []
f1_scores = []

best_f1 = -1
best_f1_threshold = None
best_f1_predictions = None

best_prec = -1
best_prec_threshold = None
best_prec_predictions = None

print("Threshold | Accuracy | Precision | Recall | F1 Score")
print("----------------------------------------------------")

for threshold in thresholds:
    y_pred = (y_proba_final >= threshold).astype(int)
    acc = accuracy_score(y_test_improved, y_pred)
    prec = precision_score(y_test_improved, y_pred, zero_division=0)
    rec = recall_score(y_test_improved, y_pred, zero_division=0)
    f1 = f1_score(y_test_improved, y_pred, zero_division=0)

    # Append to lists for plotting
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)

    print(f"{threshold:.2f}     | {acc:.4f}   | {prec:.4f}   | {rec:.4f}   | {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_f1_threshold = threshold
        best_f1_predictions = y_pred.copy()

    if prec > best_prec:
        best_prec = prec
        best_prec_threshold = threshold
        best_prec_predictions = y_pred.copy()

print(f"\nBest F1 threshold: {best_f1_threshold:.2f} with F1 {best_f1:.4f}")
print(f"Best Precision threshold: {best_prec_threshold:.2f} with Precision {best_prec:.4f}")

# Plot the threshold, accuracy, precision, recall and f1 score
figure = go.Figure()

# Add traces for each metric
figure.add_trace(go.Scatter(x=thresholds, y=accuracies, mode='lines+markers', name='Accuracy'))
figure.add_trace(go.Scatter(x=thresholds, y=precisions, mode='lines+markers', name='Precision'))
figure.add_trace(go.Scatter(x=thresholds, y=recalls, mode='lines+markers', name='Recall'))
figure.add_trace(go.Scatter(x=thresholds, y=f1_scores, mode='lines+markers', name='F1 Score'))

# Update layout
figure.update_layout(
    title='Model Performance Metrics vs. Classification Threshold',
    xaxis_title='Classification Threshold',
    yaxis_title='Score',
    legend_title='Metrics',
    template='plotly_white'
)

figure.show()


Threshold | Accuracy | Precision | Recall | F1 Score
----------------------------------------------------
0.50     | 0.6765   | 0.6862   | 0.9749   | 0.8055
0.55     | 0.6673   | 0.6939   | 0.9229   | 0.7921
0.60     | 0.5983   | 0.6984   | 0.7309   | 0.7143
0.65     | 0.5034   | 0.7102   | 0.4682   | 0.5643
0.70     | 0.3900   | 0.7049   | 0.1928   | 0.3028
0.75     | 0.3426   | 0.7927   | 0.0583   | 0.1086
0.80     | 0.3167   | 0.7500   | 0.0081   | 0.0160
0.85     | 0.3130   | 0.0000   | 0.0000   | 0.0000
0.90     | 0.3130   | 0.0000   | 0.0000   | 0.0000

Best F1 threshold: 0.50 with F1 0.8055
Best Precision threshold: 0.75 with Precision 0.7927


In [68]:

def plot_actual_vs_predicted(dates, actual, predicted, title):
    plot_df = pd.DataFrame({
        'Date': dates,
        'Actual': actual,
        'Predicted': predicted
    })

    fig = go.Figure()

    # Actual bars
    fig.add_trace(go.Bar(
        x=plot_df['Date'],
        y=plot_df['Actual'],
        name='Actual',
        marker_color='blue',
        opacity=0.7,
        hovertemplate='Date: %{x}<br>Actual: %{y}<extra></extra>'
    ))

    # Predicted dots
    fig.add_trace(go.Scatter(
        x=plot_df['Date'],
        y=plot_df['Predicted'],
        mode='markers',
        name='Predicted',
        marker=dict(color='red', size=8, symbol='circle'),
        hovertemplate='Date: %{x}<br>Predicted: %{y}<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title='Label',
        yaxis=dict(tickvals=[0, 1], ticktext=['0 (Negative)', '1 (Positive)']),
        height=500,
        legend=dict(y=0.99, x=0.01)
    )
    fig.show()

# Plot for best F1 threshold
plot_actual_vs_predicted(
    y_test_improved.index,
    y_test_improved,
    best_f1_predictions,
    f"Actual vs Predicted Labels (Best F1 = {best_f1:.4f} at threshold {best_f1_threshold:.2f})"
)

# Plot for best Precision threshold
plot_actual_vs_predicted(
    y_test_improved.index,
    y_test_improved,
    best_prec_predictions,
    f"Actual vs Predicted Labels (Best Precision = {best_prec:.4f} at threshold {best_prec_threshold:.2f})"
)
