In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def fetch_data(tickers, start='2009-05-01', end='2025-01-01'):
    data_dict = {}
    for ticker in tickers:
        print(f"Fetching {ticker}...")
        df = yf.download(ticker, start=start, end=end)
        df.dropna(inplace=True)
        df['SMA50'] = df['Close'].rolling(window=50).mean()
        df['SMA200'] = df['Close'].rolling(window=200).mean()
        data_dict[ticker] = df
    return data_dict

In [None]:
data_dict = fetch_data(['AAPL', 'JPM', 'XOM'])

Fetching AAPL...


  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)


Fetching JPM...


[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start, end=end)


Fetching XOM...


[*********************100%***********************]  1 of 1 completed


In [None]:
def add_indicators(df, ticker):
  # RSI
  delta = df[f'Close_{ticker}'].diff()
  gain = delta.clip(lower=0)
  loss = -delta.clip(upper=0)
  avg_gain = gain.rolling(window=14).mean()
  avg_loss = loss.rolling(window=14).mean()
  rs = avg_gain / avg_loss
  df['RSI'] = 100 - (100 / (1 + rs))

  # Bollinger Bands
  rolling_mean = df[f'Close_{ticker}'].rolling(window=20).mean()
  rolling_std = df[f'Close_{ticker}'].rolling(window=20).std()
  df['Bollinger_Upper'] = rolling_mean + (rolling_std * 2)
  df['Bollinger_Lower'] = rolling_mean - (rolling_std * 2)

  return df

In [None]:
# Flattening the column names

for ticker, df in data_dict.items():
    df.columns = [col[0] if col[1] == '' else f"{col[0]}_{col[1]}" for col in df.columns]
    data_dict[ticker] = df

In [None]:
data_dict['XOM']

Unnamed: 0_level_0,Close_XOM,High_XOM,Low_XOM,Open_XOM,Volume_XOM,SMA50,SMA200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-05-01,37.262051,37.289443,36.204621,36.779904,27385000,,
2009-05-04,37.366161,37.755168,37.075780,37.530530,27306100,,
2009-05-05,37.064804,37.431890,36.796339,37.300397,19910300,,
2009-05-06,37.574360,37.585316,36.949765,37.278498,30814800,,
2009-05-07,37.766117,37.826385,37.048381,37.804469,32541600,,
...,...,...,...,...,...,...,...
2024-12-24,104.494308,105.270160,103.806841,104.612154,7807000,114.108599,112.693227
2024-12-26,104.582695,105.113024,104.042550,104.612156,9652400,113.855630,112.697006
2024-12-27,104.572876,106.055825,103.875586,104.396100,11943900,113.596425,112.694937
2024-12-30,103.865776,104.651443,103.620254,104.396105,11080800,113.329117,112.680034


In [None]:
for ticker, df in data_dict.items():
  data_dict[ticker] = add_indicators(df, ticker)

In [None]:
# Clipping the first 200 rows to get a non null dataset in moving averages
clip_rows = 200

for ticker, df in data_dict.items():
    # Drop the first 200 rows by slicing from row index 200 till end
    clipped_df = df.iloc[clip_rows:].copy()

    # Reset index if needed (optional, but often helpful)
    clipped_df.reset_index(drop=True, inplace=True)

    # Update back to dictionary
    data_dict[ticker] = clipped_df

In [None]:
# Adding more features, that weren't present in the rule-based engine

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


def add_technical_features(df, ticker):
    """
    REPLACES: Your original add_technical_features() function
    LOCATION: Use this instead of your current technical features function
    """
    close = df[f'Close_{ticker}']
    high = df[f'High_{ticker}']
    low = df[f'Low_{ticker}']
    volume = df[f'Volume_{ticker}']

    # MACD and Signal Line - These are fine as they use past data
    exp12 = close.ewm(span=12, adjust=False).mean()
    exp26 = close.ewm(span=26, adjust=False).mean()
    df['MACD'] = exp12 - exp26
    df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Average True Range (ATR) - Fixed to use previous close properly
    df['H-L'] = high - low
    df['H-PC'] = abs(high - close.shift(1))
    df['L-PC'] = abs(low - close.shift(1))
    df['TR'] = df[['H-L', 'H-PC', 'L-PC']].max(axis=1)
    df['ATR'] = df['TR'].rolling(window=14).mean()

    # On-Balance Volume (OBV) - Vectorized and more efficient
    df['Price_Change'] = np.where(close > close.shift(1), 1,
                                 np.where(close < close.shift(1), -1, 0))
    df['OBV'] = (df['Price_Change'] * volume).cumsum()

    # Daily returns - Use previous day's return as feature
    df['Daily_Return'] = close.pct_change()
    df['Prev_Daily_Return'] = df['Daily_Return'].shift(1)

    # Rolling volatility (20-day std) - This is correct
    df['Rolling_Volatility'] = df['Daily_Return'].rolling(window=20).std()

    # Momentum (10-day price diff) - This is correct
    df['Momentum_10'] = close - close.shift(10)

    # Add some additional useful features
    # RSI (Relative Strength Index)
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # Bollinger Bands
    df['BB_Middle'] = close.rolling(window=20).mean()
    bb_std = close.rolling(window=20).std()
    df['BB_Upper'] = df['BB_Middle'] + (bb_std * 2)
    df['BB_Lower'] = df['BB_Middle'] - (bb_std * 2)
    df['BB_Position'] = (close - df['BB_Lower']) / (df['BB_Upper'] - df['BB_Lower'])

    # Drop intermediate columns
    df.drop(columns=['H-L', 'H-PC', 'L-PC', 'TR', 'Price_Change'], inplace=True)

    return df


In [None]:
for ticker, df in data_dict.items():
    data_dict[ticker] = add_technical_features(df, ticker)

In [None]:
for ticker in data_dict:
  print(data_dict[ticker].isna().sum())

Close_AAPL             0
High_AAPL              0
Low_AAPL               0
Open_AAPL              0
Volume_AAPL            0
SMA50                  0
SMA200                 0
RSI                   13
Bollinger_Upper        0
Bollinger_Lower        0
MACD                   0
MACD_Signal            0
ATR                   13
OBV                    0
Daily_Return           1
Prev_Daily_Return      2
Rolling_Volatility    20
Momentum_10           10
BB_Middle             19
BB_Upper              19
BB_Lower              19
BB_Position           19
dtype: int64
Close_JPM              0
High_JPM               0
Low_JPM                0
Open_JPM               0
Volume_JPM             0
SMA50                  0
SMA200                 0
RSI                   13
Bollinger_Upper        0
Bollinger_Lower        0
MACD                   0
MACD_Signal            0
ATR                   13
OBV                    0
Daily_Return           1
Prev_Daily_Return      2
Rolling_Volatility    20
Momentum_10 

In [None]:
for ticker in data_dict:
    data_dict[ticker].dropna(inplace=True)

In [None]:
def add_lag_features(data_dict, lag_days=[1, 2, 3, 5, 10]):
    """
    Adds lag features for each column (per ticker) without requiring unified column names.

    Parameters:
        data_dict (dict): Dictionary of stock DataFrames (e.g., {'AAPL': df1, 'MSFT': df2, ...}).
        lag_days (list): List of lag intervals to apply.

    Returns:
        dict: Updated dictionary with lag features added.
    """
    for ticker, df in data_dict.items():
        numeric_cols = df.select_dtypes(include='number').columns

        for col in numeric_cols:
            for lag in lag_days:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)

        df.dropna(inplace=True)

    return data_dict


In [None]:
data_dict = add_lag_features(data_dict)

In [None]:
for i in data_dict['AAPL']:
  print(i)

Close_AAPL
High_AAPL
Low_AAPL
Open_AAPL
Volume_AAPL
SMA50
SMA200
RSI
Bollinger_Upper
Bollinger_Lower
MACD
MACD_Signal
ATR
OBV
Daily_Return
Prev_Daily_Return
Rolling_Volatility
Momentum_10
BB_Middle
BB_Upper
BB_Lower
BB_Position
Close_AAPL_lag_1
Close_AAPL_lag_2
Close_AAPL_lag_3
Close_AAPL_lag_5
Close_AAPL_lag_10
High_AAPL_lag_1
High_AAPL_lag_2
High_AAPL_lag_3
High_AAPL_lag_5
High_AAPL_lag_10
Low_AAPL_lag_1
Low_AAPL_lag_2
Low_AAPL_lag_3
Low_AAPL_lag_5
Low_AAPL_lag_10
Open_AAPL_lag_1
Open_AAPL_lag_2
Open_AAPL_lag_3
Open_AAPL_lag_5
Open_AAPL_lag_10
Volume_AAPL_lag_1
Volume_AAPL_lag_2
Volume_AAPL_lag_3
Volume_AAPL_lag_5
Volume_AAPL_lag_10
SMA50_lag_1
SMA50_lag_2
SMA50_lag_3
SMA50_lag_5
SMA50_lag_10
SMA200_lag_1
SMA200_lag_2
SMA200_lag_3
SMA200_lag_5
SMA200_lag_10
RSI_lag_1
RSI_lag_2
RSI_lag_3
RSI_lag_5
RSI_lag_10
Bollinger_Upper_lag_1
Bollinger_Upper_lag_2
Bollinger_Upper_lag_3
Bollinger_Upper_lag_5
Bollinger_Upper_lag_10
Bollinger_Lower_lag_1
Bollinger_Lower_lag_2
Bollinger_Lower_lag_3
Bo

In [None]:
def generate_target_class(df, price_col='Close_AAPL', horizon=5, threshold=0.01):
    """
    REPLACES: Your original generate_target_class() function
    LOCATION: Use this instead of your current target generation
    FIXES: Removes look-ahead bias
    """
    # Calculate historical returns for training (no future data used)
    df['historical_return'] = df[price_col].pct_change(periods=horizon).shift(-horizon)

    # Create target classes based on historical data
    df['target_class'] = 0
    df.loc[df['historical_return'] > threshold, 'target_class'] = 1
    df.loc[df['historical_return'] < -threshold, 'target_class'] = -1

    # Remove the last 'horizon' rows as they don't have future data for training
    df = df.iloc[:-horizon].copy()

    # Drop the helper column
    df.drop('historical_return', axis=1, inplace=True)

    return df

def create_balanced_targets(df, price_col, horizon=5):
    """
    ADD THIS: New function to create better balanced targets
    LOCATION: Add this as a new function alongside generate_target_class_fixed
    PURPOSE: Creates more balanced target classes for better model performance
    """
    # Calculate forward returns
    future_returns = df[price_col].pct_change(periods=horizon).shift(-horizon)

    # Method 1: Percentile-based targets (more balanced)
    upper_percentile = future_returns.quantile(0.75)  # Top 25%
    lower_percentile = future_returns.quantile(0.25)  # Bottom 25%

    df['target_percentile'] = 0  # Middle 50%
    df.loc[future_returns > upper_percentile, 'target_percentile'] = 1  # Top 25%
    df.loc[future_returns < lower_percentile, 'target_percentile'] = -1  # Bottom 25%

    # Method 2: Binary direction (simpler and often better)
    df['target_direction'] = np.where(future_returns > 0, 1, 0)

    # Remove last rows without future data
    df = df.iloc[:-horizon].copy()

    print("Target Distribution Comparison:")
    print(f"Percentile-based: {df['target_percentile'].value_counts().sort_index()}")
    print(f"Direction: {df['target_direction'].value_counts().sort_index()}")

    return df


In [None]:
!pip install boruta
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ------------------ Helper Functions ------------------
def create_time_series_splits(df, train_ratio=0.6, val_ratio=0.2):
    """
    ADD THIS: New function for proper time series splitting
    LOCATION: Add this as a completely new function
    PURPOSE: Ensures chronological order in train/val/test splits
    """
    n = len(df)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    train_df = df.iloc[:train_end].copy()
    val_df = df.iloc[train_end:val_end].copy()
    test_df = df.iloc[val_end:].copy()

    print(f"Train period: {train_end} samples")
    print(f"Validation period: {len(val_df)} samples")
    print(f"Test period: {len(test_df)} samples")

    return train_df, val_df, test_df

def select_features_time_series(X_train, y_train, max_features=15):
    """
    REPLACES: Your select_features_boruta_mi() function
    LOCATION: Use this instead of your current feature selection
    FIXES: Prevents data leakage in feature selection
    """
    # Remove features with too many NaN values
    na_threshold = 0.3  # Allow max 30% missing values
    valid_features = X_train.columns[X_train.isnull().mean() < na_threshold]
    X_train_clean = X_train[valid_features].fillna(method='ffill').fillna(0)

    # Remove highly correlated features to prevent multicollinearity
    corr_matrix = X_train_clean.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
    X_train_clean = X_train_clean.drop(columns=high_corr_features)

    # Mutual information feature selection (only on training data)
    mi_scores = mutual_info_classif(X_train_clean, y_train, discrete_features=False, random_state=42)
    mi_series = pd.Series(mi_scores, index=X_train_clean.columns).sort_values(ascending=False)

    # Select top features based on MI scores
    selected_features = mi_series.head(max_features).index.tolist()

    print(f"Selected {len(selected_features)} features from {len(X_train.columns)} original features")
    print(f"Top 10 features: {selected_features[:10]}")

    return selected_features, mi_series


def walk_forward_validation(X, y, n_splits=5, min_train_size=252):
    """
    Walk-forward validation for time series.
    Each fold uses progressively more historical data.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=None)

    fold_results = []
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        # Ensure minimum training size
        if len(train_idx) < min_train_size:
            continue

        X_train_fold = X.iloc[train_idx]
        X_val_fold = X.iloc[val_idx]
        y_train_fold = y.iloc[train_idx]
        y_val_fold = y.iloc[val_idx]

        fold_results.append({
            'fold': fold,
            'X_train': X_train_fold,
            'X_val': X_val_fold,
            'y_train': y_train_fold,
            'y_val': y_val_fold,
            'train_period': f"{X_train_fold.index[0]} to {X_train_fold.index[-1]}",
            'val_period': f"{X_val_fold.index[0]} to {X_val_fold.index[-1]}"
        })

    return fold_results



def select_features_time_series(X_train, y_train, X_val=None, y_val=None, max_features=20):
    """
    Feature selection that prevents data leakage in time series.
    Only uses training data for selection, then applies to validation/test.
    """
    # Remove features with too many NaN values
    na_threshold = 0.3  # Allow max 30% missing values
    valid_features = X_train.columns[X_train.isnull().mean() < na_threshold]
    X_train_clean = X_train[valid_features].fillna(method='ffill').fillna(0)

    # Remove highly correlated features to prevent multicollinearity
    corr_matrix = X_train_clean.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
    X_train_clean = X_train_clean.drop(columns=high_corr_features)

    # Mutual information feature selection (only on training data)
    mi_scores = mutual_info_classif(X_train_clean, y_train, discrete_features=False, random_state=42)
    mi_series = pd.Series(mi_scores, index=X_train_clean.columns).sort_values(ascending=False)

    # Select top features based on MI scores
    selected_features = mi_series.head(max_features).index.tolist()

    # Validate feature importance with Random Forest (only on training data)
    rf_selector = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        class_weight='balanced'
    )
    rf_selector.fit(X_train_clean[selected_features], y_train)

    # Get feature importance
    feature_importance = pd.Series(
        rf_selector.feature_importances_,
        index=selected_features
    ).sort_values(ascending=False)

    # Final feature selection
    final_features = feature_importance.head(max_features).index.tolist()

    print(f"Selected {len(final_features)} features from {len(X_train.columns)} original features")
    print(f"Top 10 features: {final_features[:10]}")

    return final_features, mi_series, feature_importance

def prepare_features_for_prediction(X, selected_features):
    """
    Prepare features for prediction, handling missing values appropriately.
    """
    X_selected = X[selected_features].copy()

    # Forward fill missing values (use last known value)
    X_selected = X_selected.fillna(method='ffill')

    # If still missing values at the beginning, fill with 0
    X_selected = X_selected.fillna(0)

    return X_selected

def evaluate_model_properly(y_true, y_pred, method_name="Model"):
    """
    ADD THIS: New function for proper model evaluation
    LOCATION: Add this as a new function for better evaluation metrics
    PURPOSE: Focuses on trading-relevant metrics instead of misleading accuracy
    """
    print(f"\n📈 {method_name} Evaluation:")
    print("-" * 40)

    # Standard metrics
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    regular_acc = accuracy_score(y_true, y_pred)

    print(f"Regular Accuracy: {regular_acc:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")

    # Per-class performance
    report = classification_report(y_true, y_pred, output_dict=True)

    if '-1' in report:  # Multi-class
        print(f"DOWN moves - Precision: {report['-1']['precision']:.3f}, Recall: {report['-1']['recall']:.3f}")
        print(f"UP moves - Precision: {report['1']['precision']:.3f}, Recall: {report['1']['recall']:.3f}")

        # Trading-specific metrics
        down_f1 = report['-1']['f1-score']
        up_f1 = report['1']['f1-score']
        directional_f1 = (down_f1 + up_f1) / 2
        print(f"Average Directional F1: {directional_f1:.3f}")

    else:  # Binary
        print(f"UP moves - Precision: {report['1']['precision']:.3f}, Recall: {report['1']['recall']:.3f}")
        print(f"UP F1-Score: {report['1']['f1-score']:.3f}")

    return balanced_acc, regular_acc




In [None]:
def complete_improved_workflow(data_dict, horizon=1, use_balanced_targets=True):
    """
    REPLACES: Your entire workflow in the main execution
    LOCATION: Use this as your main execution function
    INTEGRATES: All the fixes and improvements in one place
    """
    results = {}

    for ticker, df in data_dict.items():
        print(f"\n{'='*50}")
        print(f"Processing {ticker}")
        print(f"{'='*50}")

        # Step 1: Feature Engineering (REPLACES your technical features call)
        df_features = add_technical_features(df.copy(), ticker)
        df_features = add_lag_features({ticker: df_features}, lag_days=[1, 2, 3, 5])[ticker]

        # Step 2: Target Generation (CHOICE: balanced vs original)
        if use_balanced_targets:
            df_ml = create_balanced_targets(df_features.copy(),
                                          price_col=f'Close_{ticker}',
                                          horizon=horizon)
            target_col = 'target_direction'  # Use binary direction
        else:
            df_ml = generate_target_class(df_features.copy(),
                                              price_col=f'Close_{ticker}',
                                              horizon=horizon,
                                              threshold=0.01)
            target_col = 'target_class'

        # Step 3: Clean data
        df_ml = df_ml.dropna()

        if len(df_ml) < 500:
            print(f"Insufficient data for {ticker}: {len(df_ml)} rows")
            continue

        # Step 4: Prepare features
        feature_cols = [col for col in df_ml.columns
                       if not col.startswith('target')
                       and not col.startswith('future_return')
                       and not col.startswith('historical_return')]

        X = df_ml[feature_cols]
        y = df_ml[target_col]

        print(f"Target distribution: {y.value_counts().sort_index().to_dict()}")

        # Step 5: Time Series Split (REPLACES any random splitting)
        train_df, val_df, test_df = create_time_series_splits(df_ml,
                                                            train_ratio=0.6,
                                                            val_ratio=0.2)

        X_train = train_df[feature_cols]
        y_train = train_df[target_col]
        X_val = val_df[feature_cols]
        y_val = val_df[target_col]
        X_test = test_df[feature_cols]
        y_test = test_df[target_col]

        # Step 6: Feature Selection (REPLACES your Boruta selection)
        selected_features, mi_scores = select_features_time_series(X_train, y_train, max_features=15)

        # Step 7: Prepare final datasets
        X_train_final = X_train[selected_features].fillna(method='ffill').fillna(0)
        X_val_final = X_val[selected_features].fillna(method='ffill').fillna(0)
        X_test_final = X_test[selected_features].fillna(method='ffill').fillna(0)

        # Step 8: Model Training
        model = RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            min_samples_split=20,
            min_samples_leaf=10,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )

        model.fit(X_train_final, y_train)

        # Step 9: Validation and Testing
        val_pred = model.predict(X_val_final)
        test_pred = model.predict(X_test_final)

        # Step 10: Proper Evaluation (REPLACES basic accuracy reporting)
        val_balanced_acc, val_regular_acc = evaluate_model_properly(y_val, val_pred, "Validation")
        test_balanced_acc, test_regular_acc = evaluate_model_properly(y_test, test_pred, "Test")

        # Step 11: Store Results
        results[ticker] = {
            'model': model,
            'selected_features': selected_features,
            'val_balanced_acc': val_balanced_acc,
            'val_regular_acc': val_regular_acc,
            'test_balanced_acc': test_balanced_acc,
            'test_regular_acc': test_regular_acc,
            'target_distribution': y.value_counts().to_dict(),
            'use_balanced_targets': use_balanced_targets
        }

        print(f"\n🎯 SUMMARY FOR {ticker}:")
        print(f"Validation - Regular Acc: {val_regular_acc:.4f}, Balanced Acc: {val_balanced_acc:.4f}")
        print(f"Test - Regular Acc: {test_regular_acc:.4f}, Balanced Acc: {test_balanced_acc:.4f}")

    return results

In [None]:
results = complete_improved_workflow(data_dict, horizon=1, use_balanced_targets=True)


Processing AAPL
Target Distribution Comparison:
Percentile-based: target_percentile
-1     921
 0    1841
 1     921
Name: count, dtype: int64
Direction: target_direction
0    1732
1    1951
Name: count, dtype: int64
Target distribution: {0: 1732, 1: 1951}
Train period: 2209 samples
Validation period: 737 samples
Test period: 737 samples
Selected 15 features from 572 original features
Top 10 features: ['Daily_Return_lag_2', 'OBV', 'Daily_Return_lag_2_lag_5', 'Rolling_Volatility', 'BB_Position_lag_10_lag_5', 'RSI_lag_2', 'Daily_Return_lag_3_lag_5', 'Close_AAPL', 'Momentum_10_lag_2_lag_5', 'Prev_Daily_Return_lag_10']


ValueError: too many values to unpack (expected 2)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

def complete_time_series_workflow(data_dict, horizon=5, threshold=0.01):
    """
    Complete workflow for time series stock prediction with proper validation.
    """
    results = {}

    for ticker, df in data_dict.items():
        print(f"\n{'='*50}")
        print(f"Processing {ticker}")
        print(f"{'='*50}")

        # 1. Feature Engineering (no look-ahead bias)
        df_features = add_technical_features(df.copy(), ticker)
        df_features = add_lag_features({ticker: df_features}, lag_days=[1, 2, 3, 5, 10])[ticker]

        # 2. Target Generation (no look-ahead bias)
        df_ml = generate_target_class(df_features.copy(),
                                          price_col=f'Close_{ticker}',
                                          horizon=horizon,
                                          threshold=threshold)

        # 3. Remove rows with NaN (important for time series)
        df_ml = df_ml.dropna()

        if len(df_ml) < 500:  # Need minimum data for meaningful results
            print(f"Insufficient data for {ticker}: {len(df_ml)} rows")
            continue

        # 4. Prepare features and target
        feature_cols = [col for col in df_ml.columns
                       if not col.startswith('target')
                       and not col.startswith('future_return')
                       and not col.startswith('historical_return')]

        X = df_ml[feature_cols]
        y = df_ml['target_class']

        # 5. Time Series Split (NO SHUFFLING!)
        train_df, val_df, test_df = create_time_series_splits(df_ml,
                                                            train_ratio=0.6,
                                                            val_ratio=0.2)

        X_train = train_df[feature_cols]
        y_train = train_df['target_class']
        X_val = val_df[feature_cols]
        y_val = val_df['target_class']
        X_test = test_df[feature_cols]
        y_test = test_df['target_class']

        # 6. Feature Selection (only on training data)
        selected_features, mi_scores, feature_importance = select_features_time_series(
            X_train, y_train, X_val, y_val, max_features=15
        )

        # 7. Prepare final datasets
        X_train_final = prepare_features_for_prediction(X_train, selected_features)
        X_val_final = prepare_features_for_prediction(X_val, selected_features)
        X_test_final = prepare_features_for_prediction(X_test, selected_features)

        # 8. Model Training
        model = RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            min_samples_split=20,
            min_samples_leaf=10,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )

        model.fit(X_train_final, y_train)

        # 9. Validation
        val_pred = model.predict(X_val_final)
        val_accuracy = accuracy_score(y_val, val_pred)

        # 10. Final Test (only if validation looks good)
        test_pred = model.predict(X_test_final)
        test_accuracy = accuracy_score(y_test, test_pred)

        # 11. Walk-Forward Validation for robustness
        wf_results = walk_forward_validation(X, y, n_splits=5)
        wf_accuracies = []

        for fold_data in wf_results:
            fold_model = RandomForestClassifier(
                n_estimators=100, max_depth=8,
                class_weight='balanced', random_state=42
            )

            X_fold_train = prepare_features_for_prediction(fold_data['X_train'], selected_features)
            X_fold_val = prepare_features_for_prediction(fold_data['X_val'], selected_features)

            fold_model.fit(X_fold_train, fold_data['y_train'])
            fold_pred = fold_model.predict(X_fold_val)
            fold_accuracy = accuracy_score(fold_data['y_val'], fold_pred)
            wf_accuracies.append(fold_accuracy)

        # 12. Results Summary
        results[ticker] = {
            'model': model,
            'selected_features': selected_features,
            'val_accuracy': val_accuracy,
            'test_accuracy': test_accuracy,
            'wf_mean_accuracy': np.mean(wf_accuracies),
            'wf_std_accuracy': np.std(wf_accuracies),
            'feature_importance': feature_importance,
            'data_shape': df_ml.shape,
            'class_distribution': y.value_counts().to_dict()
        }

        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Walk-Forward CV: {np.mean(wf_accuracies):.4f} ± {np.std(wf_accuracies):.4f}")
        print(f"Class Distribution: {y.value_counts().to_dict()}")

        # Classification report for detailed analysis
        print("\nTest Set Classification Report:")
        print(classification_report(y_test, test_pred))

    return results

# Example usage:
results = complete_time_series_workflow(data_dict, horizon=1, threshold=0.02)


Processing AAPL
Train period: 70 to 2272 (2203 samples)
Validation period: 2273 to 3007 (735 samples)
Test period: 3008 to 3742 (735 samples)
Selected 15 features from 557 original features
Top 10 features: ['BB_Position', 'Volume_AAPL', 'BB_Position_lag_1', 'Volume_AAPL_lag_1', 'MACD', 'Momentum_10', 'Rolling_Volatility', 'RSI_lag_5', 'BB_Position_lag_3', 'RSI']
Validation Accuracy: 0.7265
Test Accuracy: 0.7769
Walk-Forward CV: 0.8049 ± 0.0667
Class Distribution: {0: 2993, 1: 354, -1: 326}

Test Set Classification Report:
              precision    recall  f1-score   support

          -1       0.23      0.25      0.24        72
           0       0.84      0.93      0.89       592
           1       0.00      0.00      0.00        71

    accuracy                           0.78       735
   macro avg       0.36      0.39      0.37       735
weighted avg       0.70      0.78      0.74       735


Processing JPM
Train period: 70 to 2272 (2203 samples)
Validation period: 2273 to 3007 (

In [None]:
def analyze_trading_performance(results_summary):
    """
    Translate model metrics into trading reality.
    """

    print("=" * 60)
    print("TRADING REALITY CHECK")
    print("=" * 60)

    # Example for AAPL based on your results
    print("\n📊 AAPL Analysis:")
    print("-" * 30)

    # Your model predictions vs reality
    total_predictions = 735  # test set size
    actual_up_moves = 71
    actual_down_moves = 72

    # What your model actually predicted (based on precision/recall)
    predicted_up_correctly = int(0.00 * actual_up_moves)  # recall = 0.00
    predicted_down_correctly = int(0.25 * actual_down_moves)  # recall = 0.25

    print(f"Total trading opportunities: {total_predictions} days")
    print(f"Actual UP moves: {actual_up_moves} days")
    print(f"Actual DOWN moves: {actual_down_moves} days")
    print(f"Model correctly identified UP moves: {predicted_up_correctly} days")
    print(f"Model correctly identified DOWN moves: {predicted_down_correctly} days")

    # Trading implications
    print(f"\n💰 Trading Implications:")
    print(f"• Your model would miss {actual_up_moves - predicted_up_correctly} profitable UP opportunities")
    print(f"• Your model would miss {actual_down_moves - predicted_down_correctly} profitable DOWN opportunities")
    print(f"• Success rate for directional trades: ~{((predicted_up_correctly + predicted_down_correctly) / (actual_up_moves + actual_down_moves)) * 100:.1f}%")

    # What this means in dollars (hypothetical)
    print(f"\n💸 Hypothetical Trading Scenario:")
    print(f"If each correct prediction = $100 profit:")
    print(f"• Potential profits from UP moves: ${actual_up_moves * 100:,}")
    print(f"• Your model would capture: ${predicted_up_correctly * 100:,}")
    print(f"• Missed profits: ${(actual_up_moves - predicted_up_correctly) * 100:,}")

    return {
        'total_opportunities': actual_up_moves + actual_down_moves,
        'captured_opportunities': predicted_up_correctly + predicted_down_correctly,
        'success_rate': ((predicted_up_correctly + predicted_down_correctly) / (actual_up_moves + actual_down_moves)) * 100
    }

# Run the analysis
trading_analysis = analyze_trading_performance({})

print(f"\n🎯 BOTTOM LINE:")
print(f"Your model is essentially a 'do nothing' strategy disguised as AI.")
print(f"For actual trading, it would likely lose money due to:")
print(f"1. Transaction costs on wrong predictions")
print(f"2. Opportunity costs from missed moves")
print(f"3. False signals leading to losses")

TRADING REALITY CHECK

📊 AAPL Analysis:
------------------------------
Total trading opportunities: 735 days
Actual UP moves: 71 days
Actual DOWN moves: 72 days
Model correctly identified UP moves: 0 days
Model correctly identified DOWN moves: 18 days

💰 Trading Implications:
• Your model would miss 71 profitable UP opportunities
• Your model would miss 54 profitable DOWN opportunities
• Success rate for directional trades: ~12.6%

💸 Hypothetical Trading Scenario:
If each correct prediction = $100 profit:
• Potential profits from UP moves: $7,100
• Your model would capture: $0
• Missed profits: $7,100

🎯 BOTTOM LINE:
Your model is essentially a 'do nothing' strategy disguised as AI.
For actual trading, it would likely lose money due to:
1. Transaction costs on wrong predictions
2. Opportunity costs from missed moves
3. False signals leading to losses
