<a href="https://colab.research.google.com/github/kiran0843/Nifty-50-Directional-Forecasting-with-Deep-Learning/blob/main/Copy_of_niftylstm_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import yfinance as yf
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta


try:
    import pandas_ta
except ImportError:
    !pip install pandas_ta==0.3.14b0

print("üéØ NIFTY 50 DIRECTIONAL FORECASTING - ROBUST ALIGNMENT")
print("="*70)


print("üìÖ Setting up 7-year date range to TODAY...")
today = pd.Timestamp.now().normalize()
start_date = today - pd.DateOffset(years=7)

print(f"Data range: {start_date.strftime('%Y-%m-%d')} to {today.strftime('%Y-%m-%d')} (TODAY)")
print(f"Current time: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S IST')}")


# The forecast target, followed by the related markets fetched alongside it.
primary_symbol = "^NSEI"  # Nifty 50

# Ticker -> human-readable description; insertion order is the download order.
_related_markets = {
    "^NSEBANK": "Bank Nifty",
    "^NSMIDCP": "Nifty Midcap",
    "^BSESN": "BSE Sensex",
    "^DJI": "Dow Jones",
    "^GSPC": "S&P 500",
    "^IXIC": "NASDAQ",
    "USDINR=X": "USD/INR",
    "CL=F": "Crude Oil WTI",
}
correlated_symbols = list(_related_markets)

# Primary symbol first, then every correlated one.
all_symbols = [primary_symbol, *correlated_symbols]


print(f"\nüìà Downloading data for {len(all_symbols)} assets...")

raw_assets_data = {}
for symbol in all_symbols:
    try:
        print(f"  Downloading {symbol}...")
        asset_df = yf.download(
            symbol,
            start=start_date.strftime('%Y-%m-%d'),
            end=today.strftime('%Y-%m-%d'),
            progress=False
        )

        if not asset_df.empty:
            # Handle MultiIndex columns if present
            if isinstance(asset_df.columns, pd.MultiIndex):
                asset_df.columns = asset_df.columns.droplevel(1)

            # Keep only OHLCV columns if available
            available_cols = [col for col in ['Open', 'High', 'Low', 'Close', 'Volume']
                             if col in asset_df.columns]
            if available_cols:
                asset_df = asset_df[available_cols]
                raw_assets_data[symbol] = asset_df
                last_date = asset_df.index.max().strftime('%Y-%m-%d')
                print(f"    ‚úÖ {len(asset_df)} samples, last data: {last_date}")
            else:
                print(f"    ‚ùå No OHLCV columns found")
        else:
            print(f"    ‚ùå No data downloaded")

    except Exception as e:
        print(f"    ‚ùå Failed: {str(e)}")

print(f"Successfully downloaded {len(raw_assets_data)} assets")

# ===== STEP 4: CREATE EXTENDED DATE INDEX TO TODAY =====
print(f"\nüîó Creating extended date index to TODAY...")

# Download failures are only logged, so the dictionary can legitimately be
# empty here. Fail with a clear message instead of letting min() complain
# about an empty sequence (the exception type, ValueError, is unchanged).
if not raw_assets_data:
    raise ValueError("No asset data was downloaded; cannot build the extended date index")

# Get earliest date from any asset
earliest_date = min(df.index.min() for df in raw_assets_data.values())

# Create business day range (Mon-Fri) from earliest date to TODAY
extended_dates = pd.date_range(start=earliest_date, end=today, freq='B')
print(f"Extended date range: {len(extended_dates)} business days to TODAY")
print(f"Range: {earliest_date.strftime('%Y-%m-%d')} to {today.strftime('%Y-%m-%d')}")

# ===== STEP 5: ROBUST ALIGNMENT FUNCTION =====
def robust_align_and_fill(asset_df, target_index, max_forward_fill=10):
    """
    Conform ``asset_df`` to ``target_index`` and patch short gaps.

    Labels of ``target_index`` that are absent from ``asset_df`` become NaN
    rows; labels of ``asset_df`` that are not in ``target_index`` are dropped.
    Each column is then forward-filled, carrying the last observation across
    at most ``max_forward_fill`` consecutive missing rows.

    Returns a new DataFrame indexed by ``target_index`` with the columns of
    ``asset_df``; the input frame is not modified.
    """
    # reindex never raises on labels missing from the source frame
    conformed = asset_df.reindex(target_index)
    return conformed.ffill(limit=max_forward_fill)

# ===== STEP 6: ROBUSTLY ALIGN ALL ASSETS TO TODAY'S DATE =====
print(f"\nüìÖ Robustly aligning all assets to TODAY...")

# Symbol -> frame on the shared business-day index, for assets with enough coverage.
aligned_assets = {}
for symbol, asset_df in raw_assets_data.items():
    print(f"  Processing {symbol}...")

    # Put the asset on the shared calendar, bridging gaps of up to 10 rows
    aligned_filled = robust_align_and_fill(asset_df, extended_dates, max_forward_fill=10)

    # Coverage = share of calendar rows that have a Close after filling
    original_samples = len(asset_df)
    filled_samples = int(aligned_filled['Close'].count())
    fill_ratio = filled_samples / len(aligned_filled)

    # Calendar days between the last downloaded bar and today (never negative)
    last_actual = asset_df.index.max()
    days_to_fill = (today - last_actual).days if today > last_actual else 0

    print(f"    Original: {original_samples}, Filled: {filled_samples}, Coverage: {fill_ratio:.2%}")
    print(f"    Last actual data: {last_actual.strftime('%Y-%m-%d')} ({days_to_fill} days to today)")

    # Deliberately low bar (30% coverage) so more assets survive for forecasting
    if fill_ratio <= 0.3:
        print(f"    ‚ùå Dropped (low coverage)")
        continue

    aligned_assets[symbol] = aligned_filled
    print(f"    ‚úÖ Kept (extended to TODAY)")

print(f"\n‚úÖ Successfully aligned {len(aligned_assets)} assets")

# ===== STEP 7: CREATE MULTI-ASSET DATAFRAME TO TODAY =====
print(f"\nüîÑ Creating multi-asset DataFrame extended to TODAY...")

# Long format: each aligned frame is copied, tagged with its ticker, and its
# (unnamed) date index is moved into a 'Date' column.
asset_data_list = [
    frame.assign(Symbol=ticker).reset_index().rename(columns={'index': 'Date'})
    for ticker, frame in aligned_assets.items()
]

# Stack every asset, then discard rows that still have no Close after filling
multi_asset_df = pd.concat(asset_data_list, ignore_index=True).dropna(subset=['Close'])

first_day = multi_asset_df['Date'].min().strftime('%Y-%m-%d')
last_day = multi_asset_df['Date'].max().strftime('%Y-%m-%d')

print(f"‚úÖ Multi-asset DataFrame created (EXTENDED TO TODAY):")
print(f"   Total samples: {len(multi_asset_df):,}")
print(f"   Assets: {multi_asset_df['Symbol'].nunique()}")
print(f"   Date range: {first_day} to {last_day}")

# Report how close the newest row is to today: exact match, within three
# calendar days, or a larger gap.
max_date_in_data = multi_asset_df['Date'].max()
latest_day = max_date_in_data.date()
recent_cutoff = (today - pd.Timedelta(days=3)).date()

if latest_day == today.date():
    print(f"   ‚úÖ SUCCESS: Data extends to TODAY ({today.strftime('%Y-%m-%d')})")
elif latest_day >= recent_cutoff:
    print(f"   ‚úÖ RECENT: Data extends to {max_date_in_data.strftime('%Y-%m-%d')} (within 3 days)")
else:
    print(f"   ‚ö†Ô∏è GAP: Latest data is {max_date_in_data.strftime('%Y-%m-%d')}, today is {today.strftime('%Y-%m-%d')}")

# Pull the primary symbol's rows out of the long table, oldest first
is_primary = multi_asset_df['Symbol'] == primary_symbol
nifty_data = multi_asset_df[is_primary].copy().sort_values('Date').reset_index(drop=True)

print(f"   Primary (Nifty 50): {len(nifty_data):,} samples")

# Echo the newest Nifty 50 row as a sanity check
if len(nifty_data) > 0:
    latest_nifty = nifty_data.iloc[-1]
    print(f"   Latest Nifty data: {latest_nifty['Date'].strftime('%Y-%m-%d')}, Close: {latest_nifty['Close']:.2f}")

    # Heuristic: a newest Close identical to the previous row's Close is
    # reported as forward-filled rather than a fresh quote.
    second_latest = None
    if len(nifty_data) > 1:
        second_latest = nifty_data.iloc[-2]

    looks_forward_filled = (
        second_latest is not None
        and latest_nifty['Close'] == second_latest['Close']
    )
    if looks_forward_filled:
        print(f"   üìã Note: Latest data appears forward-filled from {second_latest['Date'].strftime('%Y-%m-%d')}")
    else:
        print(f"   üìã Latest data appears to be actual market data")

print(f"\nüéØ READY FOR NEXT-DAY FORECASTING!")
print(f"‚úÖ Data collection to TODAY complete - pipeline always up-to-date!")

ERROR: Could not find a version that satisfies the requirement pandas_ta==0.3.14b0 (from versions: 0.4.67b0, 0.4.71b0)
ERROR: No matching distribution found for pandas_ta==0.3.14b0
🎯 NIFTY 50 DIRECTIONAL FORECASTING - ROBUST ALIGNMENT
📅 Setting up 7-year date range to TODAY...
Data range: 2018-12-03 to 2025-12-03 (TODAY)
Current time: 2025-12-03 03:41:03 IST

📈 Downloading data for 9 assets...
  Downloading ^NSEI...
    ✅ 1727 samples, last data: 2025-12-02
  Downloading ^NSEBANK...
    ✅ 1720 samples, last data: 2025-12-02
  Downloading ^NSMIDCP...
    ✅ 1719 samples, last data: 2025-12-02
  Downloading ^BSESN...
    ✅ 1723 samples, last data: 2025-12-02
  Downloading ^DJI...
    ✅ 1759 samples, last data: 2025-12-02
  Downloading ^GSPC...
    ✅ 1759 samples, last data: 2025-12-02
  Downloading ^IXIC...
    ✅ 1759 samples, last data: 2025-12-02
  Downloading USDINR=X...
    ✅ 1823 samples, last data: 2025-12-02
  Downloading

In [None]:
import yfinance as yf
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

banner_rule = "=" * 70
print(banner_rule)
print("CELL 1B: EXTERNAL DATA + ADVANCED FEATURES V2")
print(banner_rule + "\n")

# External series are requested from 100 calendar days before the first Nifty
# row up to the last Nifty row.
nifty_dates = nifty_data['Date']
start_date = pd.to_datetime(nifty_dates.min()) - timedelta(days=100)
end_date = pd.to_datetime(nifty_dates.max())

print(f"Fetching external data: {start_date.date()} to {end_date.date()}\n")

def safe_download(ticker, name, start, end):
    """
    Download one ticker with yfinance without ever raising.

    ``name`` is only used in the progress messages. Returns the downloaded
    frame when it has at least one row; returns None when the frame is empty
    or when anything in the attempt raises (the error is printed).
    """
    try:
        print(f"Downloading {name}...")
        frame = yf.download(ticker, start=start, end=end, progress=False)
        if len(frame) == 0:
            print(f"  ‚ö†Ô∏è No data")
            return None
        print(f"  ‚úÖ {len(frame)} rows")
        return frame
    except Exception as err:
        # Best effort by design: report the failure and signal it with None
        print(f"  ‚ùå Error: {err}")
        return None

# Fetch every external series first; a failed download leaves its variable as None.
indiavix = safe_download("^INDIAVIX", "India VIX", start_date, end_date)
usdinr = safe_download("USDINR=X", "USD/INR", start_date, end_date)
oil = safe_download("CL=F", "Crude Oil", start_date, end_date)
sp500 = safe_download("^GSPC", "S&P 500", start_date, end_date)
hsi = safe_download("^HSI", "Hang Seng", start_date, end_date)
us10y = safe_download("^TNX", "US 10Y", start_date, end_date)

# Keep only the Close of each successful download, relabelled with the column
# name it will carry in the enhanced dataset.
external_data = {
    column: frame[['Close']].rename(columns={'Close': column})
    for column, frame in (
        ('INDIAVIX', indiavix),
        ('USDINR', usdinr),
        ('OIL', oil),
        ('SP500', sp500),
        ('HSI', hsi),
        ('US10Y', us10y),
    )
    if frame is not None
}

print(f"\n‚úÖ Downloaded {len(external_data)} external datasets\n")

# Start from the Nifty frame, with a proper datetime 'Date' column, oldest first.
df_enhanced = nifty_data.copy()
df_enhanced['Date'] = pd.to_datetime(df_enhanced['Date'])
df_enhanced = df_enhanced.sort_values('Date').reset_index(drop=True)

for name, data in external_data.items():
    # Reduce the download to two flat columns: the date and the series itself
    data_reset = data.reset_index()
    data_reset.columns = ['Date', name]
    data_reset['Date'] = pd.to_datetime(data_reset['Date'])

    # Left join keeps every Nifty row; dates the external series lacks become NaN
    df_enhanced = pd.merge(df_enhanced, data_reset, on='Date', how='left', suffixes=('', '_new'))

    # If a column with this name already existed, the fresh values arrive under
    # '<name>_new'; let them replace the old column.
    if f'{name}_new' in df_enhanced.columns:
        df_enhanced[name] = df_enhanced[f'{name}_new']
        df_enhanced = df_enhanced.drop(columns=[f'{name}_new'])

    if name in df_enhanced.columns:
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling
        # and produce the same values.
        # NOTE(review): bfill copies the first available observation backwards
        # into any leading gap, i.e. early rows can receive later values.
        df_enhanced[name] = df_enhanced[name].ffill().bfill()

print(f"üìä Enhanced dataset: {df_enhanced.shape}")
print(f"   New columns: {[c for c in df_enhanced.columns if c not in nifty_data.columns]}\n")

print("üîß Creating advanced features...\n")

def create_advanced_features(df):
    """
    Add return, volatility, volume, cross-asset, lag and regime columns.

    Works on a date-sorted copy of ``df`` and returns it; the input is not
    modified. ``df`` must contain 'Date', 'High', 'Low', 'Close' and 'Volume'.
    The cross-asset groups are only built when their source column
    ('INDIAVIX', 'USDINR', 'SP500', 'OIL', 'HSI', 'US10Y') is present.

    All rolling/shift operations look backwards only. Warm-up rows are left
    as NaN except for the rolling correlations, which are filled with 0, and
    the regime flags, where a comparison against NaN yields 0.
    """
    df = df.sort_values('Date').copy()

    # Simple and log one-day returns of the close
    df['Return_1d'] = df['Close'].pct_change()
    df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))

    print("  ‚Ä¢ Volatility features...")
    # Rolling standard deviation of daily returns over 5/10/20 rows
    df['RealVol_5d'] = df['Return_1d'].rolling(5).std()
    df['RealVol_10d'] = df['Return_1d'].rolling(10).std()
    df['RealVol_20d'] = df['Return_1d'].rolling(20).std()
    # Range-based volatility from log(High/Low), averaged over 10 rows
    df['Parkinson_Vol'] = np.sqrt(1/(4*np.log(2)) * (np.log(df['High']/df['Low'])**2).rolling(10).mean())

    # True range = largest of (High-Low, |High-prev Close|, |Low-prev Close|);
    # ATR_14 is its plain 14-row mean
    high_low = df['High'] - df['Low']
    high_close = np.abs(df['High'] - df['Close'].shift())
    low_close = np.abs(df['Low'] - df['Close'].shift())
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['ATR_14'] = true_range.rolling(14).mean()
    # Return scaled by 20-row volatility; the epsilon avoids division by zero
    df['Vol_Adj_Return'] = df['Return_1d'] / (df['RealVol_20d'] + 1e-6)

    print("  ‚Ä¢ Volume/microstructure features...")
    # NOTE: despite the name this is the typical price (H+L+C)/3; no volume
    # weighting is applied
    df['VWAP'] = (df['High'] + df['Low'] + df['Close']) / 3
    df['VWAP_Distance'] = (df['Close'] - df['VWAP']) / (df['VWAP'] + 1e-8)
    df['Volume_MA20'] = df['Volume'].rolling(20).mean()
    df['Volume_Ratio'] = df['Volume'] / (df['Volume_MA20'] + 1)
    df['Volume_Momentum'] = df['Volume'].pct_change(5)
    # Price-volume trend: running sum of (fractional price change * volume)
    df['PVT'] = ((df['Close'] - df['Close'].shift()) / (df['Close'].shift() + 1e-8) * df['Volume']).cumsum()
    df['PVT_MA'] = df['PVT'].rolling(20).mean()

    print("  ‚Ä¢ Cross-asset correlations...")
    if 'INDIAVIX' in df.columns:
        df['VIX_Level'] = df['INDIAVIX']
        df['VIX_Change'] = df['INDIAVIX'].pct_change()
        df['VIX_MA10'] = df['INDIAVIX'].rolling(10, min_periods=5).mean()
        # 1 when the VIX is above its own 10-row mean, else 0
        df['VIX_Regime'] = (df['INDIAVIX'] > df['VIX_MA10']).astype(int)

    if 'USDINR' in df.columns:
        df['Currency_Return'] = df['USDINR'].pct_change()
        df['Currency_MA'] = df['USDINR'].rolling(20, min_periods=10).mean()
        # Relative distance of USD/INR from its 20-row mean
        df['Currency_Strength'] = (df['USDINR'] - df['Currency_MA']) / (df['Currency_MA'] + 1e-8)

    if 'SP500' in df.columns:
        df['SP500_Return'] = df['SP500'].pct_change()
        df['SP500_Corr_30d'] = df['Return_1d'].rolling(30, min_periods=20).corr(df['SP500_Return'])
        df['SP500_Corr_30d'] = df['SP500_Corr_30d'].fillna(0)
        # Previous row's S&P 500 return (same values as SP500_Return shifted by one)
        df['SP500_Lead'] = df['SP500'].pct_change().shift(1)

    if 'OIL' in df.columns:
        df['Oil_Return'] = df['OIL'].pct_change()
        df['Oil_Momentum_5d'] = df['Oil_Return'].rolling(5, min_periods=3).mean()

    if 'HSI' in df.columns:
        df['HSI_Return'] = df['HSI'].pct_change()
        # Previous row's Hang Seng return
        df['Asia_Lead'] = df['HSI_Return'].shift(1)

    if 'US10Y' in df.columns:
        df['Yield_Level'] = df['US10Y']
        df['Yield_Change'] = df['US10Y'].diff()

    print("  ‚Ä¢ Lagged features...")
    # Past returns and past volume ratios at several lags
    for lag in [1, 2, 3, 5, 10]:
        df[f'Return_Lag{lag}'] = df['Return_1d'].shift(lag)
        if 'Volume_Ratio' in df.columns:
            df[f'Volume_Lag{lag}'] = df['Volume_Ratio'].shift(lag)

    # Multi-row price momentum
    df['Momentum_5d'] = df['Close'].pct_change(5)
    df['Momentum_10d'] = df['Close'].pct_change(10)
    df['Momentum_20d'] = df['Close'].pct_change(20)

    print("  ‚Ä¢ Regime features...")
    df['SMA_50'] = df['Close'].rolling(50).mean()
    df['SMA_200'] = df['Close'].rolling(200).mean()
    # 1 only when Close > SMA_50 > SMA_200
    df['Trend_Regime'] = ((df['Close'] > df['SMA_50']) & (df['SMA_50'] > df['SMA_200'])).astype(int)
    # Binary flag (0/1): 20-row volatility above its rolling 100-row median
    df['Vol_Regime'] = (df['RealVol_20d'] > df['RealVol_20d'].rolling(100, min_periods=50).median()).astype(int)
    # Intraday range as a fraction of the close
    df['Daily_Range'] = (df['High'] - df['Low']) / (df['Close'] + 1e-8)
    df['Range_Regime'] = (df['Daily_Range'] > df['Daily_Range'].rolling(50, min_periods=25).median()).astype(int)

    print("  ‚Ä¢ High-performance features...")

    if 'SP500' in df.columns and 'SP500_Return' in df.columns:
        df['SP500_Corr_60d'] = df['Return_1d'].rolling(60, min_periods=40).corr(df['SP500_Return'])
        df['SP500_Corr_60d'] = df['SP500_Corr_60d'].fillna(0)

    if 'SMA_50' in df.columns and 'SMA_200' in df.columns:
        # Relative spread between the two moving averages
        df['Trend_Signal'] = (df['SMA_50'] / (df['SMA_200'] + 1e-8) - 1)
        # NOTE(review): Trend_Signal can sit at or cross zero, where pct_change
        # yields inf/NaN; the resulting undefined correlations end up as 0 via
        # the fillna below -- confirm this is the intended treatment
        trend_signal_change = df['Trend_Signal'].pct_change()
        df['Corr_Trend'] = df['Return_1d'].rolling(30, min_periods=20).corr(trend_signal_change)
        df['Corr_Trend'] = df['Corr_Trend'].fillna(0)
        df['RelStrength_Trend'] = df['Close'] / (df['SMA_50'] + 1e-8)

    if 'INDIAVIX' in df.columns:
        vix_ma60 = df['INDIAVIX'].rolling(60, min_periods=30).mean()
        df['VIX_Ratio'] = df['INDIAVIX'] / (vix_ma60 + 1)
        # Change of the VIX's one-row percentage change
        df['VIX_Acceleration'] = df['VIX_Change'].diff()

    if 'USDINR' in df.columns and 'Currency_Return' in df.columns:
        df['Currency_Vol'] = df['Currency_Return'].rolling(20, min_periods=10).std()
        df['Currency_Momentum'] = df['USDINR'].pct_change(10)

    if 'OIL' in df.columns and 'Oil_Return' in df.columns:
        df['Oil_Nifty_Corr'] = df['Return_1d'].rolling(30, min_periods=20).corr(df['Oil_Return'])
        df['Oil_Nifty_Corr'] = df['Oil_Nifty_Corr'].fillna(0)

    print("  ‚úÖ All features created\n")

    return df

df_enhanced = create_advanced_features(df_enhanced)

# Keep the pre-enrichment frame, then make the enriched copy the working dataset
nifty_data_original = nifty_data.copy()
nifty_data = df_enhanced.copy()

original_width = nifty_data_original.shape[1]
enhanced_width = nifty_data.shape[1]

print("="*70)
print(f"‚úÖ ENHANCED DATASET READY V2")
print(f"   Original features: {original_width}")
print(f"   Enhanced features: {enhanced_width}")
print(f"   New features added: {enhanced_width - original_width}")
print(f"\nüìä Expected usable features in Cell 8: ~{enhanced_width - 15}")
print("="*70 + "\n")


CELL 1B: EXTERNAL DATA + ADVANCED FEATURES V2

Fetching external data: 2018-08-25 to 2025-12-03

Downloading India VIX...
  ✅ 1776 rows
Downloading USD/INR...
  ✅ 1893 rows
Downloading Crude Oil...
  ✅ 1825 rows
Downloading S&P 500...
  ✅ 1827 rows
Downloading Hang Seng...
  ✅ 1789 rows
Downloading US 10Y...
  ✅ 1827 rows

✅ Downloaded 6 external datasets

📊 Enhanced dataset: (1828, 13)
   New columns: ['INDIAVIX', 'USDINR', 'OIL', 'SP500', 'HSI', 'US10Y']

🔧 Creating advanced features...

  • Volatility features...
  • Volume/microstructure features...
  • Cross-asset correlations...
  • Lagged features...
  • Regime features...
  • High-performance features...
  ✅ All features created

✅ ENHANCED DATASET READY V2
   Original features: 7
   Enhanced features: 72
   New features added: 65

📊 Expected usable features in Cell 8: ~57



In [None]:

print("‚è∞ TEMPORAL SPLIT - LEAK-PROOF APPROACH")
print("="*50)


print("üìÖ Defining temporal split boundaries...")


def _as_day(stamp):
    """Render a timestamp as YYYY-MM-DD."""
    return stamp.strftime('%Y-%m-%d')


# Span of the available Nifty history
nifty_dates = nifty_data['Date']
data_start = nifty_dates.min()
data_end = nifty_dates.max()

# First five years train, the sixth year validates, the remainder tests
train_end_date = data_start + pd.DateOffset(years=5)
val_end_date = data_start + pd.DateOffset(years=6)
test_end_date = data_end

print(f"Data range: {_as_day(data_start)} to {_as_day(data_end)}")
print(f"Train period: {_as_day(data_start)} to {_as_day(train_end_date)}")
print(f"Validation period: {_as_day(train_end_date)} to {_as_day(val_end_date)}")
print(f"Test period: {_as_day(val_end_date)} to {_as_day(test_end_date)}")

def _three_way_split(frame, first_cut, second_cut):
    """
    Cut ``frame`` by its 'Date' column into three independent copies:
    dates <= first_cut, first_cut < dates <= second_cut, dates > second_cut.
    """
    dates = frame['Date']
    head = frame[dates <= first_cut].copy()
    middle = frame[(dates > first_cut) & (dates <= second_cut)].copy()
    tail = frame[dates > second_cut].copy()
    return head, middle, tail


# ===== STEP 2: SPLIT PRIMARY ASSET DATA =====
print("\nüéØ Splitting primary asset (Nifty 50) temporally...")

train_data, val_data, test_data = _three_way_split(nifty_data, train_end_date, val_end_date)

print(f"‚úÖ Primary asset splits:")
print(f"   Train: {len(train_data):,} samples ({len(train_data)*100/len(nifty_data):.1f}%)")
print(f"   Validation: {len(val_data):,} samples ({len(val_data)*100/len(nifty_data):.1f}%)")
print(f"   Test: {len(test_data):,} samples ({len(test_data)*100/len(nifty_data):.1f}%)")

# ===== STEP 3: SPLIT MULTI-ASSET DATA FOR TRANSFER LEARNING =====
print("\nüåê Splitting multi-asset data temporally...")

# Same cut dates as the primary asset, applied to the long multi-asset table
multi_train_data, multi_val_data, multi_test_data = _three_way_split(
    multi_asset_df, train_end_date, val_end_date
)

print(f"‚úÖ Multi-asset splits:")
print(f"   Train: {len(multi_train_data):,} samples")
print(f"   Validation: {len(multi_val_data):,} samples")
print(f"   Test: {len(multi_test_data):,} samples")

# ===== STEP 4: VERIFY NO TEMPORAL LEAKAGE =====
print("\nüîç Verifying no temporal leakage...")

# An empty split has no min/max date (NaT), which would otherwise surface
# below as an unhelpful strftime failure. Name the real problem instead.
for split_label, split_frame in (("train", train_data), ("validation", val_data), ("test", test_data)):
    if len(split_frame) == 0:
        raise ValueError(f"The {split_label} split is empty; not enough history for the 5y/1y/rest split")

train_max_date = train_data['Date'].max()
val_min_date = val_data['Date'].min()
val_max_date = val_data['Date'].max()
test_min_date = test_data['Date'].min()

print(f"Train ends: {train_max_date.strftime('%Y-%m-%d')}")
print(f"Val starts: {val_min_date.strftime('%Y-%m-%d')}")
print(f"Val ends: {val_max_date.strftime('%Y-%m-%d')}")
print(f"Test starts: {test_min_date.strftime('%Y-%m-%d')}")

# Whole days between the end of one split and the start of the next
gap1 = (val_min_date - train_max_date).days
gap2 = (test_min_date - val_max_date).days

# A gap of 0 days means the same date sits on both sides of a boundary, which
# is an overlap; only strictly positive gaps are clean.
if gap1 > 0 and gap2 > 0:
    print("‚úÖ NO TEMPORAL LEAKAGE: Clean chronological splits")
else:
    print("‚ùå WARNING: Possible temporal overlap detected!")

print("\n‚úÖ TEMPORAL SPLIT COMPLETE!")


⏰ TEMPORAL SPLIT - LEAK-PROOF APPROACH
📅 Defining temporal split boundaries...
Data range: 2018-12-03 to 2025-12-03
Train period: 2018-12-03 to 2023-12-03
Validation period: 2023-12-03 to 2024-12-03
Test period: 2024-12-03 to 2025-12-03

🎯 Splitting primary asset (Nifty 50) temporally...
✅ Primary asset splits:
   Train: 1,305 samples (71.4%)
   Validation: 262 samples (14.3%)
   Test: 261 samples (14.3%)

🌐 Splitting multi-asset data temporally...
✅ Multi-asset splits:
   Train: 11,745 samples
   Validation: 2,358 samples
   Test: 2,349 samples

🔍 Verifying no temporal leakage...
Train ends: 2023-12-01
Val starts: 2023-12-04
Val ends: 2024-12-03
Test starts: 2024-12-04
✅ NO TEMPORAL LEAKAGE: Clean chronological splits

✅ TEMPORAL SPLIT COMPLETE!


In [None]:

import numpy as np
import pandas as pd

print("üîß ENHANCED FEATURE ENGINEERING WITH EXTERNAL SIGNALS")
print("="*65)

def create_comprehensive_features(df):
    """
    Add technical indicators and combined external-signal features.

    Works on a copy of ``df`` (the input is not modified) and returns it.
    ``df`` must contain 'Open', 'High', 'Low' and 'Close'. Every other input
    column is optional: a derived feature is only built when the columns it
    reads are present.

    Finally every NaN and every +/-inf in the frame is replaced by 0, so the
    result can be fed to a scaler directly.

    Notes:
        * 'Daily_Range' and 'SMA_50' are (re)written here; if the input
          already carries columns with those names they are overwritten.
        * Rolling windows use ``min_periods=1`` and only see the rows of the
          frame passed in.
    """
    df = df.copy()

    # ===== ORIGINAL TECHNICAL FEATURES =====
    print("üìà Creating original technical features...")

    # Price-based features
    df['Returns'] = df['Close'].pct_change()
    df['Log_Returns'] = np.log(df['Close'] / df['Close'].shift(1))
    df['Price_Change'] = df['Close'] - df['Open']
    df['Daily_Range'] = df['High'] - df['Low']
    df['Body_Size'] = abs(df['Close'] - df['Open'])

    # Moving averages
    df['SMA_5'] = df['Close'].rolling(5, min_periods=1).mean()
    df['SMA_10'] = df['Close'].rolling(10, min_periods=1).mean()
    df['SMA_20'] = df['Close'].rolling(20, min_periods=1).mean()
    df['SMA_50'] = df['Close'].rolling(50, min_periods=1).mean()

    # Exponential moving averages
    df['EMA_12'] = df['Close'].ewm(span=12).mean()
    df['EMA_26'] = df['Close'].ewm(span=26).mean()

    # MACD
    df['MACD'] = df['EMA_12'] - df['EMA_26']
    df['MACD_Signal'] = df['MACD'].ewm(span=9).mean()
    df['MACD_Hist'] = df['MACD'] - df['MACD_Signal']

    # RSI from simple 14-row means of gains and losses; a zero average loss is
    # nudged to 0.0001 so the ratio stays finite
    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14, min_periods=1).mean()
    rs = gain / loss.replace(0, 0.0001)
    df['RSI_14'] = 100 - (100 / (1 + rs))

    # Bollinger Bands (20-row mean +/- 2 standard deviations)
    sma_20 = df['SMA_20']
    std_20 = df['Close'].rolling(20, min_periods=1).std()
    df['BB_upper'] = sma_20 + (2 * std_20)
    df['BB_lower'] = sma_20 - (2 * std_20)
    df['BB_width'] = df['BB_upper'] - df['BB_lower']
    # Zero-width bands give inf/NaN here; those are zeroed at the end
    df['BB_position'] = (df['Close'] - df['BB_lower']) / (df['BB_upper'] - df['BB_lower'])

    # ===== ENHANCED EXTERNAL FEATURES =====
    print("üåç Enhancing external signal features...")

    # NOTE(review): several branches below look for 'Corr_SP500_30',
    # 'RelStrength_MSCI', 'INR_Strength', 'Oil_Momentum', 'AD_Line', 'PCR'
    # and 'PCR_Signal'. create_advanced_features produces differently named
    # columns ('SP500_Corr_30d', 'RelStrength_Trend', 'Currency_Strength',
    # 'Oil_Momentum_5d'), so these branches only run if some other step
    # supplies those exact names -- confirm whether they are meant to match.

    # Market regime detection based on multiple signals
    if 'Vol_Regime' in df.columns and 'VIX_Ratio' in df.columns:
        # Combined volatility signal: VIX_Ratio on high-volatility rows, else 0.
        # Vol_Regime is produced upstream as a 0/1 flag, so the high-volatility
        # state is any value above 0 (a comparison with 2 can never match it).
        df['Market_Stress'] = (df['Vol_Regime'] > 0).astype(int) * df['VIX_Ratio']

    # Inter-market momentum features
    if 'Corr_SP500_30' in df.columns:
        # Correlation stability (low = potential regime change)
        df['Corr_Stability'] = df['Corr_SP500_30'].rolling(10).std()

        # Correlation regime
        corr_percentile = df['Corr_SP500_30'].rolling(126).rank(pct=True)  # 6-month percentile
        df['Corr_Regime'] = np.where(corr_percentile > 0.8, 2,  # High correlation
                                   np.where(corr_percentile < 0.2, 0, 1))  # Low, Normal, High

    # Relative strength momentum
    if 'RelStrength_MSCI' in df.columns:
        df['RelStrength_MA'] = df['RelStrength_MSCI'].rolling(20).mean()
        df['RelStrength_Signal'] = np.where(df['RelStrength_MSCI'] > df['RelStrength_MA'], 1, -1)

    # Currency and commodity combined impact
    if 'INR_Strength' in df.columns and 'Oil_Momentum' in df.columns:
        # Combined external pressure (strong INR + low oil = positive for equities)
        df['External_Pressure'] = df['INR_Strength'] - df['Oil_Momentum']

    # Market breadth features
    if 'AD_Line' in df.columns:
        # A/D Line momentum
        df['AD_Momentum'] = df['AD_Line'] - df['AD_Line'].shift(10)

        # Breadth divergence (price up but breadth down = bearish)
        price_trend = df['Close'].pct_change(10)
        ad_trend = df['AD_Line'].pct_change(10)
        df['Breadth_Divergence'] = price_trend - ad_trend

    # Sentiment features
    if 'PCR' in df.columns:
        # PCR moving average for trend
        df['PCR_MA'] = df['PCR'].rolling(10).mean()

        # Extreme sentiment readings
        df['Extreme_Fear'] = (df['PCR'] > 1.3).astype(int)  # Contrarian bullish
        df['Extreme_Greed'] = (df['PCR'] < 0.7).astype(int)  # Contrarian bearish

    # ===== INTERACTION FEATURES =====
    print("üîó Creating interaction features...")

    # Technical + External interactions
    if 'VIX_Ratio' in df.columns:
        # RSI effectiveness in different volatility regimes
        df['RSI_Vol_Adjusted'] = df['RSI_14'] * (1 / (1 + df['VIX_Ratio']))

    if 'Corr_SP500_30' in df.columns:
        # MACD effectiveness based on correlation regime
        df['MACD_Corr_Adjusted'] = df['MACD_Hist'] * (1 + abs(df['Corr_SP500_30']))

    # ===== REGIME-BASED FEATURES =====
    print("üìä Creating regime-based features...")

    # Market regime classification: sum of whichever regime signals exist
    # (a constant 0 column when none of them is present)
    regime_score = 0

    if 'Vol_Regime' in df.columns:
        regime_score += df['Vol_Regime']  # 0/1 as built upstream

    if 'Corr_Regime' in df.columns:
        regime_score += df['Corr_Regime']  # 0-2

    if 'PCR_Signal' in df.columns:
        regime_score += (df['PCR_Signal'] + 1)  # -1,0,1 -> 0,1,2

    # Overall market regime (0=calm, higher=stressed)
    df['Market_Regime'] = regime_score

    # Fill NaN and infinite values
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], 0)

    return df

# Run the feature builder on each split separately, so every rolling window
# only sees rows of its own split
print("üìä Applying enhanced features to all data splits...")

train_features_df = create_comprehensive_features(train_data)
val_features_df = create_comprehensive_features(val_data)
test_features_df = create_comprehensive_features(test_data)

print(f"‚úÖ Enhanced features applied:")
print(f"   Train: {train_features_df.shape}")
print(f"   Validation: {val_features_df.shape}")
print(f"   Test: {test_features_df.shape}")

# Everything except identifiers and the raw OHLCV columns is a model input
exclude_cols = ['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume']
feature_columns = [col for col in train_features_df.columns if col not in exclude_cols]

# Name fragments that mark a column as an external-signal feature (reporting only)
external_markers = ['Corr_', 'VIX_', 'RelStrength', 'INR_', 'Oil_', 'AD_', 'PCR', 'Market_', 'External_']
external_feature_count = sum(
    1 for col in feature_columns
    if any(marker in col for marker in external_markers)
)

print(f"\nüéØ Feature selection:")
print(f"   Total feature columns: {len(feature_columns)}")
print(f"   External signal features: {external_feature_count}")

# Plain NumPy matrices, with columns in feature_columns order for every split
X_train_raw = train_features_df[feature_columns].to_numpy()
X_val_raw = val_features_df[feature_columns].to_numpy()
X_test_raw = test_features_df[feature_columns].to_numpy()

print(f"‚úÖ Enhanced feature matrices created:")
print(f"   Train: {X_train_raw.shape}")
print(f"   Validation: {X_val_raw.shape}")
print(f"   Test: {X_test_raw.shape}")

print("\n‚úÖ ENHANCED FEATURE ENGINEERING COMPLETE!")


🔧 ENHANCED FEATURE ENGINEERING WITH EXTERNAL SIGNALS
📊 Applying enhanced features to all data splits...
📈 Creating original technical features...
🌍 Enhancing external signal features...
🔗 Creating interaction features...
📊 Creating regime-based features...
📈 Creating original technical features...
🌍 Enhancing external signal features...
🔗 Creating interaction features...
📊 Creating regime-based features...
📈 Creating original technical features...
🌍 Enhancing external signal features...
🔗 Creating interaction features...
📊 Creating regime-based features...
✅ Enhanced features applied:
   Train: (1305, 92)
   Validation: (262, 92)
   Test: (261, 92)

🎯 Feature selection:
   Total feature columns: 85
   External signal features: 15
✅ Enhanced feature matrices created:
   Train: (1305, 85)
   Validation: (262, 85)
   Test: (261, 85)

✅ ENHANCED FEATURE ENGINEERING COMPLETE!


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

print("üéØ DIRECTIONAL LABEL CREATION WITH DEAD ZONE")
print("="*55)


threshold = 0.001

print(f"Dead zone threshold: ¬±{threshold*100:.1f}%")
print("Rationale: Covers transaction costs + noise filtering")


def calculate_next_return(df):
    """
    Return a copy of ``df`` with the following row's close and return added.

    'Next_Close' is 'Close' shifted up by one row and 'Next_Return' is the
    fractional change from 'Close' to 'Next_Close'. Both are NaN on the last
    row, which has no successor. The input frame is not modified.
    """
    following_close = df['Close'].shift(-1)
    return df.assign(
        Next_Close=following_close,
        Next_Return=(following_close - df['Close']) / df['Close'],
    )

# Next-day returns are computed inside each split, so the last row of a split
# never borrows a close from the split that follows it
print("\nüìà Calculating next-day returns...")

train_with_returns, val_with_returns, test_with_returns = (
    calculate_next_return(split_df)
    for split_df in (train_features_df, val_features_df, test_features_df)
)


def create_directional_labels(returns, threshold):
    """
    Turn returns into UP/DOWN labels, leaving a dead zone unlabelled.

        1.0  -> return above +threshold (UP)
        0.0  -> return below -threshold (DOWN)
        NaN  -> anything else: inside the dead zone, or a NaN return

    The result is a float array; NaN entries are meant to be dropped by the
    caller.
    """
    is_up = returns > threshold
    is_down = returns < -threshold
    # Resolve DOWN vs. unlabelled first, then let UP take precedence
    down_or_unlabelled = np.where(is_down, 0, np.nan)
    return np.where(is_up, 1, down_or_unlabelled)

print(f"\nüè∑Ô∏è Creating directional labels...")

y_train_raw = create_directional_labels(train_with_returns['Next_Return'].values, threshold)
y_val_raw = create_directional_labels(val_with_returns['Next_Return'].values, threshold)
y_test_raw = create_directional_labels(test_with_returns['Next_Return'].values, threshold)

# ===== STEP 4: REMOVE DEAD ZONE SAMPLES =====
print("üóëÔ∏è Removing dead zone samples...")

# Find valid indices (non-NaN labels)
train_valid_idx = ~np.isnan(y_train_raw)
val_valid_idx = ~np.isnan(y_val_raw)
test_valid_idx = ~np.isnan(y_test_raw)

# Filter features and labels
X_train_filtered = X_train_raw[train_valid_idx]
y_train_filtered = y_train_raw[train_valid_idx].astype(int)

X_val_filtered = X_val_raw[val_valid_idx]
y_val_filtered = y_val_raw[val_valid_idx].astype(int)

X_test_filtered = X_test_raw[test_valid_idx]
y_test_filtered = y_test_raw[test_valid_idx].astype(int)

print(f"‚úÖ Filtered datasets:")
print(f"   Train: {X_train_raw.shape[0]} ‚Üí {X_train_filtered.shape[0]} ({np.mean(train_valid_idx)*100:.1f}% kept)")
print(f"   Validation: {X_val_raw.shape[0]} ‚Üí {X_val_filtered.shape[0]} ({np.mean(val_valid_idx)*100:.1f}% kept)")
print(f"   Test: {X_test_raw.shape[0]} ‚Üí {X_test_filtered.shape[0]} ({np.mean(test_valid_idx)*100:.1f}% kept)")

# ===== STEP 5: CLASS DISTRIBUTION ANALYSIS =====
print(f"\nüìä Class distribution analysis:")

labelled_splits = (
    ("Train", y_train_filtered),
    ("Val", y_val_filtered),
    ("Test", y_test_filtered),
)
for split_name, labels in labelled_splits:
    total = len(labels)
    up_count = (labels == 1).sum()
    down_count = (labels == 0).sum()
    up_pct = up_count / total * 100
    down_pct = down_count / total * 100

    print(f"   {split_name}: UP={up_count} ({up_pct:.1f}%), DOWN={down_count} ({down_pct:.1f}%)")

# Balanced class weights, derived from the training labels only
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    'balanced',
    classes=np.array([0, 1]),
    y=y_train_filtered,
)
# Label -> weight mapping: {0: weight for DOWN, 1: weight for UP}
class_weight_dict = dict(enumerate(class_weights))

print(f"\n‚öñÔ∏è Computed class weights:")
print(f"   DOWN (0): {class_weights[0]:.3f}")
print(f"   UP (1): {class_weights[1]:.3f}")

print("\n‚úÖ DIRECTIONAL LABELS WITH DEAD ZONE COMPLETE!")


In [None]:

from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

print("‚öñÔ∏è FEATURE SCALING & SELECTION - LEAK-PROOF")
print("="*50)

# ===== STEP 1: FIT SCALER ONLY ON TRAIN DATA =====
print("üîß Fitting RobustScaler on train data only...")

# The scaler's statistics come from the training split alone; validation and
# test are only transformed, so no information from them reaches the scaler.
feature_scaler = RobustScaler()
X_train_scaled = feature_scaler.fit_transform(X_train_filtered)
X_val_scaled = feature_scaler.transform(X_val_filtered)
X_test_scaled = feature_scaler.transform(X_test_filtered)

print(f"‚úÖ Scaling complete:")
print(f"   Feature scaler fitted on {X_train_scaled.shape[0]} train samples")
print(f"   Applied to all splits")

# ===== STEP 2: FEATURE SELECTION ON TRAIN DATA ONLY =====
print(f"\nüéØ Feature selection using mutual information (train only)...")

# Keep the top 20 features by mutual information, capped at the number of
# features actually available so the selector never asks for more columns
# than exist (the walk-forward pipeline later in this file uses the same guard).
n_features_to_keep = min(20, X_train_scaled.shape[1])
selector = SelectKBest(mutual_info_classif, k=n_features_to_keep)

# Fit on train only, then apply the fitted column mask to the other splits.
X_train_selected = selector.fit_transform(X_train_scaled, y_train_filtered)
X_val_selected = selector.transform(X_val_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Names and scores of the retained columns (feature_columns is positional).
selected_features = [feature_columns[i] for i in selector.get_support(indices=True)]
feature_scores = selector.scores_

print(f"‚úÖ Feature selection complete:")
print(f"   Selected {X_train_selected.shape[1]} features from {X_train_scaled.shape[1]}")
print(f"   Selection fitted on train data only")

print(f"\nüèÜ Top 10 selected features:")
feature_importance = list(zip(selected_features, feature_scores[selector.get_support()]))
feature_importance.sort(key=lambda x: x[1], reverse=True)
for i, (feature, score) in enumerate(feature_importance[:10]):
    print(f"   {i+1:2d}. {feature:<20} (score: {score:.4f})")


print(f"\nüìä Final processed datasets:")
print(f"   Train: {X_train_selected.shape} features, {len(y_train_filtered)} labels")
print(f"   Validation: {X_val_selected.shape} features, {len(y_val_filtered)} labels")
print(f"   Test: {X_test_selected.shape} features, {len(y_test_filtered)} labels")


# Dates of the rows that survived dead-zone filtering, aligned with the
# filtered feature matrices above.
train_dates = train_with_returns[train_valid_idx]['Date'].values
val_dates = val_with_returns[val_valid_idx]['Date'].values
test_dates = test_with_returns[test_valid_idx]['Date'].values

print(f"\n‚úÖ Date information preserved for sequence creation")

print("\n‚úÖ FEATURE SCALING & SELECTION COMPLETE!")


In [None]:

# Section banner for the sequence-creation cell.
print("üì± SEQUENCE CREATION WITH STRIDE", "="*40, sep="\n")


def create_sequences_with_stride(X, y, dates, sequence_length=30, stride=3):
    """
    Slice fixed-length windows out of X, stepping `stride` rows at a time.

    For every target position ``i`` in ``range(sequence_length, len(X), stride)``
    the window is ``X[i - sequence_length:i]`` (row ``i`` itself is NOT part of
    the window) and the label/date are ``y[i]`` / ``dates[i]``.

    Args:
        X: Feature matrix, indexed by row.
        y: Labels, aligned with the rows of X.
        dates: Date array, aligned with the rows of X.
        sequence_length: Number of rows in each window.
        stride: Step between consecutive target positions (>1 reduces overlap).

    Returns:
        Tuple ``(X_sequences, y_sequences, sequence_dates)`` of NumPy arrays.
        All three are empty when X has no more than `sequence_length` rows.
    """
    # Positions whose label/date are paired with the window that precedes them.
    target_positions = range(sequence_length, len(X), stride)

    windows = [X[pos - sequence_length:pos] for pos in target_positions]
    targets = [y[pos] for pos in target_positions]
    target_dates = [dates[pos] for pos in target_positions]

    return np.array(windows), np.array(targets), np.array(target_dates)


# Window length (rows per sequence) and step between consecutive windows.
sequence_length = 30
stride = 3

print(f"Sequence parameters:")
print(f"   Sequence length: {sequence_length} days")
print(f"   Stride: {stride} (reduces samples by ~{stride}x)")
print(f"   Redundancy reduction: {(1-1/stride)*100:.1f}%")

# ===== STEP 3: CREATE SEQUENCES FOR ALL SPLITS =====
print(f"\nüîÑ Creating sequences with stride...")

# Each split is windowed independently, so no window spans two splits.
# Train sequences
X_train_seq, y_train_seq, train_seq_dates = create_sequences_with_stride(
    X_train_selected, y_train_filtered, train_dates, sequence_length, stride
)

# Validation sequences
X_val_seq, y_val_seq, val_seq_dates = create_sequences_with_stride(
    X_val_selected, y_val_filtered, val_dates, sequence_length, stride
)

# Test sequences
X_test_seq, y_test_seq, test_seq_dates = create_sequences_with_stride(
    X_test_selected, y_test_filtered, test_dates, sequence_length, stride
)

print(f"‚úÖ Sequences created:")
print(f"   Train: {X_train_seq.shape} ‚Üí {len(y_train_seq)} labels")
print(f"   Validation: {X_val_seq.shape} ‚Üí {len(y_val_seq)} labels")
print(f"   Test: {X_test_seq.shape} ‚Üí {len(y_test_seq)} labels")

# ===== STEP 4: SEQUENCE VALIDATION =====
print(f"\nüîç Sequence validation:")

# Report the date span covered by each split's sequences (skipped when empty).
print("Temporal consistency checks:")
for name, dates_array in [("Train", train_seq_dates), ("Val", val_seq_dates), ("Test", test_seq_dates)]:
    if len(dates_array) > 0:
        min_date = pd.to_datetime(dates_array.min())
        max_date = pd.to_datetime(dates_array.max())
        print(f"   {name}: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

# Check class distribution in sequences
print(f"\nClass distribution in sequences:")
for name, labels in [("Train", y_train_seq), ("Val", y_val_seq), ("Test", y_test_seq)]:
    if len(labels) > 0:
        up_pct = np.mean(labels == 1) * 100
        down_pct = np.mean(labels == 0) * 100
        print(f"   {name}: UP={up_pct:.1f}%, DOWN={down_pct:.1f}%")

# Compare against the approximate number of windows a stride of 1 would give.
original_possible_sequences = len(X_train_selected) + len(X_val_selected) + len(X_test_selected) - 3*sequence_length
actual_sequences = len(y_train_seq) + len(y_val_seq) + len(y_test_seq)
reduction_factor = original_possible_sequences / actual_sequences if actual_sequences > 0 else 0
# reduction_factor is 0 when no sequences were produced; report 0% savings in
# that case instead of dividing by zero.
savings_pct = (1-1/reduction_factor)*100 if reduction_factor > 0 else 0.0

print(f"\nüìä Sequence efficiency:")
print(f"   Possible sequences (stride=1): ~{original_possible_sequences}")
print(f"   Actual sequences (stride={stride}): {actual_sequences}")
print(f"   Reduction factor: {reduction_factor:.1f}x")
print(f"   Memory/computation savings: {savings_pct:.1f}%")

print("\n‚úÖ SEQUENCE CREATION WITH STRIDE COMPLETE!")


In [None]:

import tensorflow as tf
from tensorflow.keras import layers, Model, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import AUC

# Section banner for the model-architecture cell.
print("üèóÔ∏è ENHANCED MODEL ARCHITECTURE WITH STACKED LSTMS", "="*60, sep="\n")

# Seed TensorFlow first, then NumPy, with the same fixed value.
for _seed_fn in (tf.random.set_seed, np.random.seed):
    _seed_fn(42)


def build_enhanced_lstm_cnn_classifier(input_shape):
    """
    Assemble the Conv1D -> stacked-LSTM -> dense binary classifier.

    Args:
        input_shape: Shape handed to ``layers.Input`` (no batch axis).

    Returns:
        An uncompiled ``Model`` ending in a single sigmoid unit.
    """
    def l2_penalty():
        # A fresh L2(1e-4) regularizer per layer, one instance per use.
        return regularizers.l2(1e-4)

    def conv_block(tensor, filters, kernel_size):
        # Conv1D -> BatchNormalization -> ReLU -> SpatialDropout1D(0.15)
        tensor = layers.Conv1D(filters=filters, kernel_size=kernel_size,
                               padding='same',
                               kernel_regularizer=l2_penalty())(tensor)
        tensor = layers.BatchNormalization()(tensor)
        tensor = layers.ReLU()(tensor)
        return layers.SpatialDropout1D(0.15)(tensor)

    def lstm_block(tensor, return_sequences):
        # 128-unit LSTM with input dropout 0.15 and recurrent dropout 0.1.
        return layers.LSTM(units=128,
                           dropout=0.15,
                           recurrent_dropout=0.1,
                           kernel_regularizer=l2_penalty(),
                           return_sequences=return_sequences)(tensor)

    inputs = layers.Input(shape=input_shape)

    # Two convolutional blocks: 64 filters (kernel 5), then 32 filters (kernel 3).
    features = conv_block(inputs, filters=64, kernel_size=5)
    features = conv_block(features, filters=32, kernel_size=3)

    # Stacked LSTMs: the first keeps the time axis so the second can consume it.
    features = lstm_block(features, return_sequences=True)
    features = lstm_block(features, return_sequences=False)

    # Dense head: 64 units (+ batch norm, dropout), then 32 units (+ dropout).
    hidden = layers.Dense(64, activation='relu',
                          kernel_regularizer=l2_penalty())(features)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.2)(hidden)

    hidden = layers.Dense(32, activation='relu',
                          kernel_regularizer=l2_penalty())(hidden)
    hidden = layers.Dropout(0.2)(hidden)

    # Single sigmoid unit for the binary direction output.
    outputs = layers.Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=inputs, outputs=outputs)


# The feature axis must match the sequences fed to the model. Those are built
# from X_train_selected (the columns kept by SelectKBest), not from the full
# pre-selection feature list, so take the width from the selected matrix.
input_shape = (sequence_length, X_train_selected.shape[1])
print(f"Enhanced input shape: {input_shape}")

enhanced_model = build_enhanced_lstm_cnn_classifier(input_shape)

print(f"‚úÖ Enhanced model architecture:")
print(f"   üîÑ Stacked LSTMs: 128 + 128 units (doubled capacity)")
print(f"   üìâ Reduced dropout: 0.15-0.2 (prevent underfitting)")
print(f"   üéØ Enhanced dense: 64 + 32 units")
print(f"   üìä Parameters: {enhanced_model.count_params():,}")


# Hand-picked class weights (class 0 up-weighted, class 1 down-weighted).
enhanced_class_weights = {0: 1.3, 1: 0.9}

# Adam with gradient-norm clipping; binary cross-entropy for the sigmoid output.
enhanced_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4, clipnorm=1.0),
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall', AUC(name='auc')]
)

print(f"‚úÖ Model compiled with enhanced configuration")


enhanced_callbacks = [
    # Stop when val_loss has not improved for 25 epochs; restore the best weights.
    EarlyStopping(
        monitor='val_loss',
        patience=25,  # Increased from 15
        restore_best_weights=True,
        verbose=1,
        min_delta=1e-5
    ),
    # Halve the learning rate after 15 stagnant epochs, down to 1e-6.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=15,  # Increased from 8
        min_lr=1e-6,
        verbose=1
    ),
    # Dual checkpointing: one file tracks best val_precision, one best val_auc.
    ModelCheckpoint(
        'enhanced_model_precision.keras',
        monitor='val_precision',
        save_best_only=True,
        verbose=1,
        mode='max'
    ),
    ModelCheckpoint(
        'enhanced_model_auc.keras',
        monitor='val_auc',
        save_best_only=True,
        verbose=1,
        mode='max'
    )
]

print(f"‚úÖ Enhanced callbacks configured:")
print(f"   üìà Increased early stopping patience: 25 epochs")
print(f"   üîÑ Increased LR reduction patience: 15 epochs")
print(f"   üíæ Dual checkpointing maintained")

enhanced_model.summary()

print("\n‚úÖ ENHANCED MODEL ARCHITECTURE READY!")


In [None]:

import tensorflow as tf
from tensorflow.keras import layers, Model, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import AUC

# Section banner for the (re-declared) model-architecture cell.
print("üèóÔ∏è ENHANCED MODEL ARCHITECTURE WITH STACKED LSTMS", "="*60, sep="\n")

# Fixed seeds for reproducibility: TensorFlow first, then NumPy.
for _seed_fn in (tf.random.set_seed, np.random.seed):
    _seed_fn(42)

# ===== STEP 1: ENHANCED MODEL ARCHITECTURE =====
def build_enhanced_lstm_cnn_classifier(input_shape):
    """
    Assemble the Conv1D -> stacked-LSTM -> dense binary classifier.

    Args:
        input_shape: Shape handed to ``layers.Input`` (no batch axis).

    Returns:
        An uncompiled ``Model`` ending in a single sigmoid unit.
    """
    def l2_penalty():
        # A fresh L2(1e-4) regularizer per layer, one instance per use.
        return regularizers.l2(1e-4)

    def conv_block(tensor, filters, kernel_size):
        # Conv1D -> BatchNormalization -> ReLU -> SpatialDropout1D(0.15)
        tensor = layers.Conv1D(filters=filters, kernel_size=kernel_size,
                               padding='same',
                               kernel_regularizer=l2_penalty())(tensor)
        tensor = layers.BatchNormalization()(tensor)
        tensor = layers.ReLU()(tensor)
        return layers.SpatialDropout1D(0.15)(tensor)

    def lstm_block(tensor, return_sequences):
        # 128-unit LSTM with input dropout 0.15 and recurrent dropout 0.1.
        return layers.LSTM(units=128,
                           dropout=0.15,
                           recurrent_dropout=0.1,
                           kernel_regularizer=l2_penalty(),
                           return_sequences=return_sequences)(tensor)

    inputs = layers.Input(shape=input_shape)

    # ===== CNN LAYERS: 64 filters (kernel 5), then 32 filters (kernel 3) =====
    features = conv_block(inputs, filters=64, kernel_size=5)
    features = conv_block(features, filters=32, kernel_size=3)

    # ===== STACKED LSTMS: the first keeps the time axis for the second =====
    features = lstm_block(features, return_sequences=True)
    features = lstm_block(features, return_sequences=False)

    # ===== DENSE HEAD: 64 units (+ batch norm, dropout), then 32 (+ dropout) =====
    hidden = layers.Dense(64, activation='relu',
                          kernel_regularizer=l2_penalty())(features)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.2)(hidden)

    hidden = layers.Dense(32, activation='relu',
                          kernel_regularizer=l2_penalty())(hidden)
    hidden = layers.Dropout(0.2)(hidden)

    # Output layer: single sigmoid unit.
    outputs = layers.Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=inputs, outputs=outputs)


# No model is built, compiled or summarised in this cell; per the original
# notes that happens inside the walk-forward loop with the correct input shape.

# ===== STEP 3: CLASS WEIGHTS =====
# Hand-picked class weights (class 0 up-weighted, class 1 down-weighted).
enhanced_class_weights = {0: 1.3, 1: 0.9}

# ===== STEP 4: ENHANCED CALLBACKS WITH LONGER PATIENCE =====
# Stop after 25 epochs without val_loss improvement and restore best weights.
_early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=25,
    min_delta=1e-5,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 15 stagnant epochs, never below 1e-6.
_lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=15,
    min_lr=1e-6,
    verbose=1,
)

# Dual checkpointing: one file per monitored validation metric, keeping only
# the best (maximum) value seen.
_checkpoints = [
    ModelCheckpoint(
        checkpoint_path,
        monitor=monitored_metric,
        save_best_only=True,
        verbose=1,
        mode='max',
    )
    for checkpoint_path, monitored_metric in (
        ('enhanced_model_precision.keras', 'val_precision'),
        ('enhanced_model_auc.keras', 'val_auc'),
    )
]

enhanced_callbacks = [_early_stopping, _lr_on_plateau, *_checkpoints]

print(f"‚úÖ Enhanced callbacks configured:")
print(f"   üìà Increased early stopping patience: 25 epochs")
print(f"   üîÑ Increased LR reduction patience: 15 epochs")
print(f"   üíæ Dual checkpointing maintained")

print("\n‚úÖ ENHANCED MODEL ARCHITECTURE READY!")

In [None]:


import os, io, json, random
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers as L, models, regularizers, callbacks
import tensorflow.keras.backend as K
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    roc_auc_score, matthews_corrcoef, accuracy_score,
    precision_score, recall_score, f1_score, confusion_matrix
)

# ----------------- REPRODUCIBILITY -----------------
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
# Seed NumPy, the stdlib RNG and TensorFlow (in that order) with the same value.
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# ----------------- FINAL PARAMETERS -----------------
USE_VOL_ADJUSTED = True   # prefer ATR-scaled labelling when 'ATR_14' exists
LABEL_THRESHOLD = 0.005   # 0.5% raw-return dead zone (fallback labelling)
N_SPLITS = 5              # walk-forward folds
PURGE_SAMPLES = 60        # samples dropped around each train/test boundary
SEQ_LEN = 45              # rows per input sequence
TOP_K_FEATURES = 30       # features kept by SelectKBest per fold
N_ENSEMBLE = 3            # models averaged per fold
MIN_TRADE_MCC = 0.15      # minimum MCC for a fold to count as tradeable
MIN_TRADE_ROC = 0.60      # minimum ROC-AUC for a fold to count as tradeable

os.makedirs("artifacts_final", exist_ok=True)

# Run banner, one line per printed row.
print(
    "="*70,
    "LSTM-CNN v10 FINAL: RAW ENSEMBLE (NO CALIBRATION)",
    "="*70,
    f"‚úÖ External features integrated",
    f"‚úÖ Volatility-adjusted labeling (0.5% threshold)",
    f"‚úÖ Raw ensemble predictions (NO isotonic calibration)",
    f"‚úÖ Top features: {TOP_K_FEATURES}",
    f"‚úÖ Min tradeable: MCC {MIN_TRADE_MCC}, ROC {MIN_TRADE_ROC}",
    "="*70 + "\n",
    sep="\n",
)

# ----------------- FOCAL LOSS -----------------
def focal_loss(gamma=2.0, alpha=0.25):
    """
    Build a binary focal-loss function for Keras.

    Args:
        gamma: Exponent applied to the "wrongness" term of each class.
        alpha: Weight of the positive-class term; negatives get ``1 - alpha``.

    Returns:
        A callable ``(y_true, y_pred) -> scalar`` giving the mean focal loss.
    """
    def focal_loss_fixed(y_true, y_pred):
        targets = K.cast(y_true, tf.float32)
        # Keep probabilities away from 0 and 1 so the logs stay finite.
        probs = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())

        # Positive-class term: cross-entropy scaled by alpha * (1 - p)^gamma.
        pos_ce = -targets * K.log(probs)
        pos_weight = alpha * targets * K.pow(1 - probs, gamma)

        # Negative-class term: cross-entropy scaled by (1 - alpha) * p^gamma.
        neg_ce = -(1 - targets) * K.log(1 - probs)
        neg_weight = (1 - alpha) * (1 - targets) * K.pow(probs, gamma)

        return K.mean(pos_weight * pos_ce + neg_weight * neg_ce)

    return focal_loss_fixed

# ----------------- BALANCED BATCH GENERATOR -----------------
class BalancedBatchGenerator(tf.keras.utils.Sequence):
    """
    Keras sequence that yields batches with equal numbers of class-0 and
    class-1 samples.

    Each batch holds ``batch_size // 2`` indices from each class. One epoch
    covers ``min(n_class0, n_class1)`` samples per class, so the larger class
    is subsampled; which of its samples are seen changes when the index lists
    are reshuffled at epoch end.

    Args:
        X: Array of samples, indexable by an integer index array.
        y: Array of 0/1 labels aligned with X.
        batch_size: Total samples per batch (half per class); must be >= 2.
        shuffle: Whether to reshuffle the per-class index lists each epoch.

    Raises:
        ValueError: If ``batch_size < 2`` or either class has no samples.
    """

    def __init__(self, X, y, batch_size=16, shuffle=True):
        # Initialise the Keras base class (newer Keras warns when this is skipped).
        super().__init__()

        if batch_size < 2:
            raise ValueError("batch_size must be at least 2 (one sample per class)")

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.idx_0 = np.where(y == 0)[0]
        self.idx_1 = np.where(y == 1)[0]

        self.samples_per_class = min(len(self.idx_0), len(self.idx_1))
        if self.samples_per_class == 0:
            # Without this check the failure would only surface later, when a
            # batch tries to wrap-pad an empty index array.
            raise ValueError(
                "BalancedBatchGenerator needs at least one sample of each class (0 and 1)"
            )

        # Number of samples drawn from EACH class per batch.
        self.batches_per_class = self.batch_size // 2
        self.n_batches = max(1, self.samples_per_class // self.batches_per_class)

        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.n_batches

    def on_epoch_end(self):
        """Reshuffle both per-class index lists in place when shuffling is on."""
        if self.shuffle:
            np.random.shuffle(self.idx_0)
            np.random.shuffle(self.idx_1)

    def __getitem__(self, idx):
        """Return batch number `idx` as an ``(X_batch, y_batch)`` pair."""
        start = idx * self.batches_per_class
        end = min(start + self.batches_per_class, self.samples_per_class)

        batch_idx_0 = self.idx_0[start:end]
        batch_idx_1 = self.idx_1[start:end]

        # A short slice is wrap-padded with its own entries so every batch
        # still carries batches_per_class samples of each class.
        if len(batch_idx_0) < self.batches_per_class:
            batch_idx_0 = np.pad(batch_idx_0, (0, self.batches_per_class - len(batch_idx_0)), mode='wrap')
        if len(batch_idx_1) < self.batches_per_class:
            batch_idx_1 = np.pad(batch_idx_1, (0, self.batches_per_class - len(batch_idx_1)), mode='wrap')

        # Interleave the two classes randomly within the batch.
        batch_idx = np.concatenate([batch_idx_0, batch_idx_1])
        np.random.shuffle(batch_idx)

        return self.X[batch_idx], self.y[batch_idx]

# ----------------- MODEL -----------------
def build_lstm_cnn_final(input_shape):
    """
    Build and compile the final LSTM-CNN classifier.

    Layer order: LayerNormalization -> Conv1D block -> bidirectional LSTM ->
    two dense blocks -> sigmoid output. The model is compiled with Adam
    (lr 5e-4, clipnorm 1.0), the focal loss defined in this file, and
    accuracy + AUC (named 'AUC') metrics. The original author quotes
    roughly 70k parameters; the exact count depends on `input_shape`.

    Args:
        input_shape: Shape handed to ``Input`` (no batch axis).

    Returns:
        The compiled Keras model.
    """
    def l2_penalty():
        # Fresh L2(1e-3) regularizer for each layer that uses one.
        return regularizers.l2(1e-3)

    inp = L.Input(shape=input_shape)
    normalized = L.LayerNormalization()(inp)

    # CNN block: 48 filters, kernel 3, then batch norm, ReLU, spatial dropout.
    conv = L.Conv1D(48, 3, padding='same', kernel_regularizer=l2_penalty())(normalized)
    conv = L.BatchNormalization()(conv)
    conv = L.ReLU()(conv)
    conv = L.SpatialDropout1D(0.2)(conv)

    # Bidirectional LSTM over the convolved sequence.
    recurrent_core = L.LSTM(64, dropout=0.2, recurrent_dropout=0.15,
                            kernel_regularizer=l2_penalty())
    encoded = L.Bidirectional(recurrent_core)(conv)

    # Dense head: 48 units (+ batch norm, dropout), then 24 units (+ dropout).
    hidden = L.Dense(48, activation='relu', kernel_regularizer=l2_penalty())(encoded)
    hidden = L.BatchNormalization()(hidden)
    hidden = L.Dropout(0.3)(hidden)

    hidden = L.Dense(24, activation='relu', kernel_regularizer=l2_penalty())(hidden)
    hidden = L.Dropout(0.3)(hidden)

    out = L.Dense(1, activation='sigmoid')(hidden)

    model = models.Model(inp, out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(5e-4, clipnorm=1.0),
        loss=focal_loss(gamma=2.0, alpha=0.25),
        metrics=['accuracy', tf.keras.metrics.AUC(name='AUC')],
    )
    return model

# ----------------- THRESHOLD SEARCH -----------------
def pick_best_threshold(y_true, y_prob):
    """
    Scan 21 evenly spaced cut-offs in [0.40, 0.60] and keep the one with the
    highest Matthews correlation coefficient.

    Ties keep the lowest cut-off because only a strictly better score replaces
    the current best. If no candidate scores above -1 the defaults
    ``(0.5, -1)`` are returned.

    Args:
        y_true: True binary labels.
        y_prob: Predicted probabilities; must support ``>=`` and ``.astype``.

    Returns:
        Tuple ``(best_threshold, best_mcc)``.
    """
    chosen_cutoff, top_score = 0.5, -1
    for cutoff in np.linspace(0.40, 0.60, 21):
        score = matthews_corrcoef(y_true, (y_prob >= cutoff).astype(int))
        if score > top_score:
            chosen_cutoff, top_score = cutoff, score
    return chosen_cutoff, top_score

# ----------------- DATA CLEANING -----------------
def clean_data(df):
    """
    Return a cleaned copy of `df`; the input frame is not modified.

    Steps, in order:
      1. Replace +/-inf with NaN.
      2. Clip every float32/float64 column to its own 0.1% - 99.9% quantile range.
      3. Forward-fill then back-fill NaNs in all numeric columns.
      4. Drop any row that still contains a NaN (in any column).

    Args:
        df: Input DataFrame.

    Returns:
        The cleaned DataFrame.
    """
    df = df.replace([np.inf, -np.inf], np.nan)
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Clip extremes (float columns only; integer columns are left as they are).
    for col in numeric_cols:
        if df[col].dtype in [np.float64, np.float32]:
            lower = df[col].quantile(0.001)
            upper = df[col].quantile(0.999)
            df[col] = df[col].clip(lower, upper)

    # Fill NaN with ffill()/bfill(); fillna(method=...) is deprecated in pandas.
    # NOTE: the back-fill copies values from later rows into leading gaps.
    df[numeric_cols] = df[numeric_cols].ffill().bfill()
    df = df.dropna()

    return df

# ----------------- DATA PREP -----------------
def prepare_final_dataset():
    """
    Build the labelled feature matrix from the global `nifty_data` frame.

    The frame is cleaned with `clean_data`, a next-row return is derived from
    'Close', and a three-way label is assigned: 1 (up), 0 (down) or NaN (dead
    zone). When `USE_VOL_ADJUSTED` is set and an 'ATR_14' column exists the
    return is divided by ATR-as-a-fraction-of-close and cut at +/-0.3;
    otherwise the raw return is cut at +/-LABEL_THRESHOLD. Dead-zone rows are
    dropped before features are extracted.

    Returns:
        Tuple ``(X_full, y_full, dates, features)``: feature matrix (NaN/inf
        replaced by 0), integer labels, datetime64 values of the 'Date'
        column, and the list of feature column names.

    Raises:
        AssertionError: If `nifty_data` is not defined at module level.
    """
    def three_way_label(values, cutoff):
        # 1 above +cutoff, 0 below -cutoff, NaN inside the band.
        return np.where(values > cutoff, 1,
                        np.where(values < -cutoff, 0, np.nan))

    assert 'nifty_data' in globals(), "‚ùå Missing 'nifty_data'"
    print("üìä Preparing final dataset...\n")

    df = clean_data(nifty_data.copy())
    df = df.dropna(subset=['Close'])

    # Return from this row's close to the next row's close.
    df['Next_Return'] = df['Close'].pct_change().shift(-1)

    atr_labelling = USE_VOL_ADJUSTED and 'ATR_14' in df.columns
    if atr_labelling:
        print("  ‚úì ATR-adjusted labeling")
        # Small epsilons guard the two divisions against zero denominators.
        df['ATR_Pct'] = df['ATR_14'] / (df['Close'] + 1e-8)
        df['Vol_Adj_Move'] = df['Next_Return'] / (df['ATR_Pct'] + 1e-6)
        df['Vol_Adj_Move'] = df['Vol_Adj_Move'].clip(-10, 10)
        df['label'] = three_way_label(df['Vol_Adj_Move'], 0.3)
    else:
        print(f"  ‚úì {LABEL_THRESHOLD*100:.1f}% threshold labeling")
        df['label'] = three_way_label(df['Next_Return'], LABEL_THRESHOLD)

    # Dead-zone rows (and any row without a next return) carry a NaN label.
    df = df.dropna(subset=['label']).copy()
    y_full = df['label'].astype(int).values
    dates = pd.to_datetime(df['Date']).values

    # Columns never offered as features: identifiers, raw prices, label
    # ingredients and a fixed list of raw external / long-SMA columns.
    exclude = {
        'Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume',
        'Next_Return', 'label', 'ATR_Pct', 'Vol_Adj_Move',
        'INDIAVIX', 'USDINR', 'OIL', 'SP500', 'HSI', 'US10Y',
        'SMA_50', 'SMA_200',
    }
    features = [
        column
        for column in df.select_dtypes(include=[np.number]).columns
        if column not in exclude
    ]

    X_full = np.nan_to_num(df[features].values, nan=0.0, posinf=0.0, neginf=0.0)

    up_pct = np.mean(y_full == 1)
    print(f"‚úÖ Dataset: X={X_full.shape}, y={y_full.shape}")
    print(f"   UP: {np.sum(y_full==1)} ({up_pct*100:.1f}%), "
          f"DOWN: {np.sum(y_full==0)} ({(1-up_pct)*100:.1f}%)\n")

    return X_full, y_full, dates, features

# ----------------- SPLITS -----------------
def purged_time_series_splits(dates, n_splits=5, purge=60):
    """
    Produce expanding-window train/test index pairs with a gap at the boundary.

    Starting from ``TimeSeriesSplit(n_splits)``, the last `purge` training
    indices are dropped (only when the training set is longer than `purge`)
    and the first ``purge // 2`` test indices are dropped (only when the test
    set is longer than that).

    Args:
        dates: Sequence whose length defines the number of samples; its values
            are not inspected.
        n_splits: Number of folds.
        purge: Number of samples removed from the end of each training set.

    Returns:
        List of ``(train_idx, test_idx)`` index-array tuples, one per fold.
    """
    head_trim = purge // 2
    positions = np.arange(len(dates))

    folds = []
    for train_idx, test_idx in TimeSeriesSplit(n_splits=n_splits).split(positions):
        if 0 < purge < len(train_idx):
            train_idx = train_idx[:-purge]
        if len(test_idx) > head_trim:
            test_idx = test_idx[head_trim:]
        folds.append((train_idx, test_idx))
    return folds

# ----------------- CREATE SEQUENCES -----------------
def create_sequences(X, y, dates, seq_len):
    """
    Cut X into overlapping windows of `seq_len` rows (stride 1).

    The window starting at row ``i`` is ``X[i:i + seq_len]``; it is paired
    with the label and date of row ``i + seq_len``, the first row after it.

    Args:
        X: Feature matrix, indexed by row.
        y: Labels aligned with the rows of X.
        dates: Dates aligned with the rows of X.
        seq_len: Number of rows per window.

    Returns:
        Tuple ``(windows, labels, label_dates)`` of NumPy arrays; all empty
        when X has no more than `seq_len` rows.
    """
    window_starts = range(len(X) - seq_len)

    windows = [X[start:start + seq_len] for start in window_starts]
    labels = [y[start + seq_len] for start in window_starts]
    label_dates = [dates[start + seq_len] for start in window_starts]

    return np.array(windows), np.array(labels), np.array(label_dates)

# ============================================================
# MAIN WALK-FORWARD VALIDATION
# ============================================================

X_full, y_full, dates_all, feature_names = prepare_final_dataset()
splits = purged_time_series_splits(dates_all, n_splits=N_SPLITS, purge=PURGE_SAMPLES)

# Storage
oof_predictions_raw = []  # RAW ensemble predictions, concatenated over folds
oof_true_labels = []      # true labels aligned with oof_predictions_raw
fold_summaries = []       # one metrics dict per evaluated fold
all_selected_features = []  # feature names selected in the first fold

print(f"{'='*70}")
print(f"WALK-FORWARD VALIDATION: {N_SPLITS} FOLDS")
print(f"{'='*70}\n")

for fold, (tr_idx, te_idx) in enumerate(splits):
    print(f"‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê FOLD {fold+1}/{N_SPLITS} ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó")

    X_tr_base, X_val_base = X_full[tr_idx], X_full[te_idx]
    y_tr_base, y_val_base = y_full[tr_idx], y_full[te_idx]
    dates_tr_base, dates_val_base = dates_all[tr_idx], dates_all[te_idx]

    # Safety cleaning
    X_tr_base = np.nan_to_num(X_tr_base, nan=0.0, posinf=0.0, neginf=0.0)
    X_val_base = np.nan_to_num(X_val_base, nan=0.0, posinf=0.0, neginf=0.0)

    # Scaling: statistics come from this fold's training rows only.
    scaler = RobustScaler().fit(X_tr_base)
    X_tr_s = scaler.transform(X_tr_base)
    X_val_s = scaler.transform(X_val_base)

    X_tr_s = np.nan_to_num(X_tr_s, nan=0.0, posinf=0.0, neginf=0.0)
    X_val_s = np.nan_to_num(X_val_s, nan=0.0, posinf=0.0, neginf=0.0)

    # Feature selection, fitted on this fold's training rows only.
    selector = SelectKBest(mutual_info_classif, k=min(TOP_K_FEATURES, X_tr_s.shape[1]))
    X_tr_sel = selector.fit_transform(X_tr_s, y_tr_base)
    X_val_sel = selector.transform(X_val_s)

    X_tr_sel = np.nan_to_num(X_tr_sel, nan=0.0, posinf=0.0, neginf=0.0)
    X_val_sel = np.nan_to_num(X_val_sel, nan=0.0, posinf=0.0, neginf=0.0)

    # Track selected features (first fold only; later folds may pick others).
    if fold == 0:
        selected_mask = selector.get_support()
        selected_features = [name for name, keep in zip(feature_names, selected_mask) if keep]
        all_selected_features = selected_features
        print(f"‚îÇ Top 5 features: {', '.join(selected_features[:5])}")

    # Sequences
    X_tr_seq, y_tr_seq, _ = create_sequences(X_tr_sel, y_tr_base, dates_tr_base, SEQ_LEN)
    X_val_seq, y_val_seq, dates_val_seq = create_sequences(X_val_sel, y_val_base, dates_val_base, SEQ_LEN)

    print(f"‚îÇ Train: {X_tr_seq.shape[0]} seq | Val: {X_val_seq.shape[0]} seq")

    if len(X_tr_seq) < 50 or len(X_val_seq) < 20:
        print(f"‚îÇ ‚ö†Ô∏è  SKIP: Insufficient data")
        print(f"‚ïö{'‚ïê'*40}‚ïù\n")
        continue

    print(f"‚îÇ Train UP: {np.mean(y_tr_seq):.1%} | Val UP: {np.mean(y_val_seq):.1%}")

    # ENSEMBLE: N_ENSEMBLE models that differ only in their random seed.
    print(f"‚îÇ Training {N_ENSEMBLE} models...")
    ensemble_preds = []

    for seed_offset in range(N_ENSEMBLE):
        tf.random.set_seed(SEED + seed_offset)
        np.random.seed(SEED + seed_offset)

        model = build_lstm_cnn_final((SEQ_LEN, X_tr_seq.shape[2]))

        if fold == 0 and seed_offset == 0:
            print(f"‚îÇ Model: {model.count_params():,} params")

        train_gen = BalancedBatchGenerator(X_tr_seq, y_tr_seq, batch_size=16, shuffle=True)
        val_data = (X_val_seq, y_val_seq)

        # NOTE: early stopping and LR scheduling monitor AUC on the same fold
        # that is scored below, so the reported fold metrics are optimistic.
        cb = [
            callbacks.EarlyStopping(monitor='val_AUC', patience=25,
                                   restore_best_weights=True, mode='max', verbose=0),
            callbacks.ReduceLROnPlateau(monitor='val_AUC', factor=0.5, patience=12,
                                       min_lr=1e-6, verbose=0, mode='max')
        ]

        model.fit(train_gen, validation_data=val_data, epochs=120, verbose=0, callbacks=cb)

        pred = model.predict(X_val_seq, verbose=0).ravel()
        ensemble_preds.append(pred)

    # RAW ensemble average (NO CALIBRATION)
    p_val_raw = np.mean(ensemble_preds, axis=0)

    # Find best threshold (chosen on the same fold it is then evaluated on).
    thr, mcc_thr = pick_best_threshold(y_val_seq, p_val_raw)
    y_pred = (p_val_raw >= thr).astype(int)

    # Metrics
    mcc = matthews_corrcoef(y_val_seq, y_pred)
    acc = accuracy_score(y_val_seq, y_pred)
    prec = precision_score(y_val_seq, y_pred, zero_division=0)
    rec = recall_score(y_val_seq, y_pred, zero_division=0)
    f1 = f1_score(y_val_seq, y_pred, zero_division=0)

    # ROC-AUC is undefined when the fold contains a single class; fall back to
    # chance level only for that error instead of swallowing every exception.
    try:
        roc = roc_auc_score(y_val_seq, p_val_raw)
    except ValueError:
        roc = 0.5

    pred_up_pct = np.mean(y_pred == 1)

    print(f"‚îÇ")
    print(f"‚îÇ üìä RESULTS (RAW ENSEMBLE):")
    print(f"‚îÇ   Threshold: {thr:.3f}")
    print(f"‚îÇ   Pred UP: {pred_up_pct:.1%}")
    print(f"‚îÇ   MCC: {mcc:.4f} {'‚úÖ' if mcc >= MIN_TRADE_MCC else '‚ùå'}")
    print(f"‚îÇ   ROC-AUC: {roc:.4f} {'‚úÖ' if roc >= MIN_TRADE_ROC else '‚ùå'}")
    print(f"‚îÇ   Acc: {acc:.3f} | Prec: {prec:.3f} | Rec: {rec:.3f} | F1: {f1:.3f}")

    # A fold counts as tradeable only when both minimums are met.
    status = "TRADE ‚úÖ" if (mcc >= MIN_TRADE_MCC and roc >= MIN_TRADE_ROC) else "NO_TRADE ‚ùå"
    print(f"‚îÇ   ‚Üí {status}")
    print(f"‚ïö{'‚ïê'*40}‚ïù\n")

    # Store RAW predictions
    oof_predictions_raw.extend(p_val_raw.tolist())
    oof_true_labels.extend(y_val_seq.tolist())

    fold_summaries.append({
        "fold": fold + 1,
        "status": status,
        "mcc": float(mcc),
        "roc_auc": float(roc),
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "threshold": float(thr),
        "pred_up_pct": float(pred_up_pct),
        "n_val": int(len(y_val_seq)),
        "date_range": f"{pd.to_datetime(dates_val_seq[0]).date()} to {pd.to_datetime(dates_val_seq[-1]).date()}"
    })



print(f"\n{'='*70}")
print("COMPUTING FINAL METRICS (RAW ENSEMBLE)")
print(f"{'='*70}\n")

# Pooled out-of-fold probabilities and their true labels.
oof_prob_raw, oof_true = np.array(oof_predictions_raw), np.array(oof_true_labels)

# One global threshold chosen on the pooled RAW predictions; mcc_final is the
# MCC obtained at that threshold.
thr_final, mcc_final = pick_best_threshold(oof_true, oof_prob_raw)
y_pred_final = (oof_prob_raw >= thr_final).astype(int)

# Final metrics
acc_final = accuracy_score(oof_true, y_pred_final)
prec_final, rec_final, f1_final = (
    metric_fn(oof_true, y_pred_final, zero_division=0)
    for metric_fn in (precision_score, recall_score, f1_score)
)
roc_final = roc_auc_score(oof_true, oof_prob_raw)

# Share of pooled predictions that are UP.
pred_up_final = (y_pred_final == 1).mean()

print(
    f"{'='*70}",
    f"FINAL OUT-OF-SAMPLE RESULTS ({len(oof_true)} samples)",
    f"{'='*70}",
    f"Threshold:   {thr_final:.3f}",
    f"Pred UP:     {pred_up_final:.1%}",
    f"",
    f"MCC:         {mcc_final:.4f}  {'üéØ EXCELLENT' if mcc_final > 0.20 else '‚úÖ GOOD' if mcc_final > 0.15 else '‚ö†Ô∏è MARGINAL'}",
    f"ROC-AUC:     {roc_final:.4f}  {'üéØ EXCELLENT' if roc_final > 0.65 else '‚úÖ GOOD' if roc_final > 0.60 else '‚ö†Ô∏è MARGINAL'}",
    f"",
    f"Accuracy:    {acc_final:.3f}",
    f"Precision:   {prec_final:.3f}",
    f"Recall:      {rec_final:.3f}",
    f"F1-Score:    {f1_final:.3f}",
    f"{'='*70}\n",
    sep="\n",
)

# ============================================================
# FOLD DIAGNOSTICS
# ============================================================

# One row per evaluated fold.
fold_df = pd.DataFrame(fold_summaries)

# Count folds whose status string carries the tradeable marker.
tradeable_count = len([
    summary for summary in fold_summaries
    if 'TRADE ‚úÖ' in str(summary['status'])
])

print("üìã FOLD-BY-FOLD PERFORMANCE:")
print(fold_df.loc[:, ['fold', 'status', 'mcc', 'roc_auc', 'precision', 'recall', 'f1']])
print(
    f"\n{'='*70}",
    f"‚úÖ TRADEABLE FOLDS: {tradeable_count}/{len(fold_summaries)} ({tradeable_count/len(fold_summaries)*100:.0f}%)",
    f"{'='*70}\n",
    sep="\n",
)

# ============================================================
# SAVE ARTIFACTS
# ============================================================

# Per-fold table, written as both CSV and JSON records.
fold_df.to_csv("artifacts_final/fold_results_v10.csv", index=False)
fold_df.to_json("artifacts_final/fold_results_v10.json", orient='records', indent=2)

# Pooled out-of-fold arrays: raw probabilities, true labels, thresholded predictions.
for _npy_path, _array in (
    ("artifacts_final/oof_predictions_raw_v10.npy", oof_prob_raw),
    ("artifacts_final/oof_true_labels_v10.npy", oof_true),
    ("artifacts_final/oof_predictions_final_v10.npy", y_pred_final),
):
    np.save(_npy_path, _array)

# Run summary, assembled section by section.
_configuration = {
    "no_calibration": True,
    "raw_ensemble_only": True,
    "label_threshold": f"{LABEL_THRESHOLD*100}%",
    "top_features": TOP_K_FEATURES,
    "ensemble_size": N_ENSEMBLE,
    "sequence_length": SEQ_LEN,
}
_final_metrics = {
    "mcc": float(mcc_final),
    "roc_auc": float(roc_final),
    "accuracy": float(acc_final),
    "precision": float(prec_final),
    "recall": float(rec_final),
    "f1": float(f1_final),
    "threshold": float(thr_final),
}
_fold_summary = {
    "total_folds": len(fold_summaries),
    "tradeable_folds": tradeable_count,
    "tradeable_percentage": float(tradeable_count/len(fold_summaries)*100),
}

summary = {
    "version": "v10_final",
    "configuration": _configuration,
    "final_metrics": _final_metrics,
    "fold_summary": _fold_summary,
    # First ten names from the first fold's selected-feature list.
    "top_features": all_selected_features[:10],
    # Overall verdict uses strict inequalities on the pooled metrics.
    "is_tradeable": bool(mcc_final > 0.15 and roc_final > 0.60),
}

with open("artifacts_final/final_summary_v10.json", "w") as f:
    f.write(json.dumps(summary, indent=2))



# Six-panel summary figure of the cross-validation results, saved as a PNG.
#
# The seaborn style must be applied BEFORE the figure is created: a style is a
# set of rcParams that axes read at creation time, so setting it after
# plt.subplots() would leave this figure's axes unstyled.
sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# x positions shared by the per-fold panels (folds are numbered from 1).
fold_numbers = range(1, len(fold_summaries) + 1)

# 1. Fold MCCs
ax1 = axes[0, 0]
fold_mccs = [f['mcc'] for f in fold_summaries]
# Green bars for folds whose status carries the tradeable marker, red otherwise.
colors = ['green' if 'TRADE ‚úÖ' in str(f['status']) else 'red' for f in fold_summaries]
ax1.bar(fold_numbers, fold_mccs, color=colors, alpha=0.7, edgecolor='black')
ax1.axhline(y=MIN_TRADE_MCC, color='orange', linestyle='--', linewidth=2, label=f'Min ({MIN_TRADE_MCC})')
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax1.set_xlabel('Fold', fontweight='bold', fontsize=11)
ax1.set_ylabel('MCC', fontweight='bold', fontsize=11)
ax1.set_title('Fold-wise MCC (Green=Tradeable)', fontweight='bold', fontsize=12)
ax1.legend()
ax1.grid(alpha=0.3)

# 2. ROC-AUC
ax2 = axes[0, 1]
fold_rocs = [f['roc_auc'] for f in fold_summaries]
ax2.plot(fold_numbers, fold_rocs, 'o-', linewidth=2, markersize=8, color='seagreen')
ax2.axhline(y=MIN_TRADE_ROC, color='orange', linestyle='--', linewidth=2, label=f'Target ({MIN_TRADE_ROC})')
ax2.axhline(y=0.5, color='gray', linestyle='-', alpha=0.3, label='Random')
# Shade the band between random and target in red, and above target in green.
ax2.fill_between(fold_numbers, 0.5, MIN_TRADE_ROC, alpha=0.1, color='red')
ax2.fill_between(fold_numbers, MIN_TRADE_ROC, 1.0, alpha=0.1, color='green')
ax2.set_xlabel('Fold', fontweight='bold', fontsize=11)
ax2.set_ylabel('ROC-AUC', fontweight='bold', fontsize=11)
ax2.set_title('Fold-wise ROC-AUC', fontweight='bold', fontsize=12)
ax2.legend()
ax2.grid(alpha=0.3)

# 3. Precision vs Recall
ax3 = axes[0, 2]
fold_precs = [f['precision'] for f in fold_summaries]
fold_recs = [f['recall'] for f in fold_summaries]
ax3.scatter(fold_recs, fold_precs, s=100, alpha=0.7, c=colors, edgecolor='black')
# Label every point with its fold number.
for i, fold in enumerate(fold_summaries):
    ax3.annotate(f"F{fold['fold']}", (fold_recs[i], fold_precs[i]), fontsize=9, ha='center')
ax3.set_xlabel('Recall', fontweight='bold', fontsize=11)
ax3.set_ylabel('Precision', fontweight='bold', fontsize=11)
ax3.set_title('Precision-Recall Trade-off', fontweight='bold', fontsize=12)
ax3.grid(alpha=0.3)

# 4. Prediction distribution
ax4 = axes[1, 0]
ax4.hist(oof_prob_raw, bins=40, alpha=0.7, edgecolor='black', color='steelblue')
ax4.axvline(x=thr_final, color='red', linestyle='--', linewidth=2, label=f'Threshold ({thr_final:.3f})')
ax4.set_xlabel('Raw Ensemble Probability', fontweight='bold', fontsize=11)
ax4.set_ylabel('Frequency', fontweight='bold', fontsize=11)
ax4.set_title('OOF Prediction Distribution', fontweight='bold', fontsize=12)
ax4.legend()
ax4.grid(alpha=0.3)

# 5. Confusion matrix
ax5 = axes[1, 1]
cm = confusion_matrix(oof_true, y_pred_final)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax5,
            xticklabels=['DOWN', 'UP'], yticklabels=['DOWN', 'UP'])
ax5.set_xlabel('Predicted', fontweight='bold', fontsize=11)
ax5.set_ylabel('Actual', fontweight='bold', fontsize=11)
ax5.set_title('Confusion Matrix (OOF)', fontweight='bold', fontsize=12)

# 6. Metrics summary (text-only panel; the status line uses the same fixed
# 0.15 / 0.60 cut-offs as the saved summary's "is_tradeable" flag).
ax6 = axes[1, 2]
ax6.axis('off')
metrics_text = f"""
FINAL METRICS SUMMARY

Out-of-Sample: {len(oof_true)} samples

MCC:        {mcc_final:.4f}
ROC-AUC:    {roc_final:.4f}
Accuracy:   {acc_final:.3f}
Precision:  {prec_final:.3f}
Recall:     {rec_final:.3f}
F1-Score:   {f1_final:.3f}

Tradeable Folds: {tradeable_count}/{len(fold_summaries)}

Status: {'‚úÖ TRADEABLE' if mcc_final > 0.15 and roc_final > 0.60 else '‚ö†Ô∏è MARGINAL'}
"""
ax6.text(0.1, 0.5, metrics_text, fontsize=11, family='monospace',
         verticalalignment='center', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

plt.tight_layout()
plt.savefig("artifacts_final/final_results_v10.png", dpi=150, bbox_inches='tight')
# Close the figure so it is not rendered inline and its memory is released.
plt.close()

print(f"üíæ All artifacts saved to ./artifacts_final/")
print(f"   - fold_results_v10.csv/json")
print(f"   - final_summary_v10.json")
print(f"   - OOF predictions (.npy)")
print(f"   - final_results_v10.png\n")

print(f"{'='*70}")
print("üéØ V10 FINAL COMPLETE (NO CALIBRATION)")
print(f"{'='*70}")


In [None]:


import os, json, joblib
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers as L, models, regularizers, callbacks
import tensorflow.keras.backend as K
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from datetime import datetime, timedelta

# Reproducibility: seed NumPy and TensorFlow.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here is expected to affect only subprocesses, not this process - confirm
# whether it is needed.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("="*70)
print("PRODUCTION MODEL TRAINING (BEST FOLD CONFIGURATION)")
print("="*70 + "\n")



# Load fold results
fold_df = pd.read_json("artifacts_final/fold_results_v10.json")

# Best fold = highest MCC among tradeable folds.
# Tradeable folds are identified with a substring test, the same way the fold
# diagnostics count them ('TRADE ‚úÖ' in str(status)); the previous exact
# equality would silently miss any status carrying extra text.
tradeable_mask = fold_df['status'].astype(str).str.contains('TRADE ‚úÖ', regex=False)
tradeable_folds = fold_df[tradeable_mask]

if tradeable_folds.empty:
    # Fall back to the best fold overall rather than aborting.
    print("‚ö†Ô∏è No tradeable folds found. Using best MCC fold instead.")
    best_fold_idx = fold_df['mcc'].idxmax()
else:
    best_fold_idx = tradeable_folds['mcc'].idxmax()

# idxmax returns an index label, so look the row up by label.
best_fold = fold_df.loc[best_fold_idx]

print(f"üéØ BEST FOLD IDENTIFIED: Fold {best_fold['fold']}")
print(f"   MCC: {best_fold['mcc']:.4f}")
print(f"   ROC-AUC: {best_fold['roc_auc']:.4f}")
print(f"   Precision: {best_fold['precision']:.3f}")
print(f"   Recall: {best_fold['recall']:.3f}")
print(f"   Threshold: {best_fold['threshold']:.3f}")
print(f"   Status: {best_fold['status']}\n")



# Production hyper-parameters; the decision threshold is taken from the best
# fold selected above.
SEQ_LEN = 45
TOP_K_FEATURES = 30
N_ENSEMBLE = 3
BEST_THRESHOLD = best_fold['threshold']

print("üìã Production Configuration:")
print(f"   Sequence Length: {SEQ_LEN}")
print(f"   Top Features: {TOP_K_FEATURES}")
print(f"   Ensemble Size: {N_ENSEMBLE}")
print(f"   Decision Threshold: {BEST_THRESHOLD:.3f}\n")



def focal_loss(gamma=2.0, alpha=0.25):
    """Return a binary focal-loss function usable as a Keras loss.

    Args:
        gamma: Exponent of the modulating factor applied to each class term.
        alpha: Weight of the positive-class term; the negative-class term is
            weighted by ``1 - alpha``.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` that returns the mean
        focal loss over all elements.
    """
    def focal_loss_fixed(y_true, y_pred):
        targets = K.cast(y_true, tf.float32)
        # Keep probabilities strictly inside (0, 1) so the logarithms are finite.
        probs = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())

        # Positive-class contribution: weight * cross-entropy.
        positive_term = (alpha * targets * K.pow(1 - probs, gamma)) * (-targets * K.log(probs))
        # Negative-class contribution, mirrored on (1 - target) and (1 - prob).
        negative_term = ((1 - alpha) * (1 - targets) * K.pow(probs, gamma)) * (-(1 - targets) * K.log(1 - probs))

        return K.mean(positive_term + negative_term)

    return focal_loss_fixed

class BalancedBatchGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields mini-batches with equal class counts.

    Every batch holds ``batch_size // 2`` samples with label 0 and the same
    number with label 1 (so an odd ``batch_size`` yields ``batch_size - 1``
    samples). One epoch covers as many samples per class as the smaller class
    contains; a short final slice is padded by wrapping around.

    Args:
        X: Indexable array of inputs, indexed by an integer index array.
        y: Array of 0/1 labels aligned with ``X``.
        batch_size: Target batch size; must be at least 2.
        shuffle: Whether to reshuffle the per-class index pools at
            construction and at the end of every epoch.

    Raises:
        ValueError: If ``batch_size`` is below 2 or either class is absent
            from ``y`` (a balanced batch cannot be formed).
    """

    def __init__(self, X, y, batch_size=16, shuffle=True):
        # Keras 3 data sequences expect the base initializer to run (it warns
        # otherwise); on older tf.keras this is a harmless no-op.
        super().__init__()

        if batch_size < 2:
            # batch_size // 2 would be 0 and break the batch-count division.
            raise ValueError("batch_size must be at least 2 for balanced batches")

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Row positions of each class.
        self.idx_0 = np.where(y == 0)[0]
        self.idx_1 = np.where(y == 1)[0]

        if len(self.idx_0) == 0 or len(self.idx_1) == 0:
            # Wrap-padding an empty index pool is impossible; fail early with
            # a clear message instead of an obscure np.pad error mid-training.
            raise ValueError("y must contain samples of both classes (0 and 1)")

        self.samples_per_class = min(len(self.idx_0), len(self.idx_1))
        self.batches_per_class = self.batch_size // 2  # samples drawn per class per batch
        self.n_batches = max(1, self.samples_per_class // self.batches_per_class)

        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.n_batches

    def on_epoch_end(self):
        """Reshuffle both class index pools in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.idx_0)
            np.random.shuffle(self.idx_1)

    def _class_slice(self, class_idx, start, end):
        """Return ``class_idx[start:end]``, wrap-padded to the per-class quota."""
        chosen = class_idx[start:end]
        shortfall = self.batches_per_class - len(chosen)
        if shortfall > 0:
            chosen = np.pad(chosen, (0, shortfall), mode='wrap')
        return chosen

    def __getitem__(self, idx):
        """Return the ``idx``-th balanced batch as an ``(X_batch, y_batch)`` pair."""
        start = idx * self.batches_per_class
        end = min(start + self.batches_per_class, self.samples_per_class)

        batch_idx_0 = self._class_slice(self.idx_0, start, end)
        batch_idx_1 = self._class_slice(self.idx_1, start, end)

        # Mix the two classes so the batch is not ordered by label.
        batch_idx = np.concatenate([batch_idx_0, batch_idx_1])
        np.random.shuffle(batch_idx)

        return self.X[batch_idx], self.y[batch_idx]

def build_production_model(input_shape):
    """Build and compile the Conv1D + bidirectional-LSTM binary classifier.

    Args:
        input_shape: Shape of one input sample, passed to ``Input(shape=...)``.

    Returns:
        A compiled Keras model with a single sigmoid output, trained with the
        focal loss (gamma=2.0, alpha=0.25) and reporting accuracy and AUC.
    """
    def l2_penalty():
        # Fresh L2 kernel regularizer with the strength used on every
        # trainable block below.
        return regularizers.l2(1e-3)

    inputs = L.Input(shape=input_shape)
    hidden = L.LayerNormalization()(inputs)

    # Convolutional front end.
    hidden = L.Conv1D(48, 3, padding='same', kernel_regularizer=l2_penalty())(hidden)
    hidden = L.BatchNormalization()(hidden)
    hidden = L.ReLU()(hidden)
    hidden = L.SpatialDropout1D(0.2)(hidden)

    # Recurrent summary of the whole window (only the final state is returned).
    recurrent_cell = L.LSTM(
        64,
        dropout=0.2,
        recurrent_dropout=0.15,
        kernel_regularizer=l2_penalty(),
    )
    hidden = L.Bidirectional(recurrent_cell)(hidden)

    # Dense head.
    hidden = L.Dense(48, activation='relu', kernel_regularizer=l2_penalty())(hidden)
    hidden = L.BatchNormalization()(hidden)
    hidden = L.Dropout(0.3)(hidden)

    hidden = L.Dense(24, activation='relu', kernel_regularizer=l2_penalty())(hidden)
    hidden = L.Dropout(0.3)(hidden)

    probability = L.Dense(1, activation='sigmoid')(hidden)

    network = models.Model(inputs, probability)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(5e-4, clipnorm=1.0),
        loss=focal_loss(gamma=2.0, alpha=0.25),
        metrics=['accuracy', tf.keras.metrics.AUC(name='AUC')],
    )
    return network

def create_sequences(X, y, dates, seq_len):
    """Slice a feature matrix into overlapping fixed-length windows.

    Window ``i`` holds rows ``i .. i + seq_len - 1`` of ``X`` and is paired
    with the label and date found at row ``i + seq_len``.

    Args:
        X: Array-like of per-row features; the first axis is the row axis.
        y: Array-like of per-row labels.
        dates: Array-like of per-row dates.
        seq_len: Number of rows in each window.

    Returns:
        Tuple ``(windows, labels, window_dates)`` of NumPy arrays, with
        ``windows`` shaped ``(n_windows, seq_len, *X.shape[1:])``. When ``X``
        has no more than ``seq_len`` rows, all three arrays are empty and
        ``windows`` still has the full rank, so callers can read
        ``windows.shape[2]`` safely.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    dates = np.asarray(dates)

    n_windows = len(X) - seq_len
    if n_windows <= 0:
        # Too few rows for even one window: return empty arrays that keep the
        # expected rank instead of a bare shape-(0,) array.
        empty_windows = np.empty((0, seq_len) + X.shape[1:], dtype=X.dtype)
        return empty_windows, y[:0], dates[:0]

    windows = np.stack([X[start:start + seq_len] for start in range(n_windows)])
    target_rows = slice(seq_len, seq_len + n_windows)
    return windows, y[target_rows], dates[target_rows]



print("üìä Preparing full dataset for production training...\n")

# Reload and clean data
df = nifty_data.copy()
df = df.replace([np.inf, -np.inf], np.nan)
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Winsorize every float column at its 0.1% / 99.9% quantiles.
# NOTE(review): the quantiles are computed over the whole frame, so the
# clipping bounds for early rows depend on later rows.
for col in numeric_cols:
    if df[col].dtype in [np.float64, np.float32]:
        lower = df[col].quantile(0.001)
        upper = df[col].quantile(0.999)
        df[col] = df[col].clip(lower, upper)
# Forward-fill, then back-fill the remaining leading gaps.
# DataFrame.fillna(method=...) is deprecated in pandas 2.x; .ffill()/.bfill()
# are the supported equivalents and produce the same result.
df[numeric_cols] = df[numeric_cols].ffill().bfill()
df = df.dropna()

# Labels: next-row close-to-close return, aligned to the current row.
df['Next_Return'] = df['Close'].pct_change().shift(-1)

if 'ATR_14' in df.columns:
    # Volatility-adjusted move: next return divided by ATR as a fraction of
    # price. Moves within +/-0.3 are left unlabeled (NaN) and dropped below.
    df['ATR_Pct'] = df['ATR_14'] / (df['Close'] + 1e-8)
    df['Vol_Adj_Move'] = df['Next_Return'] / (df['ATR_Pct'] + 1e-6)
    df['Vol_Adj_Move'] = df['Vol_Adj_Move'].clip(-10, 10)
    df['label'] = np.where(df['Vol_Adj_Move'] > 0.3, 1,
                   np.where(df['Vol_Adj_Move'] < -0.3, 0, np.nan))
else:
    # Fallback without ATR: label only moves larger than +/-0.5%.
    df['label'] = np.where(df['Next_Return'] > 0.005, 1,
                   np.where(df['Next_Return'] < -0.005, 0, np.nan))

# Dropping unlabeled rows also removes the final row, whose next return is
# undefined.
df = df.dropna(subset=['label']).copy()
y_full = df['label'].astype(int).values
dates = pd.to_datetime(df['Date']).values

# Features: every numeric column except identifiers, raw prices, the label
# and the columns it is derived from, and the other columns listed here.
exclude = ['Date','Symbol','Open','High','Low','Close','Volume',
           'Next_Return','label','ATR_Pct','Vol_Adj_Move',
           'INDIAVIX','USDINR','OIL','SP500','HSI','US10Y',
           'SMA_50','SMA_200']

numeric_cols = df.select_dtypes(include=[np.number]).columns
features = [c for c in numeric_cols if c not in exclude]

X_full = df[features].values
# Replace any remaining non-finite values with 0.
X_full = np.nan_to_num(X_full, nan=0.0, posinf=0.0, neginf=0.0)

print(f"‚úÖ Full dataset: {X_full.shape[0]} samples, {X_full.shape[1]} features")
print(f"   UP: {np.sum(y_full==1)} ({np.mean(y_full==1)*100:.1f}%)")
print(f"   DOWN: {np.sum(y_full==0)} ({np.mean(y_full==0)*100:.1f}%)\n")



print("üèóÔ∏è Training production ensemble...\n")

# Chronological split: the FIRST 80% of rows train the models and the most
# recent 20% are held out.
train_size = int(len(X_full) * 0.8)
X_train, X_test = X_full[:train_size], X_full[train_size:]
y_train, y_test = y_full[:train_size], y_full[train_size:]
dates_train, dates_test = dates[:train_size], dates[train_size:]

# Scale (fitted on the training rows only).
scaler = RobustScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

X_train_s = np.nan_to_num(X_train_s, nan=0.0, posinf=0.0, neginf=0.0)
X_test_s = np.nan_to_num(X_test_s, nan=0.0, posinf=0.0, neginf=0.0)

# Feature selection (fitted on the training rows only). k is capped at the
# number of available columns so a narrow feature matrix does not make
# SelectKBest reject the request.
n_selected = min(TOP_K_FEATURES, X_train_s.shape[1])
selector = SelectKBest(mutual_info_classif, k=n_selected)
X_train_sel = selector.fit_transform(X_train_s, y_train)
X_test_sel = selector.transform(X_test_s)

X_train_sel = np.nan_to_num(X_train_sel, nan=0.0, posinf=0.0, neginf=0.0)
X_test_sel = np.nan_to_num(X_test_sel, nan=0.0, posinf=0.0, neginf=0.0)

# Get selected feature names (in original column order, not by score).
selected_mask = selector.get_support()
selected_features = [name for name, keep in zip(features, selected_mask) if keep]
print(f"üîç Top 10 Features: {', '.join(selected_features[:10])}\n")

# Create sequences (built separately per split, so no window spans the
# train/test boundary).
X_train_seq, y_train_seq, _ = create_sequences(X_train_sel, y_train, dates_train, SEQ_LEN)
X_test_seq, y_test_seq, dates_test_seq = create_sequences(X_test_sel, y_test, dates_test, SEQ_LEN)

print(f"üì¶ Training sequences: {X_train_seq.shape[0]}")
print(f"üì¶ Test sequences: {X_test_seq.shape[0]}\n")

# Train ensemble: N_ENSEMBLE identically configured models that differ only
# in their random seed.
production_models = []
ensemble_preds_test = []

for i in range(N_ENSEMBLE):
    print(f"Training model {i+1}/{N_ENSEMBLE}...")
    tf.random.set_seed(SEED + i)
    np.random.seed(SEED + i)

    model = build_production_model((SEQ_LEN, X_train_seq.shape[2]))

    train_gen = BalancedBatchGenerator(X_train_seq, y_train_seq, batch_size=16, shuffle=True)
    # NOTE(review): the held-out split doubles as the validation set that
    # drives early stopping and learning-rate decay, so metrics later computed
    # on this same split are optimistically biased.
    val_data = (X_test_seq, y_test_seq)

    cb = [
        callbacks.EarlyStopping(monitor='val_AUC', patience=30,
                               restore_best_weights=True, mode='max', verbose=0),
        callbacks.ReduceLROnPlateau(monitor='val_AUC', factor=0.5, patience=15,
                                   min_lr=1e-6, verbose=0, mode='max')
    ]

    history = model.fit(train_gen, validation_data=val_data, epochs=150, verbose=0, callbacks=cb)

    # Save model
    model.save(f"artifacts_final/production_model_{i}.keras")
    production_models.append(model)

    # Test prediction
    pred = model.predict(X_test_seq, verbose=0).ravel()
    ensemble_preds_test.append(pred)

    print(f"  ‚úì Model {i+1} trained (best val AUC: {max(history.history['val_AUC']):.4f})\n")

# Ensemble average across models, per test sequence.
test_pred_avg = np.mean(ensemble_preds_test, axis=0)



from sklearn.metrics import matthews_corrcoef, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Turn the averaged ensemble probabilities into hard 0/1 calls at the
# best-fold threshold.
y_pred_test = np.greater_equal(test_pred_avg, BEST_THRESHOLD).astype(int)

# Threshold-dependent metrics use the hard calls; ROC-AUC uses the raw
# probabilities.
safe_division = {"zero_division": 0}
mcc_test = matthews_corrcoef(y_test_seq, y_pred_test)
roc_test = roc_auc_score(y_test_seq, test_pred_avg)
acc_test = accuracy_score(y_test_seq, y_pred_test)
prec_test = precision_score(y_test_seq, y_pred_test, **safe_division)
rec_test = recall_score(y_test_seq, y_pred_test, **safe_division)
f1_test = f1_score(y_test_seq, y_pred_test, **safe_division)

report_lines = [
    f"{'='*70}",
    f"PRODUCTION MODEL TEST PERFORMANCE",
    f"{'='*70}",
    f"Test samples: {len(y_test_seq)}",
    f"Threshold: {BEST_THRESHOLD:.3f}",
    f"",
    f"MCC:         {mcc_test:.4f}",
    f"ROC-AUC:     {roc_test:.4f}",
    f"Accuracy:    {acc_test:.3f}",
    f"Precision:   {prec_test:.3f}",
    f"Recall:      {rec_test:.3f}",
    f"F1-Score:    {f1_test:.3f}",
    f"{'='*70}\n",
]
print("\n".join(report_lines))



print("üîÆ GENERATING NEXT-DAY PREDICTION...\n")

# Get last SEQ_LEN samples for prediction.
# NOTE(review): rows without a label were dropped during data preparation, so
# this window ends at the last LABELED row, which may be older than the newest
# downloaded bar. Also, create_sequences pairs a window with the label of the
# row AFTER the window - confirm the horizon reported below is the intended one.
X_latest = X_test_sel[-SEQ_LEN:]
X_latest_seq = X_latest.reshape(1, SEQ_LEN, -1)

# Ensemble prediction: one probability per trained model.
latest_preds = []
for model in production_models:
    pred = model.predict(X_latest_seq, verbose=0).ravel()[0]
    latest_preds.append(pred)

latest_pred_avg = np.mean(latest_preds)
latest_pred_std = np.std(latest_preds)
latest_decision = "UP üü¢" if latest_pred_avg >= BEST_THRESHOLD else "DOWN üî¥"

latest_date = pd.to_datetime(dates_test_seq[-1]).date()
# Advance to the next business day rather than the next calendar day, so a
# Friday does not yield a Saturday "trading day". Exchange holidays are not
# accounted for.
next_trading_day = (pd.Timestamp(latest_date) + pd.offsets.BDay(1)).date()

print(f"{'='*70}")
print(f"NEXT-DAY PREDICTION FOR NIFTY 50")
print(f"{'='*70}")
print(f"Latest data date:  {latest_date}")
print(f"Prediction for:    {next_trading_day}")
print(f"")
print(f"Probability (UP):  {latest_pred_avg:.4f} ¬± {latest_pred_std:.4f}")
print(f"Threshold:         {BEST_THRESHOLD:.3f}")
print(f"")
print(f"üéØ PREDICTION:      {latest_decision}")
print(f"")
print(f"Confidence:")
# Confidence is graded by how far the averaged probability sits from the
# decision threshold.
threshold_margin = abs(latest_pred_avg - BEST_THRESHOLD)
if threshold_margin > 0.15:
    print(f"  ‚úÖ HIGH (strong signal)")
elif threshold_margin > 0.08:
    print(f"  ‚ö†Ô∏è  MODERATE")
else:
    print(f"  ‚ö†Ô∏è  LOW (near threshold)")
print(f"{'='*70}\n")

# ============================================================
# 8. SAVE PRODUCTION ARTIFACTS
# ============================================================

# Persist the fitted preprocessing objects next to the saved models.
for fitted_object, pickle_path in (
    (scaler, "artifacts_final/production_scaler.pkl"),
    (selector, "artifacts_final/production_selector.pkl"),
):
    joblib.dump(fitted_object, pickle_path)

# Hold-out metrics, cast to plain floats for JSON.
test_performance_section = {
    "mcc": float(mcc_test),
    "roc_auc": float(roc_test),
    "accuracy": float(acc_test),
    "precision": float(prec_test),
    "recall": float(rec_test),
    "f1": float(f1_test)
}

# The most recent ensemble call.
next_day_section = {
    "date": str(next_trading_day),
    "probability_up": float(latest_pred_avg),
    "std_dev": float(latest_pred_std),
    "decision": latest_decision
}

# Metadata describing how the production ensemble was built and how it scored.
production_metadata = {
    "trained_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "best_fold": int(best_fold['fold']),
    "best_fold_mcc": float(best_fold['mcc']),
    "best_fold_roc": float(best_fold['roc_auc']),
    "threshold": float(BEST_THRESHOLD),
    "sequence_length": int(SEQ_LEN),
    "top_features": int(TOP_K_FEATURES),
    "ensemble_size": int(N_ENSEMBLE),
    "selected_features": selected_features,
    "test_performance": test_performance_section,
    "next_day_prediction": next_day_section
}

with open("artifacts_final/production_metadata.json", "w") as f:
    json.dump(production_metadata, f, indent=2)

for status_line in (
    "üíæ Production artifacts saved:",
    "   ‚úì 3 trained models (production_model_*.keras)",
    "   ‚úì Scaler (production_scaler.pkl)",
    "   ‚úì Feature selector (production_selector.pkl)",
    "   ‚úì Metadata (production_metadata.json)\n",
):
    print(status_line)

# ============================================================
# 9. VISUALIZATION
# ============================================================

# Four-panel diagnostic figure for the production ensemble, saved as a PNG.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Test predictions over time
ax1 = axes[0, 0]
test_dates_plot = [pd.to_datetime(d) for d in dates_test_seq]
ax1.plot(test_dates_plot, test_pred_avg, label='Predicted Prob (UP)', color='blue', linewidth=1.5)
ax1.axhline(y=BEST_THRESHOLD, color='red', linestyle='--', linewidth=2, label=f'Threshold ({BEST_THRESHOLD:.3f})')
ax1.fill_between(test_dates_plot, 0, BEST_THRESHOLD, alpha=0.1, color='red', label='DOWN zone')
ax1.fill_between(test_dates_plot, BEST_THRESHOLD, 1, alpha=0.1, color='green', label='UP zone')
# Actual 0/1 labels drawn at y=0 and y=1, colored by class.
ax1.scatter(test_dates_plot, y_test_seq, c=y_test_seq, cmap='RdYlGn', alpha=0.3, s=10, label='Actual')
ax1.set_xlabel('Date', fontweight='bold')
ax1.set_ylabel('Probability', fontweight='bold')
ax1.set_title('Production Model: Test Set Predictions', fontweight='bold')
ax1.legend()
ax1.grid(alpha=0.3)

# Confusion matrix
from sklearn.metrics import confusion_matrix
ax2 = axes[0, 1]
cm = confusion_matrix(y_test_seq, y_pred_test)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax2,
            xticklabels=['DOWN', 'UP'], yticklabels=['DOWN', 'UP'])
ax2.set_xlabel('Predicted', fontweight='bold')
ax2.set_ylabel('Actual', fontweight='bold')
ax2.set_title('Confusion Matrix (Test Set)', fontweight='bold')

# Ensemble agreement: spread across models versus the averaged probability.
ax3 = axes[1, 0]
ensemble_std = np.std(ensemble_preds_test, axis=0)
ax3.scatter(test_pred_avg, ensemble_std, c=y_test_seq, cmap='RdYlGn', alpha=0.5)
ax3.axvline(x=BEST_THRESHOLD, color='red', linestyle='--', linewidth=2)
ax3.set_xlabel('Ensemble Mean Probability', fontweight='bold')
ax3.set_ylabel('Ensemble Std Dev', fontweight='bold')
ax3.set_title('Ensemble Agreement Analysis', fontweight='bold')
ax3.grid(alpha=0.3)

# Feature importance (top 15)
ax4 = axes[1, 1]
feature_scores = selector.scores_[selected_mask]
# Indices of the 15 highest scores, in ascending score order.
top_15_idx = np.argsort(feature_scores)[-15:]
# Names and scores must be taken with the SAME index order; listing names in
# column order (as before) paired bars with the wrong feature labels.
top_15_features = [selected_features[i] for i in top_15_idx]
top_15_scores = feature_scores[top_15_idx]

# barh draws from the bottom up, so the highest-scoring feature ends on top.
ax4.barh(range(len(top_15_features)), top_15_scores, color='steelblue', edgecolor='black')
ax4.set_yticks(range(len(top_15_features)))
ax4.set_yticklabels(top_15_features, fontsize=8)
ax4.set_xlabel('Mutual Information Score', fontweight='bold')
ax4.set_title('Top 15 Feature Importance', fontweight='bold')
ax4.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig("artifacts_final/production_model_analysis.png", dpi=150, bbox_inches='tight')
plt.close()

print("üìä Visualization saved: production_model_analysis.png\n")

print(f"{'='*70}")
print("‚úÖ PRODUCTION MODEL READY FOR DEPLOYMENT")
print(f"{'='*70}")
print(f"Next-day prediction: {latest_decision}")
print(f"Use saved models for live predictions!")
print(f"{'='*70}")

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, matthews_corrcoef, accuracy_score

# Output folder for the publication figures.
os.makedirs("research_figures", exist_ok=True)

# Global plotting defaults: serif fonts, larger labels, 300 dpi figures.
sns.set_style("whitegrid")
plt.rcParams.update({
    'font.family': 'serif',
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.dpi': 300,
})

banner = "=" * 70
print(banner)
print("GENERATING PUBLICATION-QUALITY FIGURES")
print(banner + "\n")

# Reload the artifacts written by the earlier cells.
fold_df = pd.read_json("artifacts_final/fold_results_v10.json")
oof_prob = np.load("artifacts_final/oof_predictions_raw_v10.npy")
oof_true = np.load("artifacts_final/oof_true_labels_v10.npy")

with open("artifacts_final/production_metadata.json", "r") as f:
    prod_metadata = json.loads(f.read())

print(f"‚úÖ Loaded data: {len(fold_df)} folds, {len(oof_prob)} samples\n")

# Sweep 31 evenly spaced thresholds in [0.35, 0.65] and keep the one with the
# highest MCC on the out-of-fold predictions.
thresholds_test = np.linspace(0.35, 0.65, 31)
mccs_test = []
for thr in thresholds_test:
    y_pred = (oof_prob >= thr).astype(int)
    mcc = matthews_corrcoef(oof_true, y_pred)
    mccs_test += [mcc]

best_idx = np.argmax(mccs_test)
best_thr, best_mcc = thresholds_test[best_idx], mccs_test[best_idx]
y_pred_final = (oof_prob >= best_thr).astype(int)

# Figure 1: MCC of every validation fold as a bar chart with the 0.15
# tradeable cut-off drawn as a dashed line.
fig, ax = plt.subplots(figsize=(10, 6))
folds = fold_df['fold'].values
mccs = fold_df['mcc'].values
# NOTE(review): this is a plain substring test on 'TRADE'; any non-tradeable
# status text that also contains 'TRADE' would be colored green - verify
# against the status strings actually written to the fold results.
colors = ['#2ecc71' if 'TRADE' in str(s) else '#95a5a6' for s in fold_df['status'].values]
bars = ax.bar(folds, mccs, color=colors, alpha=0.85, edgecolor='black', linewidth=2, width=0.6)
ax.axhline(y=0.15, color='#e74c3c', linestyle='--', linewidth=2.5, label='Tradeable Threshold (0.15)', zorder=5)
ax.set_xlabel('Validation Fold', fontweight='bold', fontsize=14)
ax.set_ylabel('Matthews Correlation Coefficient (MCC)', fontweight='bold', fontsize=14)
ax.set_title('Model Performance Across Temporal Folds', fontweight='bold', fontsize=16, pad=20)
ax.legend(loc='upper left', fontsize=12, framealpha=0.95)

# MCC lies in [-1, 1]. A fixed lower limit of 0 hid folds with negative MCC,
# and a non-positive maximum produced a degenerate axis. Extend the range
# downward when needed and always keep the 0.15 cut-off line in view; when all
# folds are positive and the best is at least 0.15 the limits are unchanged.
y_upper = max(float(np.max(mccs)), 0.15) * 1.15
y_lower = min(0.0, float(np.min(mccs)) * 1.15)
ax.set_ylim(y_lower, y_upper)
if y_lower < 0:
    # Zero baseline so positive and negative bars are easy to tell apart.
    ax.axhline(y=0, color='black', linewidth=1)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Value labels: above positive bars, below negative ones.
for fold, mcc, bar in zip(folds, mccs, bars):
    height = bar.get_height()
    if height >= 0:
        ax.text(fold, height + 0.01, f'{mcc:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=11)
    else:
        ax.text(fold, height - 0.01, f'{mcc:.3f}', ha='center', va='top', fontweight='bold', fontsize=11)
plt.tight_layout()
plt.savefig("research_figures/fig1_fold_performance.png", dpi=300, bbox_inches='tight', facecolor='white')
print("‚úÖ Figure 1: Fold-wise MCC Performance")
plt.close()

# Figure 2: ROC-AUC of each validation fold as a line, with the 0.60 target
# and the 0.5 random-classifier level as reference lines.
fig, ax = plt.subplots(figsize=(10, 6))
rocs = fold_df['roc_auc'].values
roc_peak = max(rocs)

ax.plot(
    folds, rocs,
    marker='o', linewidth=3, markersize=12, color='#3498db',
    label='Model ROC-AUC', markeredgecolor='black', markeredgewidth=2,
)
ax.axhline(y=0.60, color='#2ecc71', linestyle='--', linewidth=2.5,
           label='Target Performance (0.60)', zorder=5)
ax.axhline(y=0.5, color='#e74c3c', linestyle=':', linewidth=2,
           label='Random Classifier', alpha=0.7)
# Shade everything above the target level.
ax.fill_between(folds, 0.6, roc_peak*1.1, alpha=0.15, color='#2ecc71',
                label='Strong Performance Zone')

axis_label_style = dict(fontweight='bold', fontsize=14)
ax.set_xlabel('Validation Fold', **axis_label_style)
ax.set_ylabel('ROC-AUC Score', **axis_label_style)
ax.set_title('Discrimination Capability Across Temporal Periods', fontweight='bold', fontsize=16, pad=20)
ax.legend(loc='lower right', fontsize=11, framealpha=0.95)
ax.set_ylim(0.45, roc_peak * 1.08)
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)

# Print each fold's value just above its marker.
for fold, roc in zip(folds, rocs):
    ax.text(fold, roc + 0.01, f'{roc:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=10)

plt.tight_layout()
plt.savefig("research_figures/fig2_roc_progression.png", dpi=300, bbox_inches='tight', facecolor='white')
print("‚úÖ Figure 2: ROC-AUC Across Folds")
plt.close()

# Figure 3: ROC curve of the out-of-fold probabilities against the diagonal
# of a random classifier.
fig, ax = plt.subplots(figsize=(10, 8))
fpr, tpr, _ = roc_curve(oof_true, oof_prob)
roc_auc_val = auc(fpr, tpr)

model_curve_label = f'LSTM-CNN Model (AUC = {roc_auc_val:.4f})'
ax.plot(fpr, tpr, color='#3498db', linewidth=4, label=model_curve_label, zorder=10)
ax.plot([0, 1], [0, 1], color='#e74c3c', linewidth=2.5, linestyle='--',
        label='Random Classifier (AUC = 0.50)')
# Shade the area under the model curve.
ax.fill_between(fpr, tpr, alpha=0.25, color='#3498db')

axis_label_style = dict(fontweight='bold', fontsize=14)
ax.set_xlabel('False Positive Rate', **axis_label_style)
ax.set_ylabel('True Positive Rate', **axis_label_style)
ax.set_title('Receiver Operating Characteristic (ROC) Curve', fontweight='bold', fontsize=16, pad=20)
ax.legend(loc="lower right", fontsize=13, framealpha=0.95)
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)

# Small margin around the unit square so the curve does not touch the frame.
unit_range_with_margin = [-0.02, 1.02]
ax.set_xlim(unit_range_with_margin)
ax.set_ylim(unit_range_with_margin)

plt.tight_layout()
plt.savefig("research_figures/fig3_roc_curve.png", dpi=300, bbox_inches='tight', facecolor='white')
print("‚úÖ Figure 3: ROC Curve")
plt.close()

# Figure 4: precision-recall curve of the out-of-fold probabilities, with the
# positive-class rate drawn as the no-skill baseline.
fig, ax = plt.subplots(figsize=(10, 8))
precision, recall, _ = precision_recall_curve(oof_true, oof_prob)
pr_auc = auc(recall, precision)

model_curve_label = f'LSTM-CNN Model (AUC = {pr_auc:.4f})'
ax.plot(recall, precision, color='#2ecc71', linewidth=4, label=model_curve_label, zorder=10)

# Fraction of positive labels.
baseline = np.mean(oof_true)
ax.axhline(y=baseline, color='#e74c3c', linewidth=2.5, linestyle='--',
           label=f'No-Skill Baseline ({baseline:.3f})')
ax.fill_between(recall, precision, alpha=0.25, color='#2ecc71')

axis_label_style = dict(fontweight='bold', fontsize=14)
ax.set_xlabel('Recall (Sensitivity)', **axis_label_style)
ax.set_ylabel('Precision', **axis_label_style)
ax.set_title('Precision-Recall Curve', fontweight='bold', fontsize=16, pad=20)
ax.legend(loc="upper right", fontsize=13, framealpha=0.95)
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)
ax.set_xlim([-0.02, 1.02])
ax.set_ylim([0, 1.02])

plt.tight_layout()
plt.savefig("research_figures/fig4_precision_recall.png", dpi=300, bbox_inches='tight', facecolor='white')
print("‚úÖ Figure 4: Precision-Recall Curve")
plt.close()

# Figure 5: confusion matrix of the out-of-fold predictions at the swept
# threshold, with accuracy and MCC in a caption box underneath.
fig, ax = plt.subplots(figsize=(10, 6))
cm = confusion_matrix(oof_true, y_pred_final)

direction_names = ['DOWN', 'UP']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    cbar=True,
    ax=ax,
    xticklabels=direction_names,
    yticklabels=direction_names,
    annot_kws={'fontsize': 20, 'fontweight': 'bold'},
    linewidths=2,
    linecolor='black',
    cbar_kws={'label': 'Count'},
)
ax.set_xlabel('Predicted Direction', fontweight='bold', fontsize=14)
ax.set_ylabel('Actual Direction', fontweight='bold', fontsize=14)
ax.set_title(f'Confusion Matrix (Optimal Threshold = {best_thr:.3f})', fontweight='bold', fontsize=16, pad=20)

# Unpack the 2x2 matrix row by row: actual DOWN first, then actual UP.
tn, fp, fn, tp = cm.ravel()
correct_calls = tn + tp
all_calls = tn + fp + fn + tp
accuracy = correct_calls / all_calls

caption = f'Overall Accuracy: {accuracy:.1%} | MCC: {best_mcc:.4f}'
caption_box = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
ax.text(0.5, -0.15, caption, ha='center', transform=ax.transAxes,
        fontsize=12, fontweight='bold', bbox=caption_box)

plt.tight_layout()
plt.savefig("research_figures/fig5_confusion_matrix.png", dpi=300, bbox_inches='tight', facecolor='white')
print("‚úÖ Figure 5: Confusion Matrix")
plt.close()

# Figure 6: horizontal bar chart of up to 15 selected features, colored by a
# keyword-based category.
fig, ax = plt.subplots(figsize=(10, 6))
if 'selected_features' in prod_metadata and isinstance(prod_metadata['selected_features'], list):
    top_features = prod_metadata['selected_features'][:15]
else:
    # Fallback list used when the metadata carries no feature names.
    top_features = ['Return_1d', 'RealVol_10d', 'RealVol_5d', 'ATR_14', 'Vol_Adj_Return', 'SP500_Corr_60d', 'VIX_Ratio', 'Currency_Vol', 'VWAP_Distance', 'Momentum_5d', 'Oil_Nifty_Corr', 'Trend_Regime', 'Volume_Ratio', 'PVT_MA', 'Corr_Trend']
# Bar lengths are list positions (first item longest), not measured
# importances. The count follows the number of features actually available:
# a fixed 15 made barh fail whenever fewer than 15 names were present.
# NOTE(review): the production cell stores 'selected_features' in column
# order, so this ranking may not reflect importance - confirm before
# publishing.
importance_scores = list(range(len(top_features), 0, -1))
colors_feat = []
for feature in top_features:
    # External-market keywords take precedence over volatility keywords.
    if any(x in feature for x in ['SP500', 'VIX', 'Currency', 'Oil', 'HSI', 'Yield']):
        colors_feat.append('#e74c3c')
    elif any(x in feature for x in ['Vol', 'ATR']):
        colors_feat.append('#f39c12')
    else:
        colors_feat.append('#3498db')
y_pos = np.arange(len(top_features))
bars = ax.barh(y_pos, importance_scores, color=colors_feat, alpha=0.85, edgecolor='black', linewidth=1.5)
ax.set_yticks(y_pos)
ax.set_yticklabels(top_features, fontsize=11)
# Put the first (longest) bar at the top.
ax.invert_yaxis()
ax.set_xlabel('Feature Importance Rank', fontweight='bold', fontsize=14)
ax.set_title('Top 15 Most Important Features', fontweight='bold', fontsize=16, pad=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
from matplotlib.patches import Patch
# Manual legend: one patch per color category.
legend_elements = [Patch(facecolor='#e74c3c', alpha=0.85, label='External Market'), Patch(facecolor='#f39c12', alpha=0.85, label='Volatility'), Patch(facecolor='#3498db', alpha=0.85, label='Technical')]
ax.legend(handles=legend_elements, loc='lower right', fontsize=11, framealpha=0.95)
plt.tight_layout()
plt.savefig("research_figures/fig6_feature_importance.png", dpi=300, bbox_inches='tight', facecolor='white')
print("‚úÖ Figure 6: Feature Importance")
plt.close()

# Figure 7: grouped bars comparing validation metrics (derived from `cm`,
# `best_mcc`, `roc_auc_val` and `accuracy` computed earlier) against the
# stored production test metrics.
fig, ax = plt.subplots(figsize=(12, 7))
metrics = ['MCC', 'ROC-AUC', 'Accuracy', 'Precision', 'Recall', 'F1-Score']

# Precision / recall / F1 from the confusion-matrix cells. Each ratio falls
# back to 0.0 when its denominator is zero, instead of producing NaN bars.
_tp, _fp, _fn = cm[1, 1], cm[0, 1], cm[1, 0]
_val_precision = _tp / (_fp + _tp) if (_fp + _tp) else 0.0
_val_recall = _tp / (_fn + _tp) if (_fn + _tp) else 0.0
_pr_sum = _val_precision + _val_recall
_val_f1 = 2 * _val_precision * _val_recall / _pr_sum if _pr_sum else 0.0
validation_scores = [
    best_mcc, roc_auc_val, accuracy, _val_precision, _val_recall, _val_f1,
]

# Production metrics are read in the same order as `metrics`.
_test_perf = prod_metadata['test_performance']
production_scores = [
    _test_perf[key]
    for key in ('mcc', 'roc_auc', 'accuracy', 'precision', 'recall', 'f1')
]

x = np.arange(len(metrics))
width = 0.35
# NOTE(review): the sample counts in the two legend labels are hard-coded;
# confirm they match the data actually evaluated.
bars1 = ax.bar(
    x - width/2, validation_scores, width,
    label='Validation (575 samples)',
    color='#3498db', alpha=0.85, edgecolor='black', linewidth=2,
)
bars2 = ax.bar(
    x + width/2, production_scores, width,
    label='Production Test (184 samples)',
    color='#2ecc71', alpha=0.85, edgecolor='black', linewidth=2,
)
ax.set_xlabel('Performance Metrics', fontweight='bold', fontsize=14)
ax.set_ylabel('Score', fontweight='bold', fontsize=14)
ax.set_title('Comprehensive Performance Comparison', fontweight='bold', fontsize=16, pad=20)
ax.set_xticks(x)
ax.set_xticklabels(metrics, fontsize=11)
ax.legend(fontsize=12, loc='upper left', framealpha=0.95)
ax.set_ylim(0, 1.0)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Print each bar's value just above it.
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width()/2., height + 0.02, f'{height:.3f}',
            ha='center', va='bottom', fontweight='bold', fontsize=9,
        )
plt.tight_layout()
plt.savefig(
    "research_figures/fig7_performance_comparison.png",
    dpi=300,
    bbox_inches='tight',
    facecolor='white',
)
print("‚úÖ Figure 7: Performance Comparison")
plt.close()

# Figure 8: two-bar chart of tradeable vs. non-tradeable folds.
# NOTE(review): the counts and percentages are literals in this section,
# not derived from the fold results; confirm they match the latest run.
fig, ax = plt.subplots(figsize=(10, 6))
tradeable = ['Tradeable\n(3 folds)', 'Not Tradeable\n(2 folds)']
counts = [3, 2]
percentages = [60, 40]
colors_trade = ['#2ecc71', '#95a5a6']
bars = ax.bar(
    tradeable,
    counts,
    color=colors_trade,
    alpha=0.85,
    edgecolor='black',
    linewidth=2.5,
    width=0.5,
)
ax.set_ylabel('Number of Folds', fontweight='bold', fontsize=14)
ax.set_title(
    'Model Consistency: Tradeable vs Non-Tradeable Periods',
    fontweight='bold',
    fontsize=16,
    pad=20,
)
ax.set_ylim(0, 5.5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Boxed "N folds (P%)" label just above each bar.
for bar, count, pct in zip(bars, counts, percentages):
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        height + 0.1,
        f'{count} folds\n({pct}%)',
        ha='center',
        va='bottom',
        fontweight='bold',
        fontsize=13,
        bbox={
            'boxstyle': 'round',
            'facecolor': 'white',
            'alpha': 0.8,
            'edgecolor': 'black',
            'linewidth': 2,
        },
    )

# Caption below the axes stating the tradeability criteria.
ax.text(
    0.5,
    -0.15,
    'Tradeable Criteria: MCC > 0.15 AND ROC-AUC > 0.60',
    ha='center',
    transform=ax.transAxes,
    fontsize=11,
    style='italic',
)
plt.tight_layout()
plt.savefig(
    "research_figures/fig8_tradeability.png",
    dpi=300,
    bbox_inches='tight',
    facecolor='white',
)
print("‚úÖ Figure 8: Tradeability Analysis")
plt.close()

# Figure 9: per-fold MCC and ROC-AUC from `fold_df`, shown as grouped bars.
fig, ax = plt.subplots(figsize=(12, 6))
validation_mcc = fold_df['mcc'].values
validation_roc = fold_df['roc_auc'].values
# Labels follow the number of rows in fold_df rather than assuming five
# folds, so the tick positions always match the bar heights.
fold_labels = [f'Fold {i}' for i in range(1, len(validation_mcc) + 1)]
x = np.arange(len(fold_labels))
width = 0.35
bars1 = ax.bar(
    x - width/2, validation_mcc, width,
    label='MCC', color='#9b59b6', alpha=0.85, edgecolor='black', linewidth=2,
)
bars2 = ax.bar(
    x + width/2, validation_roc, width,
    label='ROC-AUC', color='#1abc9c', alpha=0.85, edgecolor='black', linewidth=2,
)
ax.set_xlabel('Temporal Validation Fold', fontweight='bold', fontsize=14)
ax.set_ylabel('Performance Score', fontweight='bold', fontsize=14)
ax.set_title('Walk-Forward Cross-Validation Results', fontweight='bold', fontsize=16, pad=20)
ax.set_xticks(x)
ax.set_xticklabels(fold_labels)
ax.legend(fontsize=13, loc='upper left', framealpha=0.95)
# Dashed reference lines at 0.60 and 0.15, the same values as the ROC-AUC
# and MCC cut-offs quoted in the Figure 8 caption. They are drawn after the
# legend call and carry no label, so they do not appear in the legend.
ax.axhline(y=0.60, color='#e74c3c', linestyle='--', linewidth=2, alpha=0.7)
ax.axhline(y=0.15, color='#f39c12', linestyle='--', linewidth=2, alpha=0.7)
ax.set_ylim(0, 0.8)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig(
    "research_figures/fig9_walkforward_validation.png",
    dpi=300,
    bbox_inches='tight',
    facecolor='white',
)
print("‚úÖ Figure 9: Walk-Forward Validation")
plt.close()

# Figure 10: schematic of the pipeline stages, drawn as equal-length
# horizontal bars with a short description printed inside each bar.
fig, ax = plt.subplots(figsize=(10, 6))
components = [
    'Data\nPreprocessing',
    'Feature\nEngineering',
    'Model\nArchitecture',
    'Walk-Forward\nValidation',
    'Ensemble\nAggregation',
    'Production\nDeployment',
]
y_pos = np.arange(len(components))
colors_pipeline = ['#3498db', '#9b59b6', '#e74c3c', '#f39c12', '#1abc9c', '#2ecc71']
# Every bar has length 1; only the colour and the label distinguish stages.
bars = ax.barh(
    y_pos,
    [1]*len(components),
    color=colors_pipeline,
    alpha=0.85,
    edgecolor='black',
    linewidth=2,
)
ax.set_yticks(y_pos)
ax.set_yticklabels(components, fontsize=12, fontweight='bold')
ax.set_xlabel('Pipeline Stage', fontweight='bold', fontsize=14)
ax.set_title('End-to-End ML Pipeline Architecture', fontweight='bold', fontsize=16, pad=20)
ax.set_xlim(0, 1.2)
ax.set_xticks([])  # the x axis carries no quantity, so hide its ticks
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
stage_labels = [
    'Raw OHLCV + External Markets',
    '57 Technical + Cross-Asset Features',
    'CNN-BiLSTM (70K params)',
    '5 Temporal Folds',
    '3 Models Averaged',
    'Real-Time Prediction',
]
# One description per bar, vertically centred on the bar at x = 0.6.
for i, (bar, label) in enumerate(zip(bars, stage_labels)):
    ax.text(
        0.6,
        bar.get_y() + bar.get_height()/2,
        label,
        ha='center',
        va='center',
        fontsize=10,
        color='white',
        fontweight='bold',
    )
plt.tight_layout()
plt.savefig(
    "research_figures/fig10_pipeline.png",
    dpi=300,
    bbox_inches='tight',
    facecolor='white',
)
print("‚úÖ Figure 10: ML Pipeline")
plt.close()
# Bundle everything in research_figures/ into a timestamped zip archive and
# print download instructions.
import os  # imported here so this section does not rely on an earlier cell
import shutil
from datetime import datetime

print("\n" + "="*70)
print("CREATING ZIP ARCHIVE")
print("="*70 + "\n")

zip_name = f"nifty_lstm_research_figures_{datetime.now().strftime('%Y%m%d_%H%M')}"
# make_archive returns the path of the file it wrote; use that for the size
# lookup instead of rebuilding the file name by hand.
archive_path = shutil.make_archive(zip_name, 'zip', 'research_figures')

zip_size_mb = os.path.getsize(archive_path) / (1024 * 1024)

print(f"‚úÖ Created: {zip_name}.zip")
print(f"üì¶ Size: {zip_size_mb:.2f} MB")
print("üìÇ Contains: 10 publication-quality figures (300 DPI)")
print("\nüì• Download from Colab:")
print("   1. Click folder icon üìÅ on left sidebar")
print(f"   2. Find: {zip_name}.zip")
print("   3. Right-click ‚Üí Download")
print("\n" + "="*70)
print("üéØ ALL DONE! Ready for thesis & presentation!")
print("="*70)


# Final summary: list the ten figure files saved under research_figures/.
_banner = "=" * 70
_figure_files = (
    "fig1_fold_performance.png",
    "fig2_roc_progression.png",
    "fig3_roc_curve.png",
    "fig4_precision_recall.png",
    "fig5_confusion_matrix.png",
    "fig6_feature_importance.png",
    "fig7_performance_comparison.png",
    "fig8_tradeability.png",
    "fig9_walkforward_validation.png",
    "fig10_pipeline.png",
)

print("\n" + _banner)
print("‚úÖ COMPLETE: 10 Publication-Quality Figures Generated")
print(_banner)
print("\nSaved in: research_figures/")
for _figure_file in _figure_files:
    # Two-space indent matches the layout of the original listing.
    print("  " + _figure_file)
print("\nüéØ Ready for research paper & presentation!")
print(_banner)

In [None]:
# Report which of the expected production artifacts are present on disk.
import os
print("Checking Oct 13 Cell 9 artifacts...")
# Three saved Keras models plus the production metadata JSON.
oct13_files = [
    'artifacts_final/production_model_0.keras',
    'artifacts_final/production_model_1.keras',
    'artifacts_final/production_model_2.keras',
    'artifacts_final/production_metadata.json'
]
for f in oct13_files:
    if os.path.exists(f):
        print(f"‚úÖ {f}")
    else:
        # NOTE(review): os.path.exists only shows the path is absent; the
        # "OVERWRITTEN" wording is the author's interpretation of a missing file.
        print(f"‚ùå {f} - OVERWRITTEN")