In [1]:
!pip install prophet

Collecting prophet
  Downloading prophet-1.1.6-py3-none-win_amd64.whl.metadata (3.6 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Downloading cmdstanpy-1.2.5-py3-none-any.whl.metadata (4.0 kB)
Collecting holidays<1,>=0.25 (from prophet)
  Downloading holidays-0.71-py3-none-any.whl.metadata (34 kB)
Collecting tqdm>=4.36.1 (from prophet)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting importlib-resources (from prophet)
  Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting stanio<2.0.0,>=0.4.0 (from cmdstanpy>=1.0.4->prophet)
  Downloading stanio-0.5.1-py3-none-any.whl.metadata (1.6 kB)
Downloading prophet-1.1.6-py3-none-win_amd64.whl (13.3 MB)
   ---------------------------------------- 0.0/13.3 MB ? eta -:--:--
   -------- ------------------------------- 2.9/13.3 MB 15.2 MB/s eta 0:00:01
   ------------------ --------------------- 6.0/13.3 MB 15.4 MB/s eta 0:00:01
   ----------------------------- ---------- 9.7/13.3 MB 15.5 MB/s 

In [1]:
!pip install pandas_market_calendars

Collecting pandas_market_calendars
  Downloading pandas_market_calendars-5.1.0-py3-none-any.whl.metadata (9.6 kB)
Collecting exchange-calendars>=3.3 (from pandas_market_calendars)
  Downloading exchange_calendars-4.10-py3-none-any.whl.metadata (37 kB)
Collecting pyluach (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting toolz (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading toolz-1.0.0-py3-none-any.whl.metadata (5.1 kB)
Collecting korean_lunar_calendar (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Downloading pandas_market_calendars-5.1.0-py3-none-any.whl (123 kB)
Downloading exchange_calendars-4.10-py3-none-any.whl (198 kB)
Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl (9.0 kB)
Downloading pyluach-2.2.0-py3-none-any.whl (25 kB)
Downloading toolz-1.0.0-py3-none-any.whl (56 kB)
Installing col

In [23]:
import pandas as pd
import numpy as np
import warnings
import logging
import tensorflow as tf # Use Tensorflow Keras
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm # Or from tqdm import tqdm for standard scripts

# Make sure to install necessary libraries:
# pip install tensorflow pandas numpy scikit-learn tqdm pandas_market_calendars
try:
    import pandas_market_calendars as mcal
except ImportError:
    print("Error: Please install pandas_market_calendars")
    print("Example: pip install pandas_market_calendars")
    exit()

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
tf.get_logger().setLevel(logging.ERROR) # Suppress TensorFlow logs

# --- Configuration ---
N_STEPS_IN = 10       # Lookback window size (business days) - Hyperparameter
N_FORECAST_BDAYS = 15 # Number of business days to predict
EPOCHS_EXOG = 50      # Epochs for Stage 1 LSTM (adjust based on convergence)
BATCH_SIZE_EXOG = 32
EPOCHS_TARGET = 50    # Epochs for Stage 2 LSTM (adjust based on convergence)
BATCH_SIZE_TARGET = 32
LSTM_UNITS = 50       # Number of units in LSTM layers - Hyperparameter
DROPOUT_RATE = 0.2    # Dropout rate for regularization

# --- 1. Load Data ---
try:
    df_orig = pd.read_csv("UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv")
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv not found.")
    exit()
except Exception as e:
    print(f"An error occurred during loading: {e}")
    exit()

# --- 2. Preprocessing ---
df = df_orig.copy()
try:
    df['All_Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y')
    df.sort_values('All_Date', inplace=True)

    # Define cleaning functions (same as before)
    def parse_volume(value):
        if isinstance(value, (int, float)): return value
        if isinstance(value, str):
            value = value.replace(',', '').strip().upper()
            if 'K' in value: return float(value.replace('K', '')) * 1000
            if 'M' in value: return float(value.replace('M', '')) * 1000000
        try: return float(value)
        except (ValueError, TypeError): return np.nan
    def parse_percentage(value):
        if isinstance(value, (int, float)): return value
        if isinstance(value, str): value = value.replace('%', '').strip()
        try: return float(value)
        except (ValueError, TypeError): return np.nan

    cols_to_clean = {
        'ETF_Vol.': parse_volume, 'ETF_Change %': parse_percentage, 'USD_Change %': parse_percentage
    }
    for col, func in cols_to_clean.items():
        if col in df.columns: df[col] = df[col].apply(func)

    for col in df.columns:
        if col != 'All_Date': df[col] = pd.to_numeric(df[col], errors='coerce')

    # Set index and resample to Business Days *before* scaling/sequencing
    df_proc = df.set_index('All_Date').copy()
    df_proc = df_proc.resample('B').last() # Use business day frequency
    print(f"Resampled data to business days. Shape before NaN handling: {df_proc.shape}")

    # Handle NaNs after resampling
    cols_to_fill = df_proc.columns
    df_proc[cols_to_fill] = df_proc[cols_to_fill].ffill()
    if df_proc[cols_to_fill].isnull().sum().sum() > 0:
        df_proc[cols_to_fill] = df_proc[cols_to_fill].bfill()

    if df_proc.isnull().sum().sum() > 0:
        print("Warning: NaNs remain after ffill/bfill. Dropping rows...")
        df_proc.dropna(inplace=True)
        if df_proc.isnull().sum().sum() > 0: raise ValueError("NaNs persist after cleaning")

    print(f"Shape after NaN handling: {df_proc.shape}")
    if len(df_proc) < N_STEPS_IN * 2: # Basic check for enough data
         raise ValueError(f"Insufficient data ({len(df_proc)} days) for lookback window ({N_STEPS_IN}) and training.")

    # Define target and exogenous columns
    target_col = 'MF_NAV'
    all_exog_cols = [col for col in df_proc.columns if col != target_col]
    all_cols_ordered = [target_col] + all_exog_cols # Consistent order
    df_proc = df_proc[all_cols_ordered] # Reorder dataframe columns

    print("Preprocessing complete.")

except Exception as e:
    print(f"An error occurred during preprocessing: {e}")
    exit()


# --- 3. Scaling ---
# Scale ALL columns together initially, then separate scalers if needed
# Or scale column by column
scalers = {}
scaled_data = pd.DataFrame(index=df_proc.index)

print("Scaling data...")
for col in df_proc.columns:
    scalers[col] = MinMaxScaler(feature_range=(0, 1))
    scaled_col = scalers[col].fit_transform(df_proc[[col]])
    scaled_data[col] = scaled_col.flatten()

print("Data scaling complete.")
scaled_exog_data = scaled_data[all_exog_cols]
scaled_target_data = scaled_data[[target_col]] # Keep as DataFrame

# --- 4. Data Structuring for LSTM ---
def create_sequences(input_data, n_steps_in, n_steps_out=1):
    """Creates sequences for LSTM."""
    X, y = [], []
    for i in range(len(input_data) - n_steps_in - n_steps_out + 1):
        seq_in = input_data[i : i + n_steps_in]
        seq_out = input_data[i + n_steps_in : i + n_steps_in + n_steps_out]
        X.append(seq_in)
        y.append(seq_out)
    return np.array(X), np.array(y)

# --- 5. Stage 1: Predict Future Exogenous Variables ---
print("\n--- Stage 1: Predicting Future Exogenous Variables (Multivariate LSTM) ---")
n_exog_features = len(all_exog_cols)

# Structure data for Exogenous prediction LSTM
# Input: window of past exog variables; Output: next step's exog variables
X_exog, y_exog = create_sequences(scaled_exog_data.values, N_STEPS_IN, n_steps_out=1)
y_exog = y_exog.reshape(y_exog.shape[0], n_exog_features) # Reshape y for Dense layer output

print(f"Exogenous data shapes: X={X_exog.shape}, y={y_exog.shape}")

# Build Stage 1 LSTM Model
model_exog = Sequential([
    LSTM(LSTM_UNITS, activation='relu', input_shape=(N_STEPS_IN, n_exog_features), return_sequences=True),
    Dropout(DROPOUT_RATE),
    LSTM(LSTM_UNITS, activation='relu', return_sequences=False),
    Dropout(DROPOUT_RATE),
    Dense(n_exog_features) # Output layer for all exogenous features
])
model_exog.compile(optimizer='adam', loss='mse')
# model_exog.summary() # Optional: print model structure

# Train Stage 1 Model
print(f"Training Stage 1 LSTM for {EPOCHS_EXOG} epochs...")
# Consider adding validation_split=0.1 or 0.2 if data allows
# Or use EarlyStopping callback
history_exog = model_exog.fit(X_exog, y_exog, epochs=EPOCHS_EXOG, batch_size=BATCH_SIZE_EXOG, verbose=0) # Verbose=0 for less output
print("Stage 1 Training complete.")

# --- Iterative Forecasting for Exogenous Variables ---
print(f"Generating {N_FORECAST_BDAYS} steps of future exogenous predictions...")
last_sequence_exog = scaled_exog_data.values[-N_STEPS_IN:] # Last known historical sequence (scaled)
current_batch_exog = last_sequence_exog.reshape((1, N_STEPS_IN, n_exog_features))
future_exog_preds_scaled = []

for i in tqdm(range(N_FORECAST_BDAYS), desc="Exog Forecast Steps"):
    # Get prediction for the next step
    current_pred_exog = model_exog.predict(current_batch_exog, verbose=0)[0]
    future_exog_preds_scaled.append(current_pred_exog)
    # Update the batch for the next prediction: remove first step, append prediction
    new_batch_entry = current_pred_exog.reshape((1, 1, n_exog_features))
    current_batch_exog = np.append(current_batch_exog[:, 1:, :], new_batch_entry, axis=1)

future_exog_preds_scaled = np.array(future_exog_preds_scaled)

# --- Inverse Transform Exogenous Predictions ---
print("Inverse transforming exogenous predictions...")
future_exog_preds_inv = pd.DataFrame(index=range(N_FORECAST_BDAYS), columns=all_exog_cols)
for i, col in enumerate(all_exog_cols):
    col_preds_scaled = future_exog_preds_scaled[:, i].reshape(-1, 1)
    future_exog_preds_inv[col] = scalers[col].inverse_transform(col_preds_scaled).flatten()

# Get the business day dates for the forecast period
last_hist_date = df_proc.index.max()
forecast_start_date = last_hist_date + pd.Timedelta(days=1) # Start day after last known date
# Estimate end date needed to capture enough business days (add buffer)
estimated_end_date = forecast_start_date + pd.Timedelta(days=N_FORECAST_BDAYS + 7) # Add buffer for weekends/holidays
forecast_bdates = get_business_days(forecast_start_date, estimated_end_date)
forecast_bdates = forecast_bdates[:N_FORECAST_BDAYS] # Select the exact number needed

if len(forecast_bdates) != N_FORECAST_BDAYS:
     print(f"Warning: Could only generate {len(forecast_bdates)} business days. Adjusting N_FORECAST_BDAYS.")
     N_FORECAST_BDAYS = len(forecast_bdates)
     # Trim predictions if necessary (shouldn't be if loop ran N_FORECAST_BDAYS times)
     future_exog_preds_inv = future_exog_preds_inv.iloc[:N_FORECAST_BDAYS]


future_exog_predictions_df = future_exog_preds_inv.set_index(forecast_bdates)
print("Future exogenous predictions generated and inverse transformed.")


# --- 6. Stage 2: Predict Future MF_NAV ---
print("\n--- Stage 2: Predicting Future MF_NAV (LSTM with Exog) ---")
n_features_target = 1 + n_exog_features # Target + All Exog

# Structure data for MF_NAV prediction LSTM
# Input: window of past MF_NAV + past Exog; Output: next step's MF_NAV
X_target, y_target = create_sequences(scaled_data[all_cols_ordered].values, N_STEPS_IN, n_steps_out=1)

# --- FIX APPLIED HERE ---
# Correctly select the target feature (MF_NAV at index 0) for the single output step
y_target = y_target[:, 0, 0].reshape(-1, 1)
# ------------------------

print(f"Target data shapes after structuring: X={X_target.shape}, y={y_target.shape}") # Verify shapes

# Build Stage 2 LSTM Model
model_target = Sequential([
    LSTM(LSTM_UNITS, activation='relu', input_shape=(N_STEPS_IN, n_features_target), return_sequences=True),
    Dropout(DROPOUT_RATE),
    LSTM(LSTM_UNITS, activation='relu', return_sequences=False),
    Dropout(DROPOUT_RATE),
    Dense(1) # Single output neuron for MF_NAV
])
model_target.compile(optimizer='adam', loss='mse')
# model_target.summary() # Optional

# Train Stage 2 Model
print(f"Training Stage 2 LSTM for {EPOCHS_TARGET} epochs...")
# Add validation_split for better monitoring during training
history_target = model_target.fit(X_target, y_target, epochs=EPOCHS_TARGET, batch_size=BATCH_SIZE_TARGET, validation_split=0.1, verbose=0) # Added validation_split
print("Stage 2 Training complete.")

# ... (rest of the code for iterative forecasting and output remains the same) ...

# --- Iterative Forecasting for MF_NAV ---
print(f"Generating {N_FORECAST_BDAYS} steps of future MF_NAV predictions...")

# Get the last historical sequence (including target and exog, scaled)
last_sequence_target = scaled_data[all_cols_ordered].values[-N_STEPS_IN:]
current_batch_target = last_sequence_target.reshape((1, N_STEPS_IN, n_features_target))
future_target_preds_scaled = []

# Get the SCALED future exogenous predictions (needed for input)
future_exog_preds_scaled_array = scaled_data[all_exog_cols].iloc[-N_FORECAST_BDAYS:].values # Re-use from stage 1 prediction
# Alternative: re-predict scaled exog if preferred
# Convert the inverse-transformed predictions back to scaled for input
scaled_future_exog_for_input = pd.DataFrame(index=future_exog_predictions_df.index)
for col in all_exog_cols:
     scaled_future_exog_for_input[col] = scalers[col].transform(future_exog_predictions_df[[col]]).flatten()


for i in tqdm(range(N_FORECAST_BDAYS), desc="Target Forecast Steps"):
    # Predict the next MF_NAV step
    current_pred_target_scaled = model_target.predict(current_batch_target, verbose=0)[0]
    future_target_preds_scaled.append(current_pred_target_scaled)

    # Prepare the next input sequence
    # Get the predicted exogenous variables for the *next* time step (i+1)
    # Need to handle index carefully if i goes out of bounds
    if i < N_FORECAST_BDAYS -1:
        next_exog_scaled = scaled_future_exog_for_input.iloc[i+1].values # Use the exog prediction for the step we are about to predict NAV for
    else:
        # For the very last prediction, we don't have exog for i+1, reuse last predicted exog
        # Or ideally, predict one more exog step in Stage 1
        next_exog_scaled = scaled_future_exog_for_input.iloc[i].values # Simple fallback


    # Create the new entry for the sequence: [predicted_NAV, next_predicted_Exog1, next_predicted_Exog2,...]
    new_sequence_entry = np.concatenate([current_pred_target_scaled, next_exog_scaled]).reshape((1, 1, n_features_target))

    # Update the batch: remove first step, append new entry
    current_batch_target = np.append(current_batch_target[:, 1:, :], new_sequence_entry, axis=1)


future_target_preds_scaled = np.array(future_target_preds_scaled)

# --- Inverse Transform Target Predictions ---
print("Inverse transforming target (MF_NAV) predictions...")
mf_nav_final_forecast_inv = scalers[target_col].inverse_transform(future_target_preds_scaled)

# --- 7. Display Final Forecast ---
final_forecast_df = pd.DataFrame({
    'Predicted_MF_NAV': mf_nav_final_forecast_inv.flatten()
    }, index=forecast_bdates) # Use the calculated business day dates

print("\n--- FINAL Predicted MF_NAV using Multivariate LSTM Approach ---")
print(f"--- Forecast for Business Days: {forecast_bdates.min().date()} to {forecast_bdates.max().date()} ---")
pd.options.display.float_format = '{:.4f}'.format
print(final_forecast_df)

print("\n--- LSTM MODEL WARNINGS ---")
print(f"1. LSTM results are highly sensitive to lookback ({N_STEPS_IN}), units ({LSTM_UNITS}), epochs, batch size, and other hyperparameters.")
print("2. The model was trained on a relatively small dataset for LSTMs; overfitting is a risk.")
print("3. Accuracy heavily depends on the Stage 1 prediction of ALL exogenous variables.")
print("4. No extensive hyperparameter tuning or rigorous backtesting was performed.")
print("5. Evaluate these results critically.")

Data loaded successfully.
Resampled data to business days. Shape before NaN handling: (522, 17)
Shape after NaN handling: (522, 17)
Preprocessing complete.
Scaling data...
Data scaling complete.

--- Stage 1: Predicting Future Exogenous Variables (Multivariate LSTM) ---
Exogenous data shapes: X=(512, 10, 16), y=(512, 16)
Training Stage 1 LSTM for 50 epochs...
Stage 1 Training complete.
Generating 15 steps of future exogenous predictions...


Exog Forecast Steps:   0%|          | 0/15 [00:00<?, ?it/s]

Inverse transforming exogenous predictions...
Future exogenous predictions generated and inverse transformed.

--- Stage 2: Predicting Future MF_NAV (LSTM with Exog) ---
Target data shapes after structuring: X=(512, 10, 17), y=(512, 1)
Training Stage 2 LSTM for 50 epochs...
Stage 2 Training complete.
Generating 14 steps of future MF_NAV predictions...


Target Forecast Steps:   0%|          | 0/14 [00:00<?, ?it/s]

Inverse transforming target (MF_NAV) predictions...

--- FINAL Predicted MF_NAV using Multivariate LSTM Approach ---
--- Forecast for Business Days: 2025-01-02 to 2025-01-23 ---
            Predicted_MF_NAV
2025-01-02           14.0001
2025-01-03           13.9694
2025-01-06           13.9377
2025-01-07           13.9132
2025-01-08           13.8836
2025-01-10           13.8610
2025-01-13           13.8441
2025-01-14           13.8335
2025-01-15           13.8187
2025-01-16           13.8088
2025-01-17           13.8005
2025-01-21           13.7993
2025-01-22           13.7985
2025-01-23           13.7980

1. LSTM results are highly sensitive to lookback (10), units (50), epochs, batch size, and other hyperparameters.
2. The model was trained on a relatively small dataset for LSTMs; overfitting is a risk.
3. Accuracy heavily depends on the Stage 1 prediction of ALL exogenous variables.
4. No extensive hyperparameter tuning or rigorous backtesting was performed.
5. Evaluate these result

In [12]:
X_target.shape

(502, 20, 17)

In [13]:
y_target.shape

(8534, 1)

In [24]:
import pandas as pd
import numpy as np
import warnings
import logging
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm # Or from tqdm import tqdm

# Make sure to install necessary libraries:
# pip install tensorflow pandas numpy scikit-learn tqdm pandas_market_calendars
try:
    import pandas_market_calendars as mcal
except ImportError:
    print("Error: Please install pandas_market_calendars")
    print("Example: pip install pandas_market_calendars")
    exit()

# Suppress warnings and TensorFlow logs
warnings.filterwarnings("ignore")
tf.get_logger().setLevel(logging.ERROR)

# --- Configuration ---
SELECTED_EXOG_COLS = ['ETF_Price', 'ETF_Change %', 'ETF_Vol.', 'USD_Price','USD_Change %','Gold_Volume'] # As per clarification above
TARGET_COL = 'MF_NAV'
N_STEPS_IN = 10       # Lookback window (business days) - Hyperparameter
N_FORECAST_BDAYS = 15 # Number of business days to predict
EPOCHS_EXOG = 60      # More epochs might be needed, adjust based on loss plots
BATCH_SIZE_EXOG = 32
EPOCHS_TARGET = 60    # More epochs might be needed, adjust based on loss plots
BATCH_SIZE_TARGET = 32
LSTM_UNITS = 64       # Increased units slightly
DROPOUT_RATE = 0.2

# --- 1. Load Data ---
try:
    df_orig = pd.read_csv("UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv")
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv not found.")
    exit()
except Exception as e:
    print(f"An error occurred during loading: {e}")
    exit()

# --- 2. Preprocessing ---
df = df_orig.copy()
try:
    print("Starting preprocessing...")
    df['All_Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y')
    df.sort_values('All_Date', inplace=True)

    # Define cleaning functions
    def parse_volume(value):
        if isinstance(value, (int, float)): return value
        if isinstance(value, str):
            value = value.replace(',', '').strip().upper()
            if 'K' in value: return float(value.replace('K', '')) * 1000
            if 'M' in value: return float(value.replace('M', '')) * 1000000
        try: return float(value)
        except (ValueError, TypeError): return np.nan
    def parse_percentage(value):
        if isinstance(value, (int, float)): return value
        if isinstance(value, str): value = value.replace('%', '').strip()
        try: return float(value)
        except (ValueError, TypeError): return np.nan

    # Apply cleaning
    cols_to_clean = {'ETF_Vol.': parse_volume, 'ETF_Change %': parse_percentage, 'USD_Change %': parse_percentage}
    for col, func in cols_to_clean.items():
        if col in df.columns: df[col] = df[col].apply(func)
    # Ensure all columns used are numeric
    all_cols_to_use = [TARGET_COL] + SELECTED_EXOG_COLS
    for col in df.columns: # Check all original columns for conversion
        if col != 'All_Date': df[col] = pd.to_numeric(df[col], errors='coerce')

    # Select only necessary columns BEFORE setting index and resampling
    df_selected = df[['All_Date'] + all_cols_to_use].copy()

    # Set index and resample to Business Days
    df_proc = df_selected.set_index('All_Date').copy()
    df_proc = df_proc.resample('B').last() # Use business day frequency
    print(f"Resampled data to business days. Shape before NaN handling: {df_proc.shape}")

    # Handle NaNs after resampling
    df_proc = df_proc.ffill().bfill() # Chain ffill and bfill
    if df_proc.isnull().sum().sum() > 0:
        print(f"NaNs remaining after fill:\n{df_proc.isnull().sum()}")
        df_proc.dropna(inplace=True)
        if df_proc.isnull().sum().sum() > 0: raise ValueError("NaNs persist after cleaning")

    print(f"Shape after NaN handling: {df_proc.shape}")
    if len(df_proc) < N_STEPS_IN * 2:
         raise ValueError(f"Insufficient data ({len(df_proc)} days) for lookback ({N_STEPS_IN})")

    # Ensure correct column order
    df_proc = df_proc[[TARGET_COL] + SELECTED_EXOG_COLS]

    print("Preprocessing complete.")

except Exception as e:
    print(f"An error occurred during preprocessing: {e}")
    exit()

# --- 3. Scaling ---
print("Scaling data...")
scaler_target = MinMaxScaler(feature_range=(0, 1))
scaler_exog = MinMaxScaler(feature_range=(0, 1))

# Scale target and exogenous features separately
scaled_target = scaler_target.fit_transform(df_proc[[TARGET_COL]])
scaled_exog = scaler_exog.fit_transform(df_proc[SELECTED_EXOG_COLS])

# Combine scaled data for sequence creation in Stage 2
scaled_data_combined = np.concatenate((scaled_target, scaled_exog), axis=1)
print("Data scaling complete.")

# --- 4. Data Structuring for LSTM ---
def create_sequences(input_data, n_steps_in, n_steps_out=1):
    """Creates sequences for LSTM."""
    X, y = [], []
    n_samples = len(input_data)
    for i in range(n_samples):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        if out_end_ix > n_samples:
            break
        seq_x = input_data[i:end_ix, :]
        seq_y = input_data[end_ix:out_end_ix, :] # Output includes all features for the step(s)
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

# --- 5. Stage 1: Predict Future SELECTED Exogenous Variables ---
print("\n--- Stage 1: Predicting Future Selected Exogenous Variables (Multivariate LSTM) ---")
n_selected_exog_features = len(SELECTED_EXOG_COLS)

# Structure data for Exogenous prediction LSTM (using only exog data)
X_exog, y_exog_full = create_sequences(scaled_exog, N_STEPS_IN, n_steps_out=1)
# Output should be the next step's values for the selected exog features
y_exog = y_exog_full[:, 0, :] # Shape: (n_samples, n_selected_exog_features)

print(f"Exogenous data shapes: X={X_exog.shape}, y={y_exog.shape}")

# Build Stage 1 LSTM Model
model_exog = Sequential([
    LSTM(LSTM_UNITS, activation='relu', input_shape=(N_STEPS_IN, n_selected_exog_features), return_sequences=True),
    Dropout(DROPOUT_RATE),
    LSTM(LSTM_UNITS, activation='relu', return_sequences=False),
    Dropout(DROPOUT_RATE),
    Dense(n_selected_exog_features)
])
model_exog.compile(optimizer='adam', loss='mse')

# Train Stage 1 Model
print(f"Training Stage 1 LSTM for {EPOCHS_EXOG} epochs...")
history_exog = model_exog.fit(X_exog, y_exog, epochs=EPOCHS_EXOG, batch_size=BATCH_SIZE_EXOG, validation_split=0.1, verbose=0, shuffle=False)
print("Stage 1 Training complete.")
print(f"  Final Training Loss (Exog): {history_exog.history['loss'][-1]:.4f}")
print(f"  Final Validation Loss (Exog): {history_exog.history['val_loss'][-1]:.4f}")


# --- Iterative Forecasting for Exogenous Variables ---
print(f"Generating {N_FORECAST_BDAYS} steps of future exogenous predictions...")
last_sequence_exog_scaled = scaled_exog[-N_STEPS_IN:]
current_batch_exog = last_sequence_exog_scaled.reshape((1, N_STEPS_IN, n_selected_exog_features))
future_exog_preds_scaled_list = []

for i in tqdm(range(N_FORECAST_BDAYS), desc="Exog Forecast Steps"):
    current_pred_exog_scaled = model_exog.predict(current_batch_exog, verbose=0)[0]
    future_exog_preds_scaled_list.append(current_pred_exog_scaled)
    new_batch_entry = current_pred_exog_scaled.reshape((1, 1, n_selected_exog_features))
    current_batch_exog = np.append(current_batch_exog[:, 1:, :], new_batch_entry, axis=1)

future_exog_preds_scaled = np.array(future_exog_preds_scaled_list)

# --- Inverse Transform Exogenous Predictions ---
print("Inverse transforming exogenous predictions...")
future_exog_preds_inv = scaler_exog.inverse_transform(future_exog_preds_scaled)

# Get the business day dates for the forecast period
last_hist_date = df_proc.index.max()
forecast_start_date = last_hist_date + pd.Timedelta(days=1)
estimated_end_date = forecast_start_date + pd.Timedelta(days=N_FORECAST_BDAYS + 7) # Buffer

# -- Business Day Calculation Helper --
nyse = mcal.get_calendar('NYSE')
def get_business_days_func(start_date, end_date):
    try:
        schedule = nyse.schedule(start_date=start_date, end_date=end_date)
        business_days = pd.to_datetime(schedule.index).normalize()
        return business_days
    except Exception as e_bdays:
        print(f"Warning: Error getting NYSE holidays ({e_bdays}). Falling back to standard Bdays.")
        return pd.bdate_range(start=start_date, end=end_date)

forecast_bdates = get_business_days_func(forecast_start_date, estimated_end_date)
forecast_bdates = forecast_bdates[:N_FORECAST_BDAYS] # Select exact number

if len(forecast_bdates) != N_FORECAST_BDAYS:
     print(f"Warning: Could only generate {len(forecast_bdates)} business days in range. Adjusting N_FORECAST_BDAYS.")
     N_FORECAST_BDAYS = len(forecast_bdates)
     future_exog_preds_inv = future_exog_preds_inv[:N_FORECAST_BDAYS] # Trim if needed
     future_exog_preds_scaled = future_exog_preds_scaled[:N_FORECAST_BDAYS] # Trim scaled version too


future_exog_predictions_df = pd.DataFrame(future_exog_preds_inv, index=forecast_bdates, columns=SELECTED_EXOG_COLS)
future_exog_preds_scaled_df = pd.DataFrame(future_exog_preds_scaled, index=forecast_bdates, columns=SELECTED_EXOG_COLS) # Keep scaled version

print("Future selected exogenous predictions generated.")


# --- 6. Stage 2: Predict Future MF_NAV ---
print("\n--- Stage 2: Predicting Future MF_NAV (LSTM with Exog) ---")
n_features_target_stage = 1 + n_selected_exog_features # Target + Selected Exog

# Structure data for MF_NAV prediction LSTM (using combined scaled data)
X_target, y_target_full = create_sequences(scaled_data_combined, N_STEPS_IN, n_steps_out=1)
# Output is only the target column (index 0 of combined data)
y_target = y_target_full[:, 0, 0].reshape(-1, 1) # Shape: (n_samples, 1)

print(f"Target stage data shapes: X={X_target.shape}, y={y_target.shape}")

# Build Stage 2 LSTM Model
model_target = Sequential([
    LSTM(LSTM_UNITS, activation='relu', input_shape=(N_STEPS_IN, n_features_target_stage), return_sequences=True),
    Dropout(DROPOUT_RATE),
    LSTM(LSTM_UNITS, activation='relu', return_sequences=False),
    Dropout(DROPOUT_RATE),
    Dense(1) # Single output neuron for MF_NAV
])
model_target.compile(optimizer='adam', loss='mse')

# Train Stage 2 Model
print(f"Training Stage 2 LSTM for {EPOCHS_TARGET} epochs...")
history_target = model_target.fit(X_target, y_target, epochs=EPOCHS_TARGET, batch_size=BATCH_SIZE_TARGET, validation_split=0.1, verbose=0, shuffle=False)
print("Stage 2 Training complete.")
print(f"  Final Training Loss (Target): {history_target.history['loss'][-1]:.4f}")
print(f"  Final Validation Loss (Target): {history_target.history['val_loss'][-1]:.4f}")


# --- Iterative Forecasting for MF_NAV ---
print(f"Generating {N_FORECAST_BDAYS} steps of future MF_NAV predictions...")

# Get the last historical sequence (scaled target + scaled exog)
last_sequence_target_stage = scaled_data_combined[-N_STEPS_IN:]
current_batch_target_stage = last_sequence_target_stage.reshape((1, N_STEPS_IN, n_features_target_stage))
future_target_preds_scaled_list = []

for i in tqdm(range(N_FORECAST_BDAYS), desc="Target Forecast Steps"):
    # Predict the next MF_NAV step (scaled)
    current_pred_target_scaled = model_target.predict(current_batch_target_stage, verbose=0)[0]
    future_target_preds_scaled_list.append(current_pred_target_scaled)

    # Prepare the next input sequence if not the last step
    if i < N_FORECAST_BDAYS - 1:
        # Get the *predicted* scaled exogenous variables for the *next* time step
        next_exog_scaled = future_exog_preds_scaled_df.iloc[i].values # Note: index i corresponds to the prediction for step i+1 relative to start

        # Create the new entry for the sequence: [predicted_NAV, next_predicted_Exog1, ...]
        new_sequence_entry = np.concatenate([current_pred_target_scaled, next_exog_scaled]).reshape((1, 1, n_features_target_stage))

        # Update the batch: remove first step, append new entry
        current_batch_target_stage = np.append(current_batch_target_stage[:, 1:, :], new_sequence_entry, axis=1)


future_target_preds_scaled = np.array(future_target_preds_scaled_list)

# --- Inverse Transform Target Predictions ---
print("Inverse transforming target (MF_NAV) predictions...")
mf_nav_final_forecast_inv = scaler_target.inverse_transform(future_target_preds_scaled)

# --- 7. Display Final Forecast ---
final_forecast_df = pd.DataFrame({
    'Predicted_MF_NAV': mf_nav_final_forecast_inv.flatten()
    }, index=forecast_bdates)

print("\n--- FINAL Predicted MF_NAV using Two-Stage LSTM ---")
print(f"--- (Exog: {', '.join(SELECTED_EXOG_COLS)}) ---")
print(f"--- Forecast for Business Days: {forecast_bdates.min().date()} to {forecast_bdates.max().date()} ---")
pd.options.display.float_format = '{:.4f}'.format
print(final_forecast_df)

print("\n--- LSTM MODEL WARNINGS ---")
print(f"1. LSTM results depend heavily on hyperparameters (lookback={N_STEPS_IN}, units={LSTM_UNITS}, epochs, etc.) and data scaling.")
print("2. Trained on a relatively small dataset; overfitting is possible.")
print("3. Accuracy relies on Stage 1 predictions for exogenous variables.")
print("4. No extensive hyperparameter tuning performed.")
print("5. Evaluate critically; consider backtesting and comparison with simpler models.")

Data loaded successfully.
Starting preprocessing...
Resampled data to business days. Shape before NaN handling: (522, 7)
Shape after NaN handling: (522, 7)
Preprocessing complete.
Scaling data...
Data scaling complete.

--- Stage 1: Predicting Future Selected Exogenous Variables (Multivariate LSTM) ---
Exogenous data shapes: X=(512, 10, 6), y=(512, 6)
Training Stage 1 LSTM for 60 epochs...
Stage 1 Training complete.
  Final Training Loss (Exog): 0.0107
  Final Validation Loss (Exog): 0.0102
Generating 15 steps of future exogenous predictions...


Exog Forecast Steps:   0%|          | 0/15 [00:00<?, ?it/s]

Inverse transforming exogenous predictions...
Future selected exogenous predictions generated.

--- Stage 2: Predicting Future MF_NAV (LSTM with Exog) ---
Target stage data shapes: X=(512, 10, 7), y=(512, 1)
Training Stage 2 LSTM for 60 epochs...
Stage 2 Training complete.
  Final Training Loss (Target): 0.0033
  Final Validation Loss (Target): 0.0027
Generating 14 steps of future MF_NAV predictions...


Target Forecast Steps:   0%|          | 0/14 [00:00<?, ?it/s]

Inverse transforming target (MF_NAV) predictions...

--- FINAL Predicted MF_NAV using Two-Stage LSTM ---
--- (Exog: ETF_Price, ETF_Change %, ETF_Vol., USD_Price, USD_Change %, Gold_Volume) ---
--- Forecast for Business Days: 2025-01-02 to 2025-01-23 ---
            Predicted_MF_NAV
2025-01-02           14.8549
2025-01-03           14.8487
2025-01-06           14.8502
2025-01-07           14.8506
2025-01-08           14.8491
2025-01-10           14.8472
2025-01-13           14.8421
2025-01-14           14.8321
2025-01-15           14.8174
2025-01-16           14.8016
2025-01-17           14.7824
2025-01-21           14.7682
2025-01-22           14.7524
2025-01-23           14.7351

1. LSTM results depend heavily on hyperparameters (lookback=10, units=64, epochs, etc.) and data scaling.
2. Trained on a relatively small dataset; overfitting is possible.
3. Accuracy relies on Stage 1 predictions for exogenous variables.
4. No extensive hyperparameter tuning performed.
5. Evaluate criticall