In [46]:
!pip install numpy pandas scikit-learn ta


Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: ta
  Building wheel for ta (setup.py): started
  Building wheel for ta (setup.py): finished with status 'done'
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29421 sha256=bdaac113d17df7da28bafe5e07b5582f4c41d1041e7ed85322e78df3f108e581
  Stored in directory: c:\users\keert\appdata\local\pip\cache\wheels\a1\d7\29\7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0




In [3]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0




In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import ta  # For technical indicators
from datetime import timedelta

# 1. Load & parse
df = pd.read_csv('UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv')
df['Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y')
df = df.sort_values('Date').set_index('Date')

# 2. Clean numeric columns
def parse_percent(x):
    if pd.isna(x): return np.nan
    return float(x.strip('%')) / 100

def parse_vol(x):
    if pd.isna(x): return np.nan
    if isinstance(x, str):
        if x.endswith('K'): return float(x[:-1]) * 1e3
        if x.endswith('M'): return float(x[:-1]) * 1e6
    try: return float(x)
    except: return np.nan

df['ETF_Change'] = df['ETF_Change'].apply(parse_percent)
df['USD_Change'] = df['USD_Change'].apply(parse_percent)
df['ETF_Vol'] = df['ETF_Vol'].apply(parse_vol)
df['Gold_Volume'] = df['Gold_Volume'].apply(parse_vol)

# Convert other numerics
num_cols = ['MF_NAV','ETF_Price','ETF_Open','ETF_High','ETF_Low',
            'USD_Price','USD_Open','USD_High','USD_Low',
            'Gold_Open','Gold_High','Gold_Low','Gold_Close']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

#df = df.fillna(method='ffill').dropna()
from statsmodels.tsa.statespace.structural import UnobservedComponents

# Apply Kalman smoothing to all numeric columns with missing data
for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            continue  # fallback to skip if fitting fails


# 3. Add technical indicators
df['SMA_10'] = ta.trend.sma_indicator(df['ETF_Price'], window=10)
df['EMA_10'] = ta.trend.ema_indicator(df['ETF_Price'], window=10)
macd = ta.trend.macd(df['ETF_Price'])
df['MACD'] = macd
bb = ta.volatility.BollingerBands(df['ETF_Price'], window=20)
df['BB_High'] = bb.bollinger_hband()
df['BB_Low'] = bb.bollinger_lband()

# 4. Add lag features
for lag in [1, 3, 7, 14,30]:
    df[f'MF_NAV_lag{lag}'] = df['MF_NAV'].shift(lag)

df = df.dropna()

# 5. Feature set
features = [
    'ETF_Price', 'ETF_Vol', 'USD_Price', 'Gold_Close', 'Gold_Volume',
    'SMA_10', 'EMA_10', 'MACD', 'BB_High', 'BB_Low',
    'MF_NAV_lag1', 'MF_NAV_lag3', 'MF_NAV_lag7', 'MF_NAV_lag14'
]
target = 'MF_NAV'

X = df[features]
y = df[target]

# 6. Scale
scaler_x = StandardScaler().fit(X)
X_scaled = scaler_x.transform(X)
scaler_y = StandardScaler().fit(y.values.reshape(-1,1))
y_scaled = scaler_y.transform(y.values.reshape(-1,1)).flatten()

# 7. Train/test split
X_train = X_scaled[:-11]
y_train = y_scaled[:-11]
X_test = X_scaled[-11:]
y_test = y[-11:].values

# 8. Fit SVR (Cascaded SVM)
svm = SVR(C=1.0, kernel='poly', epsilon=0.1,degree=3,gamma='scale')
svm.fit(X_train, y_train)

# 9. Predict Jan 1–15, 2025 (business days)
future_dates = pd.bdate_range('2025-01-01', '2025-01-15')
last_known = df.iloc[-1]

# Extend dataframe for prediction
forecast_df = pd.DataFrame(index=future_dates, columns=df.columns)
for col in df.columns:
    forecast_df[col] = last_known[col]

df_all = pd.concat([df, forecast_df])
for lag in [1,3,7,14,30]:
    df_all[f'MF_NAV_lag{lag}'] = df_all['MF_NAV'].shift(lag)

# Predict step-by-step
predictions = []
for date in future_dates:
    row = df_all.loc[date]
    x_f = row[features].values.reshape(1, -1)
    x_f_scaled = scaler_x.transform(x_f)
    y_pred_scaled = svm.predict(x_f_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).item()
    predictions.append(y_pred)
    df_all.at[date, 'MF_NAV'] = y_pred
    # Update lag features for next day
    for lag in [1,3,7,14]:
        prev_date = date - pd.Timedelta(days=lag)
        if prev_date in df_all.index:
            df_all.at[date, f'MF_NAV_lag{lag}'] = df_all.at[prev_date, 'MF_NAV']

# 10. Display
output = pd.DataFrame({
    'Date': future_dates.strftime('%Y-%m-%d'),
    'Predicted_NAV': np.round(predictions, 4)
})
print(output)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


          Date  Predicted_NAV
0   2025-01-01        14.6621
1   2025-01-02        14.5409
2   2025-01-03        14.6754
3   2025-01-06        14.6854
4   2025-01-07        14.7101
5   2025-01-08        14.6977
6   2025-01-09        14.7580
7   2025-01-10        14.8114
8   2025-01-13        14.7403
9   2025-01-14        14.7766
10  2025-01-15        14.7469


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [62]:
y_pred_test = scaler_y.inverse_transform(svm.predict(X_test).reshape(-1,1)).flatten()
print("Backtest MAE:", mean_absolute_error(y_test, y_pred_test))
print("Backtest RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))


Backtest MAE: 0.21148356327586415
Backtest RMSE: 0.2384069071193449


In [6]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.structural import UnobservedComponents
from sklearn.metrics import mean_squared_error, mean_absolute_error
import ta
from datetime import timedelta

# 1. Load & parse
df = pd.read_csv('UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv')
df['Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y')
df = df.sort_values('Date').set_index('Date')

# 2. Clean numeric columns
def parse_percent(x):
    if pd.isna(x): return np.nan
    return float(x.strip('%')) / 100

def parse_vol(x):
    if pd.isna(x): return np.nan
    if isinstance(x, str):
        if x.endswith('K'): return float(x[:-1]) * 1e3
        if x.endswith('M'): return float(x[:-1]) * 1e6
    try: return float(x)
    except: return np.nan

df['ETF_Change'] = df['ETF_Change'].apply(parse_percent)
df['USD_Change'] = df['USD_Change'].apply(parse_percent)
df['ETF_Vol'] = df['ETF_Vol'].apply(parse_vol)
df['Gold_Volume'] = df['Gold_Volume'].apply(parse_vol)

# Convert other numerics
num_cols = ['MF_NAV','ETF_Price','ETF_Open','ETF_High','ETF_Low',
            'USD_Price','USD_Open','USD_High','USD_Low',
            'Gold_Open','Gold_High','Gold_Low','Gold_Close']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# --- Kalman smoothing (before lag features) ---
for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            continue

# 3. Add technical indicators
df['SMA_10'] = ta.trend.sma_indicator(df['ETF_Price'], window=10)
df['EMA_10'] = ta.trend.ema_indicator(df['ETF_Price'], window=10)
macd = ta.trend.macd(df['ETF_Price'])
df['MACD'] = macd
bb = ta.volatility.BollingerBands(df['ETF_Price'], window=20)
df['BB_High'] = bb.bollinger_hband()
df['BB_Low'] = bb.bollinger_lband()

# 4. Add lag features
for lag in [1, 3, 7, 14, 30]:
    df[f'MF_NAV_lag{lag}'] = df['MF_NAV'].shift(lag)

# --- Kalman smoothing again (after lagging) ---
for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            df[col] = df[col].fillna(method='bfill')

# 5. Feature set
features = [
    'ETF_Price', 'ETF_Vol', 'USD_Price', 'Gold_Close', 'Gold_Volume',
    'SMA_10', 'EMA_10', 'MACD', 'BB_High', 'BB_Low',
    'MF_NAV_lag1', 'MF_NAV_lag3', 'MF_NAV_lag7', 'MF_NAV_lag14', 'MF_NAV_lag30'
]
target = 'MF_NAV'

X = df[features]
y = df[target]

# 6. Scaling
scaler_x = StandardScaler().fit(X)
X_scaled = scaler_x.transform(X)

scaler_y = StandardScaler().fit(y.values[:-11].reshape(-1,1))  # only training data
y_scaled = scaler_y.transform(y.values.reshape(-1,1)).flatten()

# 7. Train/test split
X_train = X_scaled[:-11]
y_train = y_scaled[:-11]
X_test = X_scaled[-11:]
y_test = y[-11:].values

# 8. Fit SVR
svm = SVR(C=1.0, kernel='poly', epsilon=0.1, degree=3, gamma='scale')
svm.fit(X_train, y_train)

# 9. Predict Jan 1–15, 2025
future_dates = pd.bdate_range('2025-01-01', '2025-01-15')
last_known = df.iloc[-1]
forecast_df = pd.DataFrame(index=future_dates, columns=df.columns)
for col in df.columns:
    forecast_df[col] = last_known[col]

df_all = pd.concat([df, forecast_df])
for lag in [1,3,7,14,30]:
    df_all[f'MF_NAV_lag{lag}'] = df_all['MF_NAV'].shift(lag)

# Predict step-by-step
predictions = []
for date in future_dates:
    row = df_all.loc[date]
    x_f = row[features].values.reshape(1, -1)
    x_f_scaled = scaler_x.transform(x_f)
    y_pred_scaled = svm.predict(x_f_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).item()
    predictions.append(y_pred)
    df_all.at[date, 'MF_NAV'] = y_pred
    # Update lag features for next day
    for lag in [1,3,7,14,30]:
        prev_date = date - pd.Timedelta(days=lag)
        if prev_date in df_all.index:
            df_all.at[date, f'MF_NAV_lag{lag}'] = df_all.at[prev_date, 'MF_NAV']

# 10. Output
output = pd.DataFrame({
    'Date': future_dates.strftime('%Y-%m-%d'),
    'Predicted_NAV': np.round(predictions, 4)
})
print(output)



  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


          Date  Predicted_NAV
0   2025-01-01        14.6225
1   2025-01-02        14.6411
2   2025-01-03        14.6663
3   2025-01-06        14.7005
4   2025-01-07        14.6982
5   2025-01-08        14.6632
6   2025-01-09        14.6893
7   2025-01-10        14.6952
8   2025-01-13        14.6947
9   2025-01-14        14.6881
10  2025-01-15        14.6912


In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.structural import UnobservedComponents
import ta
from datetime import timedelta

# 1. Load & parse
df = pd.read_csv('UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv')
df['Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y')
df = df.sort_values('Date').set_index('Date')

# 2. Clean numeric columns
def parse_percent(x):
    if pd.isna(x): return np.nan
    return float(x.strip('%')) / 100

def parse_vol(x):
    if pd.isna(x): return np.nan
    if isinstance(x, str):
        if x.endswith('K'): return float(x[:-1]) * 1e3
        if x.endswith('M'): return float(x[:-1]) * 1e6
    try: return float(x)
    except: return np.nan

df['ETF_Change'] = df['ETF_Change'].apply(parse_percent)
df['USD_Change'] = df['USD_Change'].apply(parse_percent)
df['ETF_Vol'] = df['ETF_Vol'].apply(parse_vol)
df['Gold_Volume'] = df['Gold_Volume'].apply(parse_vol)

num_cols = ['MF_NAV','ETF_Price','ETF_Open','ETF_High','ETF_Low',
            'USD_Price','USD_Open','USD_High','USD_Low',
            'Gold_Open','Gold_High','Gold_Low','Gold_Close']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 3. Kalman smoothing BEFORE lagging
for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            continue

# 4. Technical indicators
df['SMA_10'] = ta.trend.sma_indicator(df['ETF_Price'], window=10)
df['EMA_10'] = ta.trend.ema_indicator(df['ETF_Price'], window=10)
df['MACD'] = ta.trend.macd(df['ETF_Price'])
bb = ta.volatility.BollingerBands(df['ETF_Price'], window=20)
df['BB_High'] = bb.bollinger_hband()
df['BB_Low'] = bb.bollinger_lband()

# 5. Lag features
for lag in [1, 3, 7, 14, 30]:
    df[f'MF_NAV_lag{lag}'] = df['MF_NAV'].shift(lag)

# 6. Kalman smoothing AFTER lagging
for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            df[col] = df[col].fillna(method='bfill')

# 7. Features & target
features = [
    'ETF_Price', 'ETF_Vol', 'USD_Price', 'Gold_Close', 'Gold_Volume',
    'SMA_10', 'EMA_10', 'MACD', 'BB_High', 'BB_Low',
    'MF_NAV_lag1', 'MF_NAV_lag3', 'MF_NAV_lag7', 'MF_NAV_lag14', 'MF_NAV_lag30'
]
target = 'MF_NAV'

X = df[features]
y = df[target]

# 8. Scale
scaler_x = StandardScaler().fit(X)
X_scaled = scaler_x.transform(X)
scaler_y = StandardScaler().fit(y.values[:-11].reshape(-1, 1))  # only training set
y_scaled = scaler_y.transform(y.values.reshape(-1, 1)).flatten()

# 9. Train XGBoost
X_train = X_scaled[:-11]
y_train = y_scaled[:-11]

model = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# 10. Forecast Jan 1–15, 2025
future_dates = pd.bdate_range('2025-01-01', '2025-01-15')
forecast_df = pd.DataFrame(index=future_dates, columns=df.columns)
df_all = pd.concat([df, forecast_df])

for lag in [1,3,7,14,30]:
    df_all[f'MF_NAV_lag{lag}'] = df_all['MF_NAV'].shift(lag)

predictions = []

for date in future_dates:
    idx = df_all.index.get_loc(date)
    history = df_all.iloc[:idx].copy()

    # Dynamic recomputation of technical indicators
    df_all.at[date, 'SMA_10'] = history['ETF_Price'].rolling(10).mean().iloc[-1]
    df_all.at[date, 'EMA_10'] = history['ETF_Price'].ewm(span=10).mean().iloc[-1]
    macd_line = history['ETF_Price'].ewm(span=12).mean() - history['ETF_Price'].ewm(span=26).mean()
    df_all.at[date, 'MACD'] = macd_line.iloc[-1]
    mid = history['ETF_Price'].rolling(20).mean().iloc[-1]
    std = history['ETF_Price'].rolling(20).std().iloc[-1]
    df_all.at[date, 'BB_High'] = mid + 2 * std
    df_all.at[date, 'BB_Low'] = mid - 2 * std

    # Update lag features from previous NAVs
    for lag in [1, 3, 7, 14, 30]:
        prev_date = date - pd.Timedelta(days=lag)
        if prev_date in df_all.index:
            df_all.at[date, f'MF_NAV_lag{lag}'] = df_all.at[prev_date, 'MF_NAV']

    # Prediction
    row = df_all.loc[date]
    x_f = row[features].values.reshape(1, -1)
    x_f_scaled = scaler_x.transform(x_f)
    y_pred_scaled = model.predict(x_f_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).item()
    predictions.append(y_pred)

    # Store prediction
    df_all.at[date, 'MF_NAV'] = y_pred

# 11. Output
output = pd.DataFrame({
    'Date': future_dates.strftime('%Y-%m-%d'),
    'Predicted_NAV': np.round(predictions, 4)
})
print(output)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


          Date  Predicted_NAV
0   2025-01-01        15.4692
1   2025-01-02        15.4692
2   2025-01-03        15.4692
3   2025-01-06        15.4692
4   2025-01-07        15.4692
5   2025-01-08        15.4692
6   2025-01-09        15.4692
7   2025-01-10        15.4692
8   2025-01-13        15.4692
9   2025-01-14        15.4692
10  2025-01-15        15.4692


  df_all = pd.concat([df, forecast_df])


In [11]:
# This version keeps your original preprocessing (Kalman smoothing, features, scaling)
# and replaces the XGBoost model with a Hybrid Transformer + GRU model for sequence prediction

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.structural import UnobservedComponents
import ta
from datetime import timedelta

# --- Step 1: Load & Preprocess Data (same as your version) ---
df = pd.read_csv('UTI_Gold_ETF_MF_USD_01012023_31122024_ALL.csv')
df['Date'] = pd.to_datetime(df['All_Date'], format='%m-%d-%Y')
df = df.sort_values('Date').set_index('Date')

def parse_percent(x):
    if pd.isna(x): return np.nan
    return float(x.strip('%')) / 100

def parse_vol(x):
    if pd.isna(x): return np.nan
    if isinstance(x, str):
        if x.endswith('K'): return float(x[:-1]) * 1e3
        if x.endswith('M'): return float(x[:-1]) * 1e6
    try: return float(x)
    except: return np.nan

df['ETF_Change'] = df['ETF_Change'].apply(parse_percent)
df['USD_Change'] = df['USD_Change'].apply(parse_percent)
df['ETF_Vol'] = df['ETF_Vol'].apply(parse_vol)
df['Gold_Volume'] = df['Gold_Volume'].apply(parse_vol)

num_cols = ['MF_NAV','ETF_Price','ETF_Open','ETF_High','ETF_Low',
            'USD_Price','USD_Open','USD_High','USD_Low',
            'Gold_Open','Gold_High','Gold_Low','Gold_Close']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            continue

# Technical indicators
df['SMA_10'] = ta.trend.sma_indicator(df['ETF_Price'], window=10)
df['EMA_10'] = ta.trend.ema_indicator(df['ETF_Price'], window=10)
df['MACD'] = ta.trend.macd(df['ETF_Price'])
bb = ta.volatility.BollingerBands(df['ETF_Price'], window=20)
df['BB_High'] = bb.bollinger_hband()
df['BB_Low'] = bb.bollinger_lband()

for lag in [1, 3, 7, 14, 30]:
    df[f'MF_NAV_lag{lag}'] = df['MF_NAV'].shift(lag)

for col in df.columns:
    if df[col].isnull().sum() > 0 and df[col].dtype != 'object':
        try:
            model = UnobservedComponents(df[col], level='llevel')
            result = model.fit(disp=False)
            df[col] = df[col].fillna(result.fittedvalues)
        except:
            df[col] = df[col].fillna(method='bfill')

features = [
    'ETF_Price', 'ETF_Vol', 'USD_Price', 'Gold_Close', 'Gold_Volume',
    'SMA_10', 'EMA_10', 'MACD', 'BB_High', 'BB_Low',
    'MF_NAV_lag1', 'MF_NAV_lag3', 'MF_NAV_lag7', 'MF_NAV_lag14', 'MF_NAV_lag30'
]
target = 'MF_NAV'

X = df[features]
y = df[target]

scaler_x = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler_x.transform(X), columns=features, index=X.index)
scaler_y = StandardScaler().fit(y.values[:-11].reshape(-1, 1))
y_scaled = scaler_y.transform(y.values.reshape(-1, 1)).flatten()

# --- Step 2: Dataset and Model ---
class HybridDataset(Dataset):
    def __init__(self, X, y, seq_len=60, horizon=11):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.reshape(-1,1), dtype=torch.float32)
        self.seq_len = seq_len
        self.horizon = horizon

    def __len__(self):
        return len(self.X) - self.seq_len - self.horizon + 1

    def __getitem__(self, idx):
        return (self.X[idx:idx+self.seq_len],
                self.y[idx+self.seq_len:idx+self.seq_len+self.horizon])

class HybridTransGRU(nn.Module):
    def __init__(self, feat_dim, d_model=64, nhead=4, nlayers=2,
                 gru_h=64, horizon=11, dropout=0.1):
        super().__init__()
        self.horizon = horizon
        self.fc_in = nn.Linear(feat_dim, d_model)
        self.pos_enc = nn.Parameter(self._pos_enc(500, d_model), requires_grad=False)
        enc_layer = nn.TransformerEncoderLayer(d_model, nhead, d_model*4, dropout)
        self.encoder = nn.TransformerEncoder(enc_layer, nlayers)
        self.gru = nn.GRU(d_model, gru_h, batch_first=True)
        self.fc_out = nn.Linear(gru_h, 1)

    def _pos_enc(self, L, d):
        pe = torch.zeros(L, d)
        pos = torch.arange(L).unsqueeze(1)
        div = torch.exp(torch.arange(0, d, 2)*(-np.log(10000.)/d))
        pe[:,0::2] = torch.sin(pos*div)
        pe[:,1::2] = torch.cos(pos*div)
        return pe

    def forward(self, x):
        b, T, _ = x.size()
        x = self.fc_in(x) + self.pos_enc[:T].unsqueeze(0)
        x_enc = self.encoder(x.permute(1,0,2)).permute(1,0,2)
        gru_out, _ = self.gru(x_enc)
        return self.fc_out(gru_out[:, -self.horizon:, :])

# --- Step 3: Train Model ---
val_days = 89
X_tr, X_va = X_scaled.iloc[:-val_days], X_scaled.iloc[-val_days:]
y_tr, y_va = y_scaled[:-val_days], y_scaled[-val_days:]

train_ds = HybridDataset(X_tr, y_tr)
val_ds = HybridDataset(X_va, y_va)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridTransGRU(len(features), horizon=11).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(1, 31):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        loss = criterion(model(xb), yb)
        opt.zero_grad(); loss.backward(); opt.step()
    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            val_losses.append(criterion(model(xb.to(device)), yb.to(device)).item())
    print(f"Epoch {epoch}: Train MSE {loss.item():.4f}, Val MSE {np.mean(val_losses):.4f}")

# --- Step 4: Forecast Jan 1–15, 2025 ---
model.eval()
seq_len = 60
input_seq = torch.tensor(X_scaled.iloc[-seq_len:].values, dtype=torch.float32).unsqueeze(0).to(device)
with torch.no_grad():
    pred_scaled = model(input_seq).squeeze(0).cpu().numpy()
pred_navs = scaler_y.inverse_transform(pred_scaled).flatten()
forecast_dates = pd.bdate_range('2025-01-01', '2025-01-15')
forecast_df = pd.DataFrame({
    'Date': forecast_dates.strftime('%Y-%m-%d'),
    'Predicted_NAV': np.round(pred_navs, 4)
})
print(forecast_df)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Epoch 1: Train MSE 0.0558, Val MSE 0.2548
Epoch 2: Train MSE 0.0580, Val MSE 0.2927
Epoch 3: Train MSE 0.0394, Val MSE 0.5315
Epoch 4: Train MSE 0.0315, Val MSE 0.4486
Epoch 5: Train MSE 0.0246, Val MSE 0.5635
Epoch 6: Train MSE 0.0400, Val MSE 0.5755
Epoch 7: Train MSE 0.0201, Val MSE 0.6244
Epoch 8: Train MSE 0.0175, Val MSE 0.7191
Epoch 9: Train MSE 0.0172, Val MSE 0.7494
Epoch 10: Train MSE 0.0345, Val MSE 0.7378
Epoch 11: Train MSE 0.0096, Val MSE 0.9845
Epoch 12: Train MSE 0.0103, Val MSE 0.9500
Epoch 13: Train MSE 0.0127, Val MSE 0.9056
Epoch 14: Train MSE 0.0081, Val MSE 0.9678
Epoch 15: Train MSE 0.0104, Val MSE 0.9448
Epoch 16: Train MSE 0.0102, Val MSE 1.0844
Epoch 17: Train MSE 0.0063, Val MSE 1.1344
Epoch 18: Train MSE 0.0065, Val MSE 1.1033
Epoch 19: Train MSE 0.0062, Val MSE 0.9738
Epoch 20: Train MSE 0.0102, Val MSE 1.0166
Epoch 21: Train MSE 0.0058, Val MSE 1.0108
Epoch 22: Train MSE 0.0049, Val MSE 0.9983
Epoch 23: Train MSE 0.0061, Val MSE 1.0022
Epoch 24: Train MSE 