<h3 style="color:yellow">Using RCABiLSTM</h3>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ReverseCrossAttention(nn.Module):
    """
    Multi-head scaled dot-product attention whose keys and values are computed
    from the time-reversed input sequence.

    Queries are projected from ``x`` in its original order; keys and values
    are projected from ``torch.flip(x, dims=[1])``.

    NOTE(review): keys and values are reversed with the *same* permutation and
    no mask or positional encoding is applied, so the softmax-weighted sum over
    the key axis equals ordinary self-attention (up to floating-point
    summation order). The flip is kept so existing behaviour is unchanged, but
    on its own it does not bias attention towards recent time steps.

    Args:
        d_model: Size of the feature dimension ``D`` of the input.
        heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide ``d_model``.
    """
    def __init__(self, d_model, heads=5):
        super().__init__()
        # Fail fast: otherwise the mismatch only surfaces as an obscure
        # ``view`` shape error inside ``forward``.
        if heads <= 0 or d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive number of heads ({heads})"
            )
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_model // heads

        self.query = nn.Linear(d_model, d_model)
        self.key   = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.out   = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply attention to ``x`` of shape (B, T, D) and return (B, T, D)."""
        B, T, D = x.shape

        # Reverse the time axis once and reuse it for both keys and values.
        x_rev = torch.flip(x, dims=[1])

        Q = self.query(x)      # (B, T, D)
        K = self.key(x_rev)    # (B, T, D), time-reversed
        V = self.value(x_rev)  # (B, T, D), time-reversed

        # Split the feature axis into heads: (B, T, D) -> (B, H, T, d_k)
        Q = Q.view(B, T, self.heads, self.d_k).transpose(1, 2)
        K = K.view(B, T, self.heads, self.d_k).transpose(1, 2)
        V = V.view(B, T, self.heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)  # (B, H, T, T)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)  # (B, H, T, d_k)

        # Concatenate heads back into the feature axis
        context = context.transpose(1, 2).contiguous().view(B, T, D)
        return self.out(context)


class RCABiLSTM(nn.Module):
    """
    Binary classifier: ReverseCrossAttention -> stacked BiLSTM -> MLP head.

    Args:
        input_dim: Feature size of each time step (also the attention width).
        hidden_dim: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout used between LSTM layers and in the classifier head.
        heads: Number of attention heads; must divide ``input_dim``.

    ``forward`` takes ``x`` of shape (B, T, input_dim) and returns one raw
    logit per sample, shape (B,).
    """
    def __init__(self, input_dim=15, hidden_dim=256, lstm_layers=5, dropout=0.2, heads=5):
        super().__init__()
        self.rca = ReverseCrossAttention(d_model=input_dim, heads=heads)
        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer just raises a warning.
        self.bilstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                              num_layers=lstm_layers, batch_first=True,
                              bidirectional=True,
                              dropout=dropout if lstm_layers > 1 else 0.0)
        # Layer order (and therefore state_dict keys) is kept unchanged so
        # previously saved checkpoints still load.
        self.classifier = nn.Sequential(
                nn.Linear(hidden_dim * 2, 1024),  # 2 * hidden_dim (both directions) -> 1024
                nn.LayerNorm(1024),
                nn.GELU(),
                nn.Linear(1024, 256),             # 1024 -> 256
                nn.GELU(),
                nn.Linear(256, 64),               # 256 -> 64
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(64, 1)                  # 64 -> 1 (final logit)
            )

    def forward(self, x):
        x = self.rca(x)
        lstm_out, _ = self.bilstm(x)
        # NOTE(review): at the last time step the backward direction has only
        # processed one element; the forward half carries the full history.
        x_last = lstm_out[:, -1, :]
        logits = self.classifier(x_last)
        return logits.squeeze(-1)



In [None]:
# Load the preprocessed training bundle: a mapping with at least the keys
# 'X' and 'labels'.
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
data = torch.load("Train_Data/options_buy_model_data.pt")
X = data['X']           # shape: (N, 15, 15)
# Y = data['Y']           # shape: (N, 5) — unused here
labels = data['labels'] # shape: (N,)

In [None]:
from torch.utils.data import random_split, DataLoader
from torch.utils.data import Dataset

class BuyDataset(Dataset):
    """Map-style dataset pairing feature tensors with their labels.

    Both tensors are cast to float once, at construction time, so that
    ``__getitem__`` is a plain index lookup.
    """

    def __init__(self, X_tensor, labels_tensor):
        self.X, self.y = X_tensor.float(), labels_tensor.float()

    def __len__(self):
        # Number of samples = length of the leading dimension of X.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target
# Replace non-finite feature values before training: NaN -> 0, +inf -> 10,
# -inf -> -10. Labels only have NaN replaced by 0.
X = torch.nan_to_num(X, nan=0.0, posinf=10.0, neginf=-10.0)
labels = torch.nan_to_num(labels, nan=0.0)

# Wrap the sanitised tensors; BuyDataset casts both to float.
dataset = BuyDataset(X, labels)

In [None]:
# 80/20 train/validation split; the remainder goes to validation so the two
# sizes always sum to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Fixed seed so the split is identical across runs.
generator = torch.Generator().manual_seed(42)
train_set, val_set = random_split(dataset, [train_size, val_size], generator=generator)
# Only the training loader is shuffled.
train_loader = DataLoader(train_set, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_set, batch_size=1024, shuffle=False)

In [None]:
import torch
import torch.nn as nn

# Build the model, optimizer and loss on the available device.
# criterion = BinaryFocalLoss(alpha=0.5, gamma=1.0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RCABiLSTM(input_dim=15).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# pos_weight = (# NO BUY) / (# BUY), then damped by a factor of 0.2.
# NOTE(review): the counts are hard-coded (presumably 451811 NO BUY and
# 24792 BUY samples) rather than computed from `labels` — confirm they still
# match the loaded data.
pos_weight = torch.tensor([(451811 / 24792)* 0.2], dtype=torch.float).to(device)
# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# The loss works on raw logits; pos_weight scales the positive-class term.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)


In [None]:
# Early-stopping bookkeeping read and updated by the training loop below.
best_val_loss = float('inf')     # lowest validation loss seen so far
patience = 3                     # epochs without improvement tolerated before stopping
epochs_without_improvement = 0
num_epochs = 50  # or as you wish
def train_model(model, train_loader, optimizer, criterion, device, log_every=200):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: Module called as ``model(X_batch)``.
        train_loader: Iterable of ``(X_batch, y_batch)`` pairs exposing ``.dataset``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, y_batch)``.
        device: Device each batch is moved to.
        log_every: A progress line is printed every ``log_every`` batches.

    Returns:
        Sum of (batch loss * batch size) divided by ``len(train_loader.dataset)``.
    """
    model.train()
    weighted_sum = 0
    for step, (features, targets) in enumerate(train_loader):
        features = features.to(device)
        targets = targets.to(device).float()

        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is averaged correctly.
        loss_value = batch_loss.item()
        weighted_sum += loss_value * features.size(0)

        # 🧾 Periodic progress line
        if step % log_every == 0:
            print(f"🧩 Batch {step+1}/{len(train_loader)}: Loss={loss_value:.4f}")

    return weighted_sum / len(train_loader.dataset)



def evaluate_model(model, val_loader, criterion, device):
    """Evaluate ``model`` without gradients.

    Predictions are thresholded at a sigmoid probability of 0.5.

    Returns:
        ``(mean_loss, accuracy)``, both normalised by ``len(val_loader.dataset)``.
    """
    model.eval()
    loss_sum, n_correct = 0, 0
    with torch.no_grad():
        for features, targets in val_loader:
            features, targets = features.to(device), targets.to(device)

            logits = model(features)
            loss_sum += criterion(logits, targets).item() * features.size(0)

            predicted = (torch.sigmoid(logits) > 0.5).long()
            n_correct += (predicted == targets.long()).sum().item()

    n_samples = len(val_loader.dataset)
    accuracy = n_correct / n_samples
    return loss_sum / n_samples, accuracy

import os

# Optional warm start: resume from the best checkpoint of a previous run.
if os.path.exists("best_model.pt"):
    # map_location keeps a checkpoint saved on GPU loadable on a CPU-only host.
    model.load_state_dict(torch.load("best_model.pt", map_location=device))
    print("📦 Loaded best model checkpoint.")
    # Score the restored weights first; otherwise best_val_loss is still +inf
    # and even a worse first epoch would overwrite the checkpoint.
    best_val_loss, _ = evaluate_model(model, val_loader, criterion, device)

# Train with early stopping on validation loss.
for epoch in range(1, num_epochs + 1):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate_model(model, val_loader, criterion, device)

    print(f"📅 Epoch {epoch}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

    # Require a margin of 1e-4 so float noise does not count as progress.
    if val_loss < best_val_loss - 1e-4:  # improvement
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), "best_model.pt")
        print("✅ Improvement! Model saved.")
    else:
        epochs_without_improvement += 1
        print(f"⏳ No improvement for {epochs_without_improvement} epoch(s).")

        if epochs_without_improvement >= patience:
            # Report the configured patience instead of a hard-coded number.
            print(f"⛔ Early stopping: validation loss hasn't improved in {patience} epochs.")
            break


In [None]:
# import torch
# import torch.nn as nn

# # criterion = BinaryFocalLoss(alpha=0.5, gamma=1.0)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = RCABiLSTM(input_dim=15).to(device)
# model.eval()

# import os
# # Optional: Load previous best model if available
# if os.path.exists("best_model.pt"):
#     model.load_state_dict(torch.load("best_model.pt"))
#     print("📦 Loaded best model checkpoint.")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Collect thresholded predictions over the whole validation loader, then
# report a confusion matrix and a per-class classification report.
all_preds = []
all_labels = []

# Switch to inference mode explicitly so dropout is disabled no matter which
# cell ran last.
model.eval()
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model(X_batch)
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).long()  # 0.5 decision threshold

        all_preds.append(preds.cpu())
        all_labels.append(y_batch.cpu().long())

# Concatenate all batches
all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)
cm = confusion_matrix(all_labels, all_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['NO BUY', 'BUY'])
disp.plot(cmap='Blues', values_format='d')
plt.title("📉 Confusion Matrix: RCA-BiLSTM")
plt.show()
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, target_names=['NO BUY', 'BUY']))


Optuna hyperparameter search for RCA-BiLSTM

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import optuna

# === Load your preprocessed data ===
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
data = torch.load("Train_Data/options_buy_model_data.pt")
# nan_to_num with default arguments: NaN -> 0, +/-inf -> largest/smallest
# finite value of the dtype.
X = torch.nan_to_num(data['X'])
labels = torch.nan_to_num(data['labels'])

# === Prepare dataset and dataloaders ===
from torch.utils.data import Dataset, random_split, WeightedRandomSampler

class BuyDataset(Dataset):
    """Map-style dataset pairing feature tensors with their labels.

    Both tensors are cast to float once, at construction time, so that
    ``__getitem__`` is a plain index lookup.
    """

    def __init__(self, X_tensor, labels_tensor):
        self.X, self.y = X_tensor.float(), labels_tensor.float()

    def __len__(self):
        # Number of samples = length of the leading dimension of X.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Build the dataset and an 80/20 split.
# NOTE(review): unlike the earlier training cell, no seeded generator is
# passed to random_split, so the split differs between runs.
dataset = BuyDataset(X, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = random_split(dataset, [train_size, val_size])
# Extract indices for train_set
train_indices = train_set.indices if hasattr(train_set, 'indices') else list(range(len(train_set)))

# Subset labels for the training indices only
train_labels = labels[train_indices]

# Create weights for training labels only: label == 1 gets weight 1.0, every
# other sample 0.05, so positives are drawn far more often per sample.
sample_weights = torch.where(train_labels == 1, 1.0, 0.05)

# Use sampler on those indices (draws len(train_labels) samples with
# replacement, in the order of train_set).
sampler = WeightedRandomSampler(sample_weights, num_samples=len(train_labels), replacement=True)

train_loader = DataLoader(train_set, batch_size=1024, sampler=sampler)

# Validation keeps the natural class distribution (no sampler, no shuffle).
val_loader = DataLoader(val_set, batch_size=1024)

# === Device ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Ratio (N - sum(labels)) / sum(labels) over ALL labels (train + validation).
# NOTE(review): pos_weight is not referenced again in the visible part of this
# cell; the objective uses AsymmetricBCELoss instead.
pos_weight = torch.tensor([(len(labels) - labels.sum()) / labels.sum()], device=device)

# === RCA Module ===
class ReverseCrossAttention(nn.Module):
    """
    Multi-head scaled dot-product attention whose keys and values are computed
    from the time-reversed input sequence.

    NOTE(review): keys and values are reversed with the *same* permutation and
    no mask or positional encoding is applied, so the result equals ordinary
    self-attention (up to floating-point summation order).

    Args:
        d_model: Size of the feature dimension ``D`` of the input.
        heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide ``d_model``.
    """
    def __init__(self, d_model, heads=5):
        super().__init__()
        # Fail fast instead of hitting an obscure ``view`` error in forward().
        if heads <= 0 or d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive number of heads ({heads})"
            )
        self.d_k = d_model // heads
        self.heads = heads
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply attention to ``x`` of shape (B, T, D) and return (B, T, D)."""
        B, T, D = x.shape
        # Reverse the time axis once and reuse it for keys and values.
        x_rev = torch.flip(x, dims=[1])
        Q = self.query(x)
        K = self.key(x_rev)
        V = self.value(x_rev)

        # (B, T, D) -> (B, H, T, d_k)
        Q = Q.view(B, T, self.heads, self.d_k).transpose(1, 2)
        K = K.view(B, T, self.heads, self.d_k).transpose(1, 2)
        V = V.view(B, T, self.heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge heads back into the feature axis.
        context = context.transpose(1, 2).contiguous().view(B, T, D)
        return self.out(context)

# === Asymmetric BCE Loss ===
class AsymmetricBCELoss(nn.Module):
    """Binary cross-entropy on logits with separate weights per error type.

    Args:
        fn_weight: Multiplier for the positive-target term (penalises false
            negatives).
        fp_weight: Multiplier for the negative-target term (penalises false
            positives).

    ``forward(logits, targets)`` returns the mean weighted loss as a scalar.
    """
    def __init__(self, fn_weight=1.0, fp_weight=5.0):
        super().__init__()
        self.fn_weight = fn_weight
        self.fp_weight = fp_weight

    def forward(self, logits, targets):
        # log(sigmoid(x)) and log(1 - sigmoid(x)) = log(sigmoid(-x)) computed
        # directly from the logits. The previous ``log(sigmoid(x) + 1e-6)``
        # form saturated for large |x|, capping the loss near 13.8 and
        # zeroing the gradient for confidently wrong predictions.
        log_p = F.logsigmoid(logits)
        log_not_p = F.logsigmoid(-logits)
        loss_pos = -self.fn_weight * targets * log_p
        loss_neg = -self.fp_weight * (1 - targets) * log_not_p
        return (loss_pos + loss_neg).mean()

# === Training helpers ===
def train_model(model, loader, optimizer, criterion):
    """Run one training epoch over ``loader``; returns nothing.

    Batches are moved to the module-level ``device`` and the gradient norm is
    clipped to 5.0 before every optimizer step.
    """
    model.train()
    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()

        # Clip after backward, before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

def evaluate_model(model, loader, criterion):
    """Return the mean per-sample loss of ``model`` over ``loader``.

    Runs in eval mode without gradients; batches are moved to the module-level
    ``device``. Each batch loss is weighted by its batch size and the sum is
    divided by ``len(loader.dataset)``.
    """
    model.eval()
    weighted_sum = 0
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(features), targets)
            weighted_sum += batch_loss.item() * len(features)
    return weighted_sum / len(loader.dataset)

# === Objective function ===
def objective(trial):
    """Optuna objective: train a sampled RCA-BiLSTM briefly, return val loss.

    Samples ``hidden_dim``, ``lstm_layers``, ``dropout``, ``lr`` and
    ``mlp_dim`` from ``trial``, trains for 3 epochs on the module-level
    ``train_loader`` with ``AsymmetricBCELoss(fn_weight=1.0, fp_weight=5.0)``,
    and returns the loss on ``val_loader`` (the study minimises it).
    """
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256])
    lstm_layers = trial.suggest_int("lstm_layers", 1, 4)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    mlp_dim = trial.suggest_categorical("mlp_dim", [64, 128, 256])

    # nn.LSTM applies dropout only between stacked layers; with a single layer
    # a non-zero value has no effect and only triggers a UserWarning.
    lstm_dropout = dropout if lstm_layers > 1 else 0.0

    class RCAModel(nn.Module):
        """Attention -> BiLSTM -> MLP head built from this trial's parameters."""

        def __init__(self):
            super().__init__()
            self.rca = ReverseCrossAttention(d_model=15, heads=5)
            self.bilstm = nn.LSTM(input_size=15, hidden_size=hidden_dim,
                                  num_layers=lstm_layers, batch_first=True,
                                  bidirectional=True, dropout=lstm_dropout)
            self.classifier = nn.Sequential(
                nn.Linear(hidden_dim * 2, 1024),  # both LSTM directions concatenated
                nn.GELU(),
                nn.Linear(1024, mlp_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(mlp_dim, 1)
            )

        def forward(self, x):
            x = self.rca(x)
            out, _ = self.bilstm(x)
            x_last = out[:, -1, :]  # features at the last time step
            return self.classifier(x_last).squeeze(-1)

    model = RCAModel().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = AsymmetricBCELoss(fn_weight=1.0, fp_weight=5.0)

    # Short fixed budget per trial keeps the search cheap.
    for _ in range(3):
        train_model(model, train_loader, optimizer, criterion)
    val_loss = evaluate_model(model, val_loader, criterion)
    return val_loss

# === Run the study ===
# Minimise the validation loss returned by `objective` over 20 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# === Print best result ===
print("Best trial:")
print(study.best_trial)


<h3 style="color:yellow">Using LSTMCell (inefficient)</h3>

In [None]:
import torch
import torch.nn as nn
import numpy as np

# Pick the GPU when available and print the choice for the notebook log.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# ----------------------------
# Encoder using LSTMCell (manual 15-step)
# ----------------------------
class Encoder(nn.Module):
    """Encode a sequence by stepping one shared ``nn.LSTMCell`` over time.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the hidden and cell states.

    ``forward`` takes ``input_seq`` of shape (batch, seq_len, input_size) and
    returns the final ``(h, c)``, each of shape (batch, hidden_size).
    """
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm_cell = nn.LSTMCell(input_size, hidden_size)

    def forward(self, input_seq):
        batch_size = input_seq.size(0)
        # Allocate the initial state on the input's own device/dtype rather
        # than a module-level ``device`` global, so the module works wherever
        # its input lives.
        h = input_seq.new_zeros(batch_size, self.hidden_size)
        c = input_seq.new_zeros(batch_size, self.hidden_size)

        # One cell update per time step; x_t has shape (batch, input_size).
        for x_t in input_seq.unbind(dim=1):
            h, c = self.lstm_cell(x_t, (h, c))

        return h, c  # final hidden and cell state

# ----------------------------
# Decoder using LSTMCell (manual 5-step)
# ----------------------------
class Decoder(nn.Module):
    """Autoregressive decoder built on a single ``nn.LSTMCell``.

    Each step's prediction is projected back to ``input_size`` and fed in as
    the next step's input.

    Args:
        input_size: Width of the tensor fed to the LSTM cell.
        hidden_size: Size of the hidden and cell states.
        output_size: Width of each per-step prediction.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm_cell = nn.LSTMCell(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)
        # Maps a prediction back into the cell's input space.
        self.input_projection = nn.Linear(output_size, input_size)

    def forward(self, h, c, decoder_input, output_len):
        """Return predictions of shape (batch_size, output_len, output_size)."""
        state = (h, c)
        step_input = decoder_input
        predictions = []
        for _ in range(output_len):
            state = self.lstm_cell(step_input, state)
            prediction = self.output_layer(state[0])  # (batch_size, output_size)
            predictions.append(prediction)
            step_input = self.input_projection(prediction)
        return torch.stack(predictions, dim=1)

# ----------------------------
# Seq2Seq wrapper
# ----------------------------
class Seq2Seq(nn.Module):
    """Tie an encoder and a decoder into one forecasting module.

    Args:
        encoder: Callable mapping ``src`` to a ``(h, c)`` pair.
        decoder: Callable invoked as ``decoder(h, c, first_input, output_len)``.
        output_len: Number of steps the decoder is asked to produce.
    """
    def __init__(self, encoder, decoder, output_len):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.output_len = output_len

    def forward(self, src):
        hidden, cell = self.encoder(src)
        # The last observed time step seeds the decoder.
        return self.decoder(hidden, cell, src[:, -1, :], self.output_len)


In [None]:
import torch
import joblib
# Hyperparams
input_len = 15
output_len = 5
input_size = 10
output_size = 1
hidden_size = 256
batch_size = 32
num_epochs = 50

# Generate and split data
# X, Y = generate_data(n_samples=1200)
# NOTE(review): these files are named like fitted scalers but are used as the
# data arrays here; another cell loads "Train_Data/scaler_Y.pkl" and calls
# inverse_transform on it — confirm which objects the pickles actually hold.
X_all_scaled = joblib.load("scaler_X.pkl")
Y_all_scaled = joblib.load("scaler_Y.pkl")
# Ensure X_all_scaled and Y_all_scaled are numpy arrays
X_all_scaled = np.array(X_all_scaled)
Y_all_scaled = np.array(Y_all_scaled)

X, Y = X_all_scaled, Y_all_scaled  # Use the preprocessed data
# First 80% of the rows for training, the rest for testing (no shuffle first).
split_idx = int(len(X) * 0.8)
X_train, Y_train = X[:split_idx], Y[:split_idx]
X_test, Y_test = X[split_idx:], Y[split_idx:]
# Shuffle the training rows only; X and Y share one permutation.
perm = np.random.permutation(X_train.shape[0])
X_train, Y_train = X_train[perm], Y_train[perm]
# Convert to tensors
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
Y_train = torch.tensor(Y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)
# Only X_train is sanitised; Y_train, X_test and Y_test are left as loaded.
X_train = torch.nan_to_num(X_train)
print("Any NaNs in X_train?", torch.isnan(X_train).any().item())
print("Any NaNs in Y_train?", torch.isnan(Y_train).any().item())
print("Any Infs in X_train?", torch.isinf(X_train).any().item())
print("Any Infs in Y_train?", torch.isinf(Y_train).any().item())

# ========================
# Initialize Model
# ========================
encoder = Encoder(input_size, hidden_size).to(device)
decoder = Decoder(input_size, hidden_size, output_size).to(device)
model = Seq2Seq(encoder, decoder, output_len).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training: manual mini-batching over the pre-shuffled training tensors.
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    # Count batches as they run. The old ``len(X_train) // batch_size``
    # ignored the partial last batch and was zero (division by zero) when the
    # dataset was smaller than one batch.
    num_batches = 0
    for i in range(0, len(X_train), batch_size):
        x_batch = X_train[i:i + batch_size]
        y_batch = Y_train[i:i + batch_size]

        pred = model(x_batch)

        y_batch = y_batch.unsqueeze(-1)  # add the trailing axis so targets match pred's shape
        loss = criterion(pred, y_batch)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        print(f"Batch {i // batch_size + 1}, Loss: {loss.item():.6f}", end='\r')
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / max(num_batches, 1):.6f}")
print("\nTraining complete.\n")


<h3 style="Color:Yellow">Custom Model</h3>

In [None]:
import joblib  # or just `import joblib` depending on version
import numpy as np

# Load the fitted target scaler, used to map predictions back to original units.
scaler_Y = joblib.load("Train_Data/scaler_Y.pkl")

model.eval()

# NOTE(review): `test_loader` is not created in this cell; in the visible
# source it is first defined in a later cell, so that cell must run first.
with torch.no_grad():
    for i, (X_batch, Y_batch) in enumerate(test_loader):
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)

        preds = model(X_batch)

        # Reshape for unscaling: drop the trailing axis of the predictions
        preds_np = preds.cpu().numpy().squeeze(-1)
        targets_np = Y_batch.cpu().numpy()  # already (batch_size, pred_len)

        # Unscale
        preds_unscaled = scaler_Y.inverse_transform(preds_np).reshape(Y_batch.shape[0], -1)
        targets_unscaled = scaler_Y.inverse_transform(targets_np).reshape(Y_batch.shape[0], -1)

        # Print a few samples from this batch
        print(f"\n🟦 Batch {i+1}")
        for j in range(min(3, len(preds_unscaled))):  # Only show top 3 per batch for readability
            print(f"Sample {j+1} — True: {targets_unscaled[j]}, Predicted: {preds_unscaled[j]}")


In [None]:
import plotly.graph_objects as go
# ========================
# Testing and Plotting
# ========================
model.eval()
with torch.no_grad():
    preds = model(X_test[:5])  # up to 5 samples

# Iterate over what was actually predicted so fewer than 5 test samples
# do not raise an IndexError.
for i in range(len(preds)):
    past = X_test[i].cpu().numpy().flatten()
    true_future = Y_test[i].cpu().numpy().flatten()
    predicted_future = preds[i].cpu().numpy().flatten()

    # Place the forecast horizon right after the past window on the x-axis;
    # without explicit x values every trace started at 0 and overlapped,
    # contradicting the axis title below.
    # NOTE(review): `past` is flattened, so with more than one input feature
    # it is longer than the number of time steps — confirm the intended series.
    true_x = list(range(len(past), len(past) + len(true_future)))
    pred_x = list(range(len(past), len(past) + len(predicted_future)))

    fig = go.Figure()
    fig.add_trace(go.Scatter(y=past, mode='lines+markers', name='Past Input', line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=true_x, y=true_future, mode='lines+markers', name='True Future', line=dict(color='green')))
    fig.add_trace(go.Scatter(x=pred_x, y=predicted_future, mode='lines+markers', name='Predicted Future', line=dict(color='red', dash='dash')))

    fig.update_layout(
        title=f"Option Price Forecast Sample {i+1}",
        xaxis_title="Time Step (0–14: past, 15–19: future)",
        yaxis_title="Price (Normalized)",
        template="plotly_white"
    )
    fig.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


# Define the model again for completeness
class CustomEncoder(nn.Module):
    """Encoder with a separate ``nn.LSTMCell`` for every time step.

    Args:
        input_dim: Feature size of each time step.
        hidden_dim: Size of the hidden and cell states.
        seq_len: Number of time steps (and therefore cells) to process.

    ``forward`` takes ``input_seq`` of shape (batch, seq_len, input_dim) and
    returns the final ``(h, c)``, each of shape (batch, hidden_dim).
    """
    def __init__(self, input_dim, hidden_dim, seq_len):
        super().__init__()
        # Keep the size on the instance: forward() used to read a module-level
        # ``hidden_dim`` global, which breaks whenever it is missing or differs
        # from the value this encoder was built with.
        self.hidden_dim = hidden_dim
        self.lstm_cells = nn.ModuleList([nn.LSTMCell(input_dim, hidden_dim) for _ in range(seq_len)])

    def forward(self, input_seq):
        batch_size = input_seq.size(0)
        # Zero initial state on the same device/dtype as the input.
        h = input_seq.new_zeros(batch_size, self.hidden_dim)
        c = input_seq.new_zeros(batch_size, self.hidden_dim)
        # Cell t consumes time step t.
        for t, cell in enumerate(self.lstm_cells):
            h, c = cell(input_seq[:, t, :], (h, c))
        return h, c

class CustomDecoder(nn.Module):
    """Decoder with one ``nn.LSTMCell`` per predicted step.

    Args:
        hidden_dim: Size of the hidden and cell states.
        output_dim: Width of each prediction (also the cell input width).
        pred_len: Number of steps to predict.

    ``forward`` returns a tensor of shape (batch, pred_len, output_dim).
    With probability ``teacher_forcing_ratio`` per step (and only when
    ``target_seq`` is given) the ground-truth value is fed as the next input
    instead of the model's own prediction.
    """
    def __init__(self, hidden_dim, output_dim, pred_len):
        super().__init__()
        self.lstm_cells = nn.ModuleList([nn.LSTMCell(output_dim, hidden_dim) for _ in range(pred_len)])
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, h, c, target_seq=None, teacher_forcing_ratio=0.0):
        batch_size = h.size(0)
        out_dim = self.output_layer.out_features
        outputs = []
        # Initial input sized by output_dim; it used to be hard-coded to
        # width 1, which broke every configuration with output_dim != 1.
        inp = torch.zeros(batch_size, out_dim, device=h.device)
        for t, cell in enumerate(self.lstm_cells):
            h, c = cell(inp, (h, c))
            out = self.output_layer(h)
            outputs.append(out.unsqueeze(1))
            if target_seq is not None and torch.rand(1).item() < teacher_forcing_ratio:
                # (batch,) or (batch, output_dim) -> (batch, output_dim)
                inp = target_seq[:, t].reshape(batch_size, -1)
            else:
                inp = out
        return torch.cat(outputs, dim=1)
class MemoryNetwork(nn.Module):
    """Two-layer MLP (hidden_dim -> 2048 -> hidden_dim, ReLU in between).

    It transforms a hidden-state tensor without changing its last dimension.
    """
    def __init__(self, hidden_dim):
        super().__init__()
        expand = nn.Linear(hidden_dim, 2048)
        project = nn.Linear(2048, hidden_dim)
        self.net = nn.Sequential(expand, nn.ReLU(), project)

    def forward(self, h):
        transformed = self.net(h)
        return transformed
class Seq2SeqLSTM(nn.Module):
    """Encoder -> memory MLP -> decoder forecasting model.

    The encoder's final hidden state is passed through ``MemoryNetwork``
    before decoding; the cell state goes to the decoder unchanged.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, seq_len, pred_len):
        super().__init__()
        self.encoder = CustomEncoder(input_dim, hidden_dim, seq_len)
        self.memory = MemoryNetwork(hidden_dim)
        self.decoder = CustomDecoder(hidden_dim, output_dim, pred_len)

    def forward(self, x, y=None, teacher_forcing_ratio=0.0):
        hidden, cell = self.encoder(x)
        refined = self.memory(hidden)
        # ``y`` (optional targets) is only used for teacher forcing.
        return self.decoder(refined, cell, y, teacher_forcing_ratio)

# Initialize model
# These module-level hyperparameters are also read by other code in this cell.
input_dim = 10
hidden_dim = 512
output_dim = 1
seq_len = 15
pred_len = 5
batch_size = 32
model = Seq2SeqLSTM(input_dim, hidden_dim, output_dim, seq_len, pred_len)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss, optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Text summary of the architecture (kept for optional printing below).
model_summary = str(model)
# print(model_summary[:700])  # return partial summary

# Load the pre-scaled training arrays from the .npz archive.
import numpy as np
data= np.load("Train_Data/option_train_data_scaled.npz")
print(data)
X, Y = data['X_all'], data['Y_all']  # Use the preprocessed data
# First 80% of the rows for training, the rest for testing.
split_idx = int(len(X) * 0.8)
X_train, Y_train = X[:split_idx], Y[:split_idx]
X_test, Y_test = X[split_idx:], Y[split_idx:]
# Shuffle the training rows only; X and Y share one permutation.
perm = np.random.permutation(X_train.shape[0])
X_train, Y_train = X_train[perm], Y_train[perm]
# Convert to tensors
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
Y_train = torch.tensor(Y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)
# Only X_train is sanitised; the other tensors are left as loaded.
X_train = torch.nan_to_num(X_train)
print("Any NaNs in X_train?", torch.isnan(X_train).any().item())
print("Any NaNs in Y_train?", torch.isnan(Y_train).any().item())
print("Any Infs in X_train?", torch.isinf(X_train).any().item())
print("Any Infs in Y_train?", torch.isinf(Y_train).any().item())


from torch.utils.data import TensorDataset, DataLoader
# 1. Create TensorDataset
train_dataset = TensorDataset(X_train, Y_train)

# 2. Create DataLoader (reshuffled every epoch)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

def train(model, dataloader, optimizer, loss_fn, epochs=10):
    """Train ``model`` with full teacher forcing and print the mean epoch loss.

    Args:
        model: Module called as ``model(X, Y, teacher_forcing_ratio=1.0)``.
        dataloader: Iterable of ``(X_batch, Y_batch)`` pairs supporting ``len()``.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Loss called as ``loss_fn(output, Y_batch.unsqueeze(-1))``;
            assumes targets lack the trailing feature axis of ``output``.
        epochs: Number of passes over ``dataloader``.

    Previously this function relied on module-level ``device``,
    ``batch_size``, ``X_train`` and ``num_epochs``, so it printed the wrong
    batch number, the wrong epoch total and a wrongly normalised loss.
    """
    # Move batches to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    num_batches = len(dataloader)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for i, (X_batch, Y_batch) in enumerate(dataloader):
            if first_param is not None:
                X_batch = X_batch.to(first_param.device)
                Y_batch = Y_batch.to(first_param.device)

            optimizer.zero_grad()
            output = model(X_batch, Y_batch, teacher_forcing_ratio=1.0)
            loss = loss_fn(output, Y_batch.unsqueeze(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # ``i`` is already the batch index.
            print(f"Batch {i + 1}/{num_batches}, Loss: {loss.item():.6f}", end='\r')
        # Average over the batches actually processed; guard the empty loader.
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / max(num_batches, 1):.6f}")
    print("\nTraining complete.\n")
# Start training: 1000 passes over train_loader with the MSE criterion.
train(model, train_loader, optimizer, criterion, epochs=1000)

In [None]:
import joblib  # or just `import joblib` depending on version
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Wrap the held-out tensors; order is preserved (no shuffling) for evaluation.
test_dataset = TensorDataset(X_test, Y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Load the fitted target scaler, used to map predictions back to original units.
scaler_Y = joblib.load("Train_Data/scaler_Y.pkl")

model.eval()

with torch.no_grad():
    for i, (X_batch, Y_batch) in enumerate(test_loader):
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)

        # No targets are passed, so the decoder runs without teacher forcing.
        preds = model(X_batch)

        # Reshape for unscaling: drop the trailing axis of the predictions
        preds_np = preds.cpu().numpy().squeeze(-1)
        targets_np = Y_batch.cpu().numpy()  # already (batch_size, pred_len)

        # Unscale
        preds_unscaled = scaler_Y.inverse_transform(preds_np).reshape(Y_batch.shape[0], -1)
        targets_unscaled = scaler_Y.inverse_transform(targets_np).reshape(Y_batch.shape[0], -1)

        # Print a few samples from this batch
        print(f"\n🟦 Batch {i+1}")
        for j in range(min(3, len(preds_unscaled))):  # Only show top 3 per batch for readability
            print(f"Sample {j+1} — True: {targets_unscaled[j]}, Predicted: {preds_unscaled[j]}")


<h3 style="Color:Yellow">Custom Model- Single LSTM for Encoder and Decoder</h3>

In [None]:
import numpy as np
import torch

# Parameters
total_points = 1000
seq_len = 15
pred_len = 5

# Synthetic signal: a sine wave sampled on [0, 100]
x = np.linspace(0, 100, total_points)
wave = np.sin(x)  # or use np.cos(x)

# Min-max normalise the wave into [0, 1]
wave = (wave - wave.min()) / np.ptp(wave)

# Sliding windows: seq_len inputs followed by pred_len targets.
# NOTE(review): the range stops one start index short of the last window
# that would still fit; kept as-is to preserve the (980, ...) shapes below.
n_windows = len(wave) - seq_len - pred_len
X = np.array([wave[s:s + seq_len] for s in range(n_windows)])
Y = np.array([wave[s + seq_len:s + seq_len + pred_len] for s in range(n_windows)])

# Add a trailing feature axis for model input: [N, seq_len] -> [N, seq_len, 1];
# Y stays [N, pred_len].
X = np.expand_dims(X, axis=-1)

print(X.shape, Y.shape)  # e.g. (980, 15, 1), (980, 5)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import joblib
from torch.utils.data import TensorDataset, DataLoader

# ============================
# Encoder
# ============================
class FastEncoder(nn.Module):
    """Sequence encoder: an 8-layer bidirectional ``nn.LSTM`` (dropout 0.2).

    ``forward`` discards the per-step outputs and returns the final
    ``(h, c)`` for every layer and direction, i.e. tensors whose leading
    dimension is 8 * 2 = 16 — not only the last layer's state.
    """
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=8,
            batch_first=True,
            dropout=0.2,
            bidirectional=True,
        )

    def forward(self, input_seq):
        _, final_state = self.lstm(input_seq)
        hidden, cell = final_state
        return hidden, cell

# ============================
# Decoder
# ============================
class FastDecoder(nn.Module):
    """Autoregressive decoder: a 16-layer ``nn.LSTM`` plus a linear head.

    Args:
        hidden_dim: Hidden size of the LSTM.
        output_dim: Width of each prediction (also the LSTM input width).
        pred_len: Number of steps to predict.

    ``forward`` expects ``h`` and ``c`` of shape (16, batch, hidden_dim) and
    returns a tensor of shape (batch, pred_len, output_dim). With probability
    ``teacher_forcing_ratio`` per step (and only when ``target_seq`` is given)
    the ground-truth value is fed as the next input.
    """
    def __init__(self, hidden_dim, output_dim, pred_len):
        super().__init__()
        self.pred_len = pred_len
        self.lstm = nn.LSTM(output_dim, hidden_dim, batch_first=True ,num_layers=16, dropout=0.2)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, h, c, target_seq=None, teacher_forcing_ratio=0.0):
        batch_size = h.size(1)
        out_dim = self.output_layer.out_features
        outputs = []
        # Start token sized by output_dim; it used to be hard-coded to width 1,
        # which broke every configuration with output_dim != 1.
        input_step = torch.zeros(batch_size, 1, out_dim, device=h.device)  # [batch, 1, output_dim]
        hidden = (h, c)  # pass the full multi-layer state

        for t in range(self.pred_len):
            out, hidden = self.lstm(input_step, hidden)
            prediction = self.output_layer(out[:, -1, :])  # [batch, output_dim]
            outputs.append(prediction.unsqueeze(1))  # [batch, 1, output_dim]

            if target_seq is not None and torch.rand(1).item() < teacher_forcing_ratio:
                # (batch,) or (batch, output_dim) -> [batch, 1, output_dim]
                input_step = target_seq[:, t].reshape(batch_size, 1, -1)
            else:
                input_step = prediction.unsqueeze(1)

        return torch.cat(outputs, dim=1)  # [batch, pred_len, output_dim]

# ============================
# Memory Network
# ============================
class MemoryNetwork(nn.Module):
    """Deep MLP that transforms a hidden state without changing its width.

    Layout: LayerNorm -> Linear/GELU stack (hidden_dim -> 1024 -> 2048 ->
    4096 -> 2048) -> Linear back to hidden_dim -> LayerNorm -> Tanh. The
    final Tanh bounds every output element to [-1, 1].
    """
    def __init__(self, hidden_dim):
        super().__init__()
        widths = [hidden_dim, 1024, 2048, 4096, 2048]

        layers = [nn.LayerNorm(hidden_dim)]
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.GELU())
        layers.append(nn.Linear(2048, hidden_dim))
        layers.append(nn.LayerNorm(hidden_dim))  # stabilises across features
        layers.append(nn.Tanh())                 # bounded memory for the decoder

        self.net = nn.Sequential(*layers)

    def forward(self, h):
        transformed = self.net(h)
        return transformed

# ============================
# Seq2Seq Model
# ============================
class FastSeq2Seq(nn.Module):
    """FastEncoder -> MemoryNetwork (on the hidden state) -> FastDecoder.

    ``seq_len`` is accepted for interface compatibility but not used here.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, seq_len, pred_len):
        super().__init__()
        self.encoder = FastEncoder(input_dim, hidden_dim)
        self.memory = MemoryNetwork(hidden_dim)
        self.decoder = FastDecoder(hidden_dim, output_dim, pred_len)

    def forward(self, x, y=None, teacher_forcing_ratio=0.0):
        # h, c: [layers * directions, batch, hidden]
        h, c = self.encoder(x)

        # Run every layer/direction slice of h through the memory network by
        # folding the leading axis into the batch axis, then restore the shape.
        stacked, batch, width = h.shape
        flat = h.view(-1, width)                              # [stacked * B, H]
        h = self.memory(flat).view(stacked, batch, width)     # [stacked, B, H]

        # The cell state is forwarded to the decoder unchanged.
        return self.decoder(h, c, y, teacher_forcing_ratio)

# ============================
# Model and Training Setup
# ============================
# NOTE(review): input_dim is 10 although the sine-wave cell earlier builds
# inputs with a single feature; the data actually used is loaded from the
# .npz archive further down — confirm its feature width.
input_dim = 10
hidden_dim = 512
output_dim = 1
seq_len = 15
pred_len = 5

model = FastSeq2Seq(input_dim, hidden_dim, output_dim, seq_len, pred_len)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ============================
# Load and Prepare Data
# ============================
# NOTE(review): unpacking .values() relies on the archive holding exactly two
# arrays in (X, Y) order, and allow_pickle=True executes pickled objects on
# load — only use with a trusted file.
X_all_scaled, Y_all_scaled = np.load("Train_Data/option_train_data_scaled.npz", allow_pickle=True).values()
X, Y = X_all_scaled, Y_all_scaled

# First 80% of the rows for training, the rest for testing.
split_idx = int(len(X) * 0.8)
X_train, Y_train = X[:split_idx], Y[:split_idx]
X_test, Y_test = X[split_idx:], Y[split_idx:]

# Shuffle the training rows only; X and Y share one permutation.
perm = np.random.permutation(X_train.shape[0])
X_train, Y_train = X_train[perm], Y_train[perm]

X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
Y_train = torch.tensor(Y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

# Only X_train is sanitised; the other tensors are left as loaded.
X_train = torch.nan_to_num(X_train)

train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
# Gradient scaler shared by the mixed-precision training function below.
scaler = torch.amp.GradScaler()
# NOTE(review): leftover debug marker; prints a meaningless string.
print("hrwvbr")
# ============================
# Training Function
# ============================

def weighted_mse_loss(pred, target, weights):
    """Return the mean of the squared error after scaling it by ``weights``.

    ``weights`` is multiplied element-wise into the squared error, so it must
    broadcast against ``pred - target`` (the callers in this file use
    ``[batch, pred_len, output_dim]`` tensors with ``[1, pred_len, 1]``
    weights). The mean runs over every element and is not normalised by the
    sum of the weights.
    """
    error = pred - target
    weighted_sq = weights * (error * error)
    return weighted_sq.mean()
def weighted_mae_loss(pred, target, weights):
    """Return the mean of the absolute error after scaling it by ``weights``.

    ``weights`` must broadcast against ``pred - target``; the mean runs over
    every element and is not normalised by the sum of the weights.
    """
    abs_error = (pred - target).abs()
    return torch.mean(weights * abs_error)

def train(model, dataloader, optimizer, weights, epochs=10):
    """Train ``model`` with AMP, gradient clipping and a decaying teacher-forcing ratio.

    Relies on the module-level ``device`` and ``scaler`` (a ``GradScaler``).

    Args:
        model: called as ``model(X_batch, Y_batch, teacher_forcing_ratio)``.
        dataloader: yields ``(X_batch, Y_batch)`` pairs.
        optimizer: optimizer over ``model.parameters()``.
        weights: accepted for interface compatibility; the active objective is
            plain MSE, so it is currently unused.
        epochs: number of passes over ``dataloader``.

    The reported epoch loss skips the first 50 batches and averages over the
    batches that were actually counted. If an epoch has 50 batches or fewer,
    the average over all of its batches is reported instead.
    """
    warmup_batches = 50
    mse = nn.MSELoss()

    model.train()
    for epoch in range(epochs):
        # Linearly decay teacher forcing from 1.0 towards 0 across the epochs.
        teacher_forcing_ratio = max(0, 1.0 - epoch / epochs)
        counted_loss = 0.0
        counted_batches = 0
        overall_loss = 0.0
        seen_batches = 0
        for i, (X_batch, Y_batch) in enumerate(dataloader):
            X_batch = X_batch.to(device, non_blocking=True)
            Y_batch = Y_batch.to(device, non_blocking=True)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda'):
                output = model(X_batch, Y_batch, teacher_forcing_ratio)  # [B, T, 1]
                loss = mse(output, Y_batch.unsqueeze(-1))

            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping so max_norm applies to
            # their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            batch_loss = loss.item()
            overall_loss += batch_loss
            seen_batches += 1
            if i >= warmup_batches:
                counted_loss += batch_loss
                counted_batches += 1
            if i % 50 == 0:
                print(f"Batch {i + 1}, Loss: {batch_loss:.6f}", end='\r')

        # Divide by the number of batches that contributed to the sum, not by
        # len(dataloader), which under-reported the loss.
        if counted_batches:
            epoch_loss = counted_loss / counted_batches
        elif seen_batches:
            epoch_loss = overall_loss / seen_batches
        else:
            epoch_loss = float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.6f}")
    print("\n✅ Training complete.\n")

# ============================
# Start Training
# ============================
# Per-horizon weights shaped [1, pred_len, 1].
# NOTE(review): train() above only mentions `weights` in a commented-out loss line,
# so these values currently have no effect on optimisation.
weights = torch.tensor([0.1, 0.1, 0.3, 0.7, 1.0], device=device).view(1, -1, 1)
train(model, train_loader, optimizer,  weights, epochs=1000)


In [None]:
import joblib  # or just `import joblib` depending on version
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Evaluate on the held-out split. This cell previously wrapped X_train / Y_train,
# so the printed "test" samples were rows the model had been fitted on.
# X_test is sanitised here because only X_train went through nan_to_num at load time.
test_dataset = TensorDataset(torch.nan_to_num(X_test), Y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Load the target scaler so predictions can be mapped back to original units.
scaler_Y = joblib.load("Train_Data/scaler_Y.pkl")

model.eval()

with torch.no_grad():
    for i, (X_batch, Y_batch) in enumerate(test_loader):
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)

        # No targets are passed, so the decoder feeds back its own predictions.
        preds = model(X_batch)

        # Drop the trailing output dimension so both arrays are 2-D for the scaler.
        preds_np = preds.cpu().numpy().squeeze(-1)
        targets_np = Y_batch.cpu().numpy()

        # Unscale
        preds_unscaled = scaler_Y.inverse_transform(preds_np).reshape(Y_batch.shape[0], -1)
        targets_unscaled = scaler_Y.inverse_transform(targets_np).reshape(Y_batch.shape[0], -1)

        # Print a few samples from this batch
        print(f"\n🟦 Batch {i+1}")
        for j in range(min(3, len(preds_unscaled))):  # Only show top 3 per batch for readability
            print(f"Sample {j+1} — True: {targets_unscaled[j]}, Predicted: {preds_unscaled[j]}")


<h3 style="color:yellow">Custom model with single attention layer</h3>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
class Attention(nn.Module):
    """Additive attention scoring every encoder step against one decoder state."""

    def __init__(self, hidden_dim):
        super().__init__()
        # Projects each concatenated [decoder state ; encoder output] pair to hidden_dim.
        self.attn = nn.Linear(2 * hidden_dim, hidden_dim)
        # Learned vector that reduces each projected pair to a scalar score.
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        """Return ``(context, attn_weights)``.

        Args:
            hidden: decoder state, ``[batch, hidden_dim]``.
            encoder_outputs: ``[batch, seq_len, hidden_dim]``.

        Returns:
            ``context`` of shape ``[batch, hidden_dim]`` (attention-weighted sum
            of the encoder outputs) and ``attn_weights`` of shape
            ``[batch, seq_len]`` (softmax over the sequence axis).
        """
        _, steps, _ = encoder_outputs.size()

        # Broadcast the decoder state along the time axis so it pairs with each step.
        tiled_state = hidden.unsqueeze(1).expand(-1, steps, -1)
        paired = torch.cat((tiled_state, encoder_outputs), dim=2)

        scores = torch.tanh(self.attn(paired)) @ self.v  # [batch, seq_len]
        attn_weights = torch.softmax(scores, dim=1)

        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, attn_weights
class MemoryNetwork(nn.Module):
    """Residual feed-forward block: returns ``h + net(h)``.

    ``net`` is LayerNorm followed by a 3-layer MLP
    (hidden_dim -> 1024 -> 2048 -> hidden_dim) with ReLU activations, so the
    output keeps the input's shape.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        stages = [
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 2048),
            nn.ReLU(),
            nn.Linear(2048, hidden_dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, h):
        """Apply the MLP to ``h`` and add the result back onto ``h``."""
        update = self.net(h)
        return h + update


class FastEncoder(nn.Module):
    """Five-layer, batch-first LSTM encoder (dropout 0.2 between layers)."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=5,
            dropout=0.2,
            batch_first=True,
        )

    def forward(self, input_seq):
        """Return ``(output, (h, c))`` exactly as produced by the LSTM.

        ``output`` is ``[batch, seq_len, hidden_dim]``; ``h`` and ``c`` hold
        one state per LSTM layer.
        """
        output, state = self.lstm(input_seq)
        hidden, cell = state
        return output, (hidden, cell)

class FastDecoderWithAttention(nn.Module):
    """Autoregressive LSTMCell decoder that attends over the encoder outputs.

    At each of the ``pred_len`` steps the cell receives the previous output
    (or the ground truth when teacher forcing fires), a scalar time encoding
    ``(t + 1) / pred_len`` and the attention context.
    """

    def __init__(self, hidden_dim, output_dim, pred_len):
        super().__init__()
        self.pred_len = pred_len
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.attention = Attention(hidden_dim)
        # Input = previous output (output_dim) + time encoding (1) + context (hidden_dim).
        self.lstm_cell = nn.LSTMCell(hidden_dim + output_dim + 1, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, encoder_outputs, h, c, target_seq=None, teacher_forcing_ratio=0.0):
        """Decode ``pred_len`` steps and return ``[batch, pred_len, output_dim]``.

        Args:
            encoder_outputs: ``[batch, seq_len, hidden_dim]``.
            h, c: stacked encoder states; only the last layer (``h[-1]``,
                ``c[-1]``) seeds the decoder cell.
            target_seq: optional ground truth, ``[batch, pred_len]`` or
                ``[batch, pred_len, output_dim]``.
            teacher_forcing_ratio: per-step probability of feeding the ground
                truth instead of the model's own prediction.
        """
        batch_size = h.size(1)
        h, c = h[-1], c[-1]  # use last layer of stacked LSTM
        device = h.device

        outputs = []
        # The first decoder input is all zeros. Its width follows output_dim so
        # the cell input matches the size declared in __init__ (was fixed at 1).
        input_step = torch.zeros(batch_size, self.output_dim, device=device)

        for t in range(self.pred_len):
            context, _ = self.attention(h, encoder_outputs)  # [batch, hidden_dim]

            time_encoding = torch.full((batch_size, 1), (t + 1) / self.pred_len, device=device)

            # [prev_output, time_encoding, context] -> [batch, output_dim + 1 + hidden_dim]
            rnn_input = torch.cat([input_step, time_encoding, context], dim=1)

            h, c = self.lstm_cell(rnn_input, (h, c))
            prediction = self.output_layer(h)  # [batch, output_dim]
            outputs.append(prediction.unsqueeze(1))

            if target_seq is not None and torch.rand(1).item() < teacher_forcing_ratio:
                truth = target_seq[:, t]
                # A 2-D target yields a [batch] slice; add the feature axis back.
                input_step = truth.unsqueeze(1) if truth.dim() == 1 else truth
            else:
                input_step = prediction

        return torch.cat(outputs, dim=1)

class FastSeq2Seq(nn.Module):
    """Encoder -> memory refinement of the encoder outputs -> attention decoder.

    ``seq_len`` is accepted by the constructor but not used by it.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, seq_len, pred_len):
        super().__init__()
        self.encoder = FastEncoder(input_dim, hidden_dim)
        self.memory = MemoryNetwork(hidden_dim)
        self.decoder = FastDecoderWithAttention(hidden_dim, output_dim, pred_len)

    def forward(self, x, y=None, teacher_forcing_ratio=0.0):
        """Return the decoder's predictions for input ``x``.

        The per-step encoder outputs are passed through ``self.memory`` before
        the decoder attends over them; the encoder's final states are handed
        to the decoder unchanged.
        """
        encoder_outputs, state = self.encoder(x)
        h, c = state

        refined_outputs = self.memory(encoder_outputs)  # shape is preserved

        return self.decoder(refined_outputs, h, c, y, teacher_forcing_ratio)

# ============================
# Model and Training Setup
# ============================
# Hyper-parameters handed to FastSeq2Seq below.
input_dim = 10
hidden_dim = 512
output_dim = 1
seq_len = 15
pred_len = 5

model = FastSeq2Seq(input_dim, hidden_dim, output_dim, seq_len, pred_len)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# NOTE(review): `criterion` is not referenced by the training loop in this cell,
# which optimises composite_loss.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ============================
# Load and Prepare Data
# ============================
# The unpacking relies on the archive holding exactly two arrays, in (X, Y) order.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use
# it with archives from a trusted source.
X_all_scaled, Y_all_scaled = np.load("Train_Data/option_train_data_scaled.npz", allow_pickle=True).values()
X, Y = X_all_scaled, Y_all_scaled

# Positional 80/20 split: the first 80% of rows train, the rest are held out.
split_idx = int(len(X) * 0.8)
X_train, Y_train = X[:split_idx], Y[:split_idx]
X_test, Y_test = X[split_idx:], Y[split_idx:]

# Shuffle the training rows only (unseeded, so the order differs between runs).
perm = np.random.permutation(X_train.shape[0])
X_train, Y_train = X_train[perm], Y_train[perm]

# The whole dataset is moved onto `device` up front.
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
Y_train = torch.tensor(Y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

# NOTE(review): only X_train is sanitised; X_test, Y_train and Y_test keep any NaNs.
X_train = torch.nan_to_num(X_train)

train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
# Module-level gradient scaler; train() below reads it as a global.
scaler = torch.amp.GradScaler()
# NOTE(review): looks like a leftover debug marker.
print("hrwvbr")
# ============================
# Training Function
# ============================

def weighted_mse_loss(pred, target, weights):
    """Return the mean of the squared error after scaling it by ``weights``.

    ``weights`` must broadcast against ``pred - target`` (this file uses
    ``[batch, pred_len, output_dim]`` tensors with ``[1, pred_len, 1]``
    weights). The mean runs over every element and is not divided by the sum
    of the weights.
    """
    residual = pred - target
    scaled = weights * (residual * residual)
    return torch.mean(scaled)
def weighted_mae_loss(pred, target, weights):
    """Return the mean of ``weights * |pred - target|`` over every element."""
    magnitude = torch.abs(pred - target)
    scaled = weights * magnitude
    return scaled.mean()

def variance_loss(pred):
    """Return minus the mean variance of ``pred`` along dim 1.

    The variance uses ``torch.var``'s default estimator. The result is
    negative or zero, so adding it to a loss rewards predictions that vary
    along dim 1 (the timestep axis for ``[batch, pred_len, output_dim]``).
    """
    per_series_variance = pred.var(dim=1)
    return -(per_series_variance.mean())

def trend_loss(pred, target):
    """Return the mean absolute error between step-to-step changes.

    Both tensors are differenced along dim 1 (each step minus the one before
    it), then compared with an L1 loss, so a constant offset between ``pred``
    and ``target`` is not penalised.
    """
    def step_changes(series):
        return series[:, 1:, :] - series[:, :-1, :]

    l1 = nn.L1Loss()
    return l1(step_changes(pred), step_changes(target))

def composite_loss(pred, target, weights):
    """Blend the weighted MAE, weighted MSE, trend and variance terms.

    Both tensors are cast to float32 first. The fixed mix is
    ``0.4 * MAE + 0.3 * MSE + 0.2 * trend + 0.1 * variance``; the variance
    term is non-positive, so it rewards variation across timesteps.
    """
    pred, target = pred.float(), target.float()

    # (coefficient, value) pairs, accumulated in this order.
    terms = (
        (0.4, weighted_mae_loss(pred, target, weights)),
        (0.3, weighted_mse_loss(pred, target, weights)),
        (0.1, variance_loss(pred)),
        (0.2, trend_loss(pred, target)),
    )
    (c_mae, mae), (c_mse, mse), (c_var, var_penalty), (c_tr, tr_loss) = terms

    total = c_mae * mae + c_mse * mse + c_tr * tr_loss + c_var * var_penalty
    return total

def train(model, dataloader, optimizer, weights, epochs=10):
    """Train ``model`` on ``composite_loss`` with AMP and gradient clipping.

    Relies on the module-level ``device`` and ``scaler`` (a ``GradScaler``).

    Args:
        model: called as ``model(X_batch, Y_batch, teacher_forcing_ratio)``.
        dataloader: yields ``(X_batch, Y_batch)`` pairs.
        optimizer: optimizer over ``model.parameters()``.
        weights: per-step weights forwarded to ``composite_loss``.
        epochs: number of passes over ``dataloader``.

    Batches with a NaN/inf loss are skipped without an optimizer step. They
    are also left out of the epoch average, which previously divided by
    ``len(dataloader)`` and so under-reported the loss whenever a batch was
    skipped.
    """
    model.train()
    for epoch in range(epochs):
        # Linearly decay teacher forcing from 1.0 towards 0 across the epochs.
        teacher_forcing_ratio = max(0, 1.0 - epoch / epochs)
        total_loss = 0.0
        used_batches = 0
        for i, (X_batch, Y_batch) in enumerate(dataloader):
            X_batch = X_batch.to(device, non_blocking=True)
            Y_batch = Y_batch.to(device, non_blocking=True)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda'):
                output = model(X_batch, Y_batch, teacher_forcing_ratio)  # [B, T, 1]
                loss = composite_loss(output, Y_batch.unsqueeze(-1), weights)

            if torch.isnan(loss) or torch.isinf(loss):
                print(f"⚠️ Skipping batch {i+1} due to invalid loss: {loss.item()}")
                continue  # Don't backward/update on invalid gradients

            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping so max_norm applies to
            # their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            batch_loss = loss.item()
            total_loss += batch_loss
            used_batches += 1
            if i % 50 == 0:
                print(f"Batch {i + 1}, Loss: {batch_loss:.6f}", end='\r')

        epoch_loss = total_loss / used_batches if used_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.6f}")
    print("\n✅ Training complete.\n")

# ============================
# Start Training
# ============================
# Per-horizon weights shaped [1, pred_len, 1] so they broadcast over
# [batch, pred_len, output_dim]; later steps are weighted more heavily.
weights = torch.tensor([0.1, 0.1, 0.3, 0.7, 1.0], device=device).view(1, -1, 1)
train(model, train_loader, optimizer,  weights, epochs=100)



In [None]:
import joblib  # or just `import joblib` depending on version
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Evaluate on the held-out split. This cell previously wrapped X_train / Y_train,
# so the printed "test" samples were rows the model had been fitted on.
# X_test is sanitised here because only X_train went through nan_to_num at load time.
test_dataset = TensorDataset(torch.nan_to_num(X_test), Y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Load the target scaler so predictions can be mapped back to original units.
scaler_Y = joblib.load("Train_Data/scaler_Y.pkl")

model.eval()

with torch.no_grad():
    for i, (X_batch, Y_batch) in enumerate(test_loader):
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)

        # No targets are passed, so the decoder feeds back its own predictions.
        preds = model(X_batch)

        # Drop the trailing output dimension so both arrays are 2-D for the scaler.
        preds_np = preds.cpu().numpy().squeeze(-1)
        targets_np = Y_batch.cpu().numpy()

        # Unscale
        preds_unscaled = scaler_Y.inverse_transform(preds_np).reshape(Y_batch.shape[0], -1)
        targets_unscaled = scaler_Y.inverse_transform(targets_np).reshape(Y_batch.shape[0], -1)

        # Print a few samples from this batch
        print(f"\n🟦 Batch {i+1}")
        for j in range(min(3, len(preds_unscaled))):  # Only show top 3 per batch for readability
            print(f"Sample {j+1} — True: {targets_unscaled[j]}, Predicted: {preds_unscaled[j]}")


<h3 style="color:yellow">"Attention Is All You Need" Transformer</h3>

In [None]:
import torch
import torch.nn as nn
import math

# --- Positional Encoding ---
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Even feature indices carry ``sin`` and odd indices carry ``cos`` of
    ``position / 10000 ** (2i / d_model)``. The table is precomputed for
    ``max_len`` positions and stored as the non-trainable buffer ``pe`` of
    shape ``[1, max_len, d_model]``.

    Odd ``d_model`` is supported: there is one fewer cosine column than sine
    column, so the last frequency is dropped for the cosine half. Previously
    this raised a shape mismatch.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd columns; trim for odd d_model.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is ``[batch, seq_len, d_model]``; ``seq_len`` must not exceed
        ``max_len``.
        """
        return x + self.pe[:, :x.size(1)]


# --- Transformer Encoder Layer ---
class TransformerEncoderLayer(nn.Module):
    """Post-norm encoder layer: self-attention then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``. The
    same ``Dropout`` module is reused for both residual branches.
    """

    def __init__(self, d_model, heads, dim_ff, dropout):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(
            d_model, heads, dropout=dropout, batch_first=True
        )
        feed_forward = [
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Transform ``src`` (``[batch, seq_len, d_model]``), keeping its shape."""
        # Self-attention sub-layer with residual connection.
        attended = self.self_attn(src, src, src)[0]
        hidden = self.norm1(src + self.dropout(attended))

        # Position-wise feed-forward sub-layer with residual connection.
        expanded = self.ff(hidden)
        return self.norm2(hidden + self.dropout(expanded))


# --- Transformer Decoder Layer ---
class TransformerDecoderLayer(nn.Module):
    """Post-norm decoder layer: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, heads, dim_ff, dropout):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, heads, dropout=dropout, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model, heads, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, causal=True):
        """Return the decoded sequence, same shape as ``tgt``.

        Args:
            tgt: decoder input, ``[batch, tgt_len, d_model]``.
            memory: encoder output, ``[batch, src_len, d_model]``.
            causal: when True (default), position ``t`` may only attend to
                positions ``<= t`` in the self-attention step. The training
                loop in this file feeds a right-shifted copy of the targets,
                so without the mask position ``t`` could read its own target
                from position ``t + 1``. Pass False for the old behaviour.
        """
        self_mask = None
        if causal:
            tgt_len = tgt.size(1)
            # Boolean mask: True marks (query, key) pairs that may NOT attend,
            # i.e. every key strictly after the query position.
            self_mask = torch.triu(
                torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
                diagonal=1,
            )

        # Self-attention
        attn1, _ = self.self_attn(tgt, tgt, tgt, attn_mask=self_mask)
        tgt = self.norm1(tgt + self.dropout(attn1))

        # Cross-attention over the encoder output (no mask: all of it is visible)
        attn2, _ = self.cross_attn(tgt, memory, memory)
        tgt = self.norm2(tgt + self.dropout(attn2))

        # Feedforward
        ff = self.ff(tgt)
        tgt = self.norm3(tgt + self.dropout(ff))
        return tgt


# --- Full Transformer Seq2Seq ---
class TransformerSeq2Seq(nn.Module):
    """Encoder-decoder Transformer for sequences of continuous features.

    Encoder and decoder inputs are linearly projected to ``d_model``, given
    the same sinusoidal position encoding, run through ``num_layers`` encoder
    and decoder layers, and mapped to ``output_dim`` per target step.
    """

    def __init__(self, input_dim_enc, input_dim_dec, d_model=128, num_layers=6, heads=8, dim_ff=512, dropout=0.1, output_dim=1):
        super().__init__()
        # Separate projections because the two inputs have different feature counts.
        self.encoder_input_proj = nn.Linear(input_dim_enc, d_model)
        self.decoder_input_proj = nn.Linear(input_dim_dec, d_model)

        self.pos_enc = PositionalEncoding(d_model)

        layer_args = (d_model, heads, dim_ff, dropout)
        self.encoder = nn.ModuleList(
            [TransformerEncoderLayer(*layer_args) for _ in range(num_layers)]
        )
        self.decoder = nn.ModuleList(
            [TransformerDecoderLayer(*layer_args) for _ in range(num_layers)]
        )

        self.output_layer = nn.Linear(d_model, output_dim)

    def forward(self, src, tgt):
        """Return predictions of shape ``[batch, tgt_len, output_dim]``.

        Args:
            src: ``[batch, src_len, input_dim_enc]``.
            tgt: ``[batch, tgt_len, input_dim_dec]``.
        """
        # Encoder stack -> memory of shape [B, src_len, d_model].
        memory = self.pos_enc(self.encoder_input_proj(src))
        for enc_layer in self.encoder:
            memory = enc_layer(memory)

        # Decoder stack; every layer attends over the final encoder output.
        decoded = self.pos_enc(self.decoder_input_proj(tgt))
        for dec_layer in self.decoder:
            decoded = dec_layer(decoded, memory)

        return self.output_layer(decoded)

# Model Parameters
# Hyper-parameters forwarded verbatim to TransformerSeq2Seq below.
input_dim_enc = 10    # Input features for encoder (option data)
input_dim_dec = 1     # Input features for decoder (past predicted price or 0)
output_dim = 1        # Final predicted option price
d_model = 128         # Transformer hidden dimension
num_layers = 6        # Number of encoder/decoder layers
heads = 8             # Multi-head attention heads
dim_ff = 512          # Feedforward network dimension
dropout = 0.1         # Dropout probability

# Initialize model
model = TransformerSeq2Seq(
    input_dim_enc=input_dim_enc,
    input_dim_dec=input_dim_dec,
    d_model=d_model,
    num_layers=num_layers,
    heads=heads,
    dim_ff=dim_ff,
    dropout=dropout,
    output_dim=output_dim
)

# Device setup: use CUDA when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# === Load Data ===
# The unpacking relies on the archive holding exactly two arrays, in (X, Y) order.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use
# it with archives from a trusted source.
X_all_scaled, Y_all_scaled = np.load("Train_Data/option_train_data_scaled.npz", allow_pickle=True).values()
X, Y = X_all_scaled, Y_all_scaled

# Positional 80/20 split: the first 80% of rows train, the rest are held out.
split_idx = int(len(X) * 0.8)
X_train, Y_train = X[:split_idx], Y[:split_idx]
X_test, Y_test = X[split_idx:], Y[split_idx:]

# Shuffle the training rows only (unseeded, so the order differs between runs).
perm = np.random.permutation(X_train.shape[0])
X_train, Y_train = X_train[perm], Y_train[perm]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The whole dataset is moved onto `device` up front.
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
Y_train = torch.tensor(Y_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
Y_test = torch.tensor(Y_test, dtype=torch.float32).to(device)

# NOTE(review): only X_train is sanitised; X_test, Y_train and Y_test keep any NaNs.
X_train = torch.nan_to_num(X_train)

train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
# NOTE(review): train_transformer() in this cell calls loss.backward() directly and
# never touches this GradScaler.
scaler = torch.amp.GradScaler()

optimizer = optim.Adam(model.parameters(), lr=1e-4)

# === Loss Functions ===
def weighted_mse_loss(pred, target, weights):
    """Return the mean squared error with one weight per step along dim 1.

    ``weights`` is reshaped to ``[1, pred_len, 1]`` so that it broadcasts
    over ``[batch, pred_len, output_dim]`` tensors. The mean runs over every
    element and is not normalised by the sum of the weights.
    """
    step_weights = weights.view(1, -1, 1)
    error = pred - target
    return torch.mean(step_weights * (error * error))

def weighted_mae_loss(pred, target, weights):
    """Return the mean absolute error with one weight per step along dim 1.

    ``weights`` is reshaped to ``[1, pred_len, 1]`` before being multiplied
    into ``|pred - target|``.
    """
    step_weights = weights.view(1, -1, 1)
    weighted_abs = step_weights * (pred - target).abs()
    return weighted_abs.mean()

def variance_loss(pred):
    """Return minus the mean population variance of ``pred`` along dim 1.

    ``unbiased=False`` divides by N rather than N - 1. The result is negative
    or zero, so adding it to a loss rewards predictions that vary along dim 1.
    """
    spread = pred.var(dim=1, unbiased=False)
    return -(spread.mean())

def trend_loss(pred, target):
    """Return the L1 distance between the step-to-step changes of both tensors.

    Differences are taken along dim 1, so a constant offset between ``pred``
    and ``target`` contributes nothing.
    """
    pred_steps = torch.diff(pred, dim=1)
    target_steps = torch.diff(target, dim=1)
    return nn.functional.l1_loss(pred_steps, target_steps)

def composite_loss(pred, target, weights):
    """Blend the weighted MAE, weighted MSE, trend and variance terms.

    Both tensors are cast to float32 first. The fixed mix is
    ``0.4 * MAE + 0.3 * MSE + 0.2 * trend + 0.1 * variance``; the variance
    term is non-positive, so it rewards variation across timesteps.
    """
    pred, target = pred.float(), target.float()

    mae_term = 0.4 * weighted_mae_loss(pred, target, weights)
    mse_term = 0.3 * weighted_mse_loss(pred, target, weights)
    var_term = 0.1 * variance_loss(pred)
    trend_term = 0.2 * trend_loss(pred, target)

    return mae_term + mse_term + trend_term + var_term

# === Training Function ===
def train_transformer(model, dataloader, optimizer, loss_fn, device, epochs=50):
    """Train a seq2seq model with teacher forcing and gradient clipping.

    Args:
        model: called as ``model(X_batch, decoder_input)``; expected to return
            ``[batch, pred_len, 1]``.
        dataloader: yields ``(X_batch, Y_batch)`` with ``Y_batch`` of shape
            ``[batch, pred_len]``.
        optimizer: optimizer over ``model.parameters()``.
        loss_fn: called as ``loss_fn(output, target, weights)`` with ``target``
            of shape ``[batch, pred_len, 1]``.
        device: device the batches are moved to.
        epochs: number of passes over ``dataloader``.

    The decoder input is the target shifted right by one step with a zero in
    the first slot, so position ``t`` is conditioned on the true value at
    ``t - 1``.
    """
    # The five per-horizon weights never change, so build them once instead
    # of on every batch.
    weights = torch.tensor([0.3, 0.3, 0.5, 0.7, 1.0], device=device)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for X_batch, Y_batch in dataloader:
            X_batch = X_batch.to(device).float()
            Y_batch = Y_batch.to(device).float()

            # Decoder input: shifted target, allocated on Y_batch's device and dtype.
            decoder_input = Y_batch.new_zeros(Y_batch.size(0), Y_batch.size(1), 1)
            decoder_input[:, 1:, 0] = Y_batch[:, :-1]

            output = model(X_batch, decoder_input)  # [B, pred_len, 1]

            # loss_fn expects both tensors as [B, pred_len, 1].
            target = Y_batch.unsqueeze(-1)
            loss = loss_fn(output, target, weights)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            print(f"Batch Loss: {batch_loss:.6f}", end='\r')

        num_batches = len(dataloader)
        # Guard against an empty loader instead of raising ZeroDivisionError.
        epoch_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}")

# === Train ===
# Run 100 epochs with composite_loss; train_transformer builds the per-step weights itself.
train_transformer(model, train_loader, optimizer, loss_fn=composite_loss, device=device, epochs=100)


In [None]:
import joblib  # or just `import joblib` depending on version
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# Evaluate on the held-out split. This cell previously wrapped X_train / Y_train,
# so the printed "test" samples were rows the model had been fitted on.
# X_test is sanitised here because only X_train went through nan_to_num at load time.
test_dataset = TensorDataset(torch.nan_to_num(X_test), Y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Load the target scaler so predictions can be mapped back to original units.
scaler_Y = joblib.load("Train_Data/scaler_Y.pkl")

model.eval()

from sklearn.preprocessing import MinMaxScaler  # if not already imported

with torch.no_grad():
    for i, (X_batch, Y_batch) in enumerate(test_loader):
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)

        batch_size = X_batch.size(0)
        pred_len = Y_batch.size(1)  # e.g., 5
        decoder_input = torch.zeros(batch_size, pred_len, 1, device=device)
        preds = torch.zeros(batch_size, pred_len, device=device)

        # Autoregressive decoding. Training feeds the decoder the target shifted
        # right by one step (slot 0 stays zero), so the prediction for step t is
        # written into decoder slot t + 1. Writing it into slot t, as this cell
        # used to, does not match the layout the model was trained on.
        for t in range(pred_len):
            output = model(X_batch, decoder_input)
            preds[:, t] = output[:, t, 0]
            if t + 1 < pred_len:
                decoder_input[:, t + 1, 0] = output[:, t, 0]

        # Unscale predictions and targets
        preds_np = preds.cpu().numpy()  # [batch, pred_len]
        targets_np = Y_batch.cpu().numpy()  # [batch, pred_len]

        preds_unscaled = scaler_Y.inverse_transform(preds_np)
        targets_unscaled = scaler_Y.inverse_transform(targets_np)

        print(f"\n🟦 Batch {i+1}")
        for j in range(min(3, len(preds_unscaled))):  # Print top 3 per batch
            print(f"Sample {j+1} — True: {targets_unscaled[j]}, Predicted: {preds_unscaled[j]}")


In [None]:
# Sanity check on whatever preds_np / targets_np currently hold — presumably the
# last batch from the evaluation cell above; confirm the cell order when re-running.
assert preds_np.shape == targets_np.shape


<h3>Training on labelled data</h3>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns

# ============================
# Model Parameters
# ============================
# Hyper-parameters for FastClassifier and the data loaders defined below.
input_dim = 10
hidden_dim = 1024  # per direction; the bidirectional encoder emits 2 * hidden_dim features
num_layers = 4
dropout = 0.2
output_dim = 2  # "BUY" or "NO"
batch_size = 16
n_heads = 4

# ============================
# FastEncoder with Self-Attention
# ============================
class FastEncoder(nn.Module):
    """Bidirectional LSTM, then self-attention, then mean pooling over time.

    Produces one ``2 * hidden_dim`` vector per input sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout, n_heads):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
        )
        # The two LSTM directions are concatenated, hence the doubled width.
        bi_dim = hidden_dim * 2
        self.attn = nn.MultiheadAttention(embed_dim=bi_dim, num_heads=n_heads, batch_first=True)
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Map ``x`` (``[B, T, input_dim]``) to a pooled ``[B, 2H]`` summary."""
        sequence = self.lstm(x)[0]  # [B, T, 2H]
        attended = self.attn(sequence, sequence, sequence)[0]  # [B, T, 2H]
        # The pooling layer averages the last axis, so move time there first.
        channels_first = attended.transpose(1, 2)  # [B, 2H, T]
        return self.pool(channels_first).squeeze(-1)

# ============================
# Memory Network
# ============================
class MemoryNetwork(nn.Module):
    """Two-layer MLP with LayerNorm at both ends and BatchNorm + GELU inside.

    There is no residual connection: ``forward`` returns ``net(h)`` directly.
    ``BatchNorm1d`` needs 2-D ``[batch, features]`` input here.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        stages = (
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
        )
        self.net = nn.Sequential(*stages)

    def forward(self, h):
        """Return ``h`` transformed by the MLP, shape ``[batch, hidden_dim]``."""
        transformed = self.net(h)
        return transformed

# ============================
# Classifier Head
# ============================
class ClassifierHead(nn.Module):
    """Map features to raw class logits through a single 256-unit hidden layer."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 256
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.BatchNorm1d(hidden_units),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_units, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return unnormalised logits of shape ``[batch, output_dim]``."""
        logits = self.fc(x)
        return logits

# ============================
# Full Classification Model
# ============================
class FastClassifier(nn.Module):
    """Sequence classifier: pooled encoder -> memory MLP -> classifier head."""

    def __init__(self, input_dim, hidden_dim, num_layers, dropout, output_dim, n_heads):
        super().__init__()
        # The encoder is bidirectional, so downstream modules see 2 * hidden_dim.
        feature_dim = hidden_dim * 2
        self.encoder = FastEncoder(input_dim, hidden_dim, num_layers, dropout, n_heads)
        self.memory = MemoryNetwork(feature_dim, feature_dim)
        self.classifier = ClassifierHead(feature_dim, output_dim)

    def forward(self, x):
        """Return class logits for the batch of sequences ``x``."""
        pooled = self.encoder(x)
        refined = self.memory(pooled)
        return self.classifier(refined)

# ============================
# Load and Prepare Data
# ============================
# The unpacking relies on the archive holding exactly three arrays, in this order.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use
# it with archives from a trusted source.
X_all, Y_all, label = np.load("Train_Data/option_train_labeled_3m.npz", allow_pickle=True).values()
label_map = {'NO': 0, 'BUY': 1}
# Any label outside label_map raises KeyError here.
Y_numeric = np.array([label_map[y] for y in label])

# Positional 80/20 split, no shuffling: the first 80% of rows train, the rest test.
split_idx = int(len(X_all) * 0.8)
X_train, Y_train = X_all[:split_idx], Y_numeric[:split_idx]
X_test, Y_test = X_all[split_idx:], Y_numeric[split_idx:]

X_train = torch.tensor(X_train, dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
Y_test = torch.tensor(Y_test, dtype=torch.long)

# Sanitise both splits. The test split feeds the early-stopping loss in
# train_model, and a NaN input there turns that loss into NaN.
X_train = torch.nan_to_num(X_train)
X_test = torch.nan_to_num(X_test)

train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(X_test, Y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ============================
# Training Setup
# ============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FastClassifier(input_dim, hidden_dim, num_layers, dropout, output_dim, n_heads).to(device)

# Compute class weights for imbalanced classification.
# NOTE(review): the weights come from all labels, including the test split.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=Y_numeric)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Module-level gradient scaler; train_model() reads it as a global.
scaler = torch.amp.GradScaler()

# ============================
# Training Function
# ============================
def train_model(model, loader, optimizer, criterion, test_loader, device, epochs=20):
    """Train a classifier with AMP and stop as soon as the test loss rises.

    Relies on the module-level ``scaler`` (a ``GradScaler``).

    Args:
        model: classifier returning logits ``[batch, num_classes]``.
        loader: training batches ``(X_batch, y_batch)``.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss called as ``criterion(logits, labels)``.
        test_loader: batches evaluated after every epoch; their mean loss
            drives early stopping.
        device: ``torch.device`` the batches are moved to.
        epochs: maximum number of epochs.

    Epoch metrics are printed before the early-stopping decision, so the
    epoch that triggers the stop is reported too.
    """
    best_loss = float('inf')
    autocast_device = 'cuda' if device.type == 'cuda' else 'cpu'

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type=autocast_device):
                output = model(X_batch)
                loss = criterion(output, y_batch)

            if torch.isnan(loss) or torch.isinf(loss):
                print(f"⚠️ Skipping batch due to invalid loss: {loss.item()}")
                continue

            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping so max_norm applies to
            # their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            preds = torch.argmax(output, dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

        # Evaluate test set
        model.eval()
        test_loss = 0
        test_correct = 0
        test_total = 0

        with torch.no_grad():
            for X_batch, Y_batch in test_loader:
                X_batch = X_batch.to(device)
                Y_batch = Y_batch.to(device)

                preds = model(X_batch)
                loss = criterion(preds, Y_batch)
                test_loss += loss.item()

                predicted_classes = torch.argmax(preds, dim=1)
                test_correct += (predicted_classes == Y_batch).sum().item()
                test_total += Y_batch.size(0)

        # The guards stop an empty loader, or an epoch with every batch skipped,
        # from raising ZeroDivisionError.
        nan = float('nan')
        avg_test_loss = test_loss / len(test_loader) if len(test_loader) else nan
        test_accuracy = 100.0 * test_correct / test_total if test_total else nan
        avg_train_loss = total_loss / len(loader) if len(loader) else nan
        train_acc = 100.0 * correct / total if total else nan

        print(f"Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss:.6f} | Train Acc: {train_acc:.2f}% | Test Acc: {test_accuracy:.2f}%")

        if avg_test_loss > best_loss:
            print("⛔ Early stopping: Test loss increased.")
            break
        best_loss = avg_test_loss

# ============================
# Confusion Matrix Plot
# ============================
def plot_confusion(model, test_loader):
    """Plot a NO/BUY confusion-matrix heatmap for ``model`` on ``test_loader``.

    Inputs are moved to the module-level ``device``. Predictions are the
    argmax over the model's logits. Labels are copied to the CPU before being
    converted to NumPy, so loaders that yield CUDA label tensors work too.
    """
    model.eval()
    all_preds = []
    all_true = []

    with torch.no_grad():
        for X_batch, Y_batch in test_loader:
            X_batch = X_batch.to(device)
            logits = model(X_batch)
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # .cpu() is a no-op for CPU tensors; it is needed for CUDA ones.
            all_true.extend(Y_batch.cpu().numpy())

    cm = confusion_matrix(all_true, all_preds)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["NO", "BUY"], yticklabels=["NO", "BUY"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

# ============================
# Train and Evaluate
# ============================
# Train for up to 20 epochs (train_model stops early on the first test-loss
# increase), then plot the confusion matrix on the test loader.
train_model(model, train_loader, optimizer, criterion, test_loader, device, epochs=20)
plot_confusion(model, test_loader)

In [None]:
# Re-plot the confusion matrix for the current model without retraining.
plot_confusion(model, test_loader)


In [None]:
def train(model, train_loader, val_loader, optimizer, loss_fn, device, epochs=20, patience=3):
    """Train a classifier with patience-based early stopping on validation loss.

    Args:
        model: classifier returning logits ``[batch, num_classes]``.
        train_loader / val_loader: yield ``(features, labels)`` batches; NaNs
            in the features are replaced via ``torch.nan_to_num`` before use.
        optimizer: optimizer over ``model.parameters()``.
        loss_fn: called as ``loss_fn(logits, labels)``.
        device: device the batches are moved to.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs with no improvement
            in validation loss.

    On an early stop the model is left in eval mode; otherwise it is returned
    to train mode after every epoch.
    """
    best_val_loss = float('inf')
    stale_epochs = 0

    model.train()
    for epoch_idx in range(epochs):
        running_loss, hits, seen = 0.0, 0, 0

        for features, labels in train_loader:
            features = torch.nan_to_num(features).to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(features)
            batch_loss = loss_fn(logits, labels)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            hits += (logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

        train_loss = running_loss / len(train_loader)
        train_acc = 100 * hits / seen

        # === Validation loss ===
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for val_features, val_labels in val_loader:
                val_features = torch.nan_to_num(val_features).to(device)
                val_labels = val_labels.to(device)
                val_total += loss_fn(model(val_features), val_labels).item()
        val_loss = val_total / len(val_loader)

        print(f"Epoch {epoch_idx+1}/{epochs} - Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}%, Val Loss: {val_loss:.4f}")

        # === Early Stopping Check ===
        improved = val_loss < best_val_loss
        if improved:
            best_val_loss, stale_epochs = val_loss, 0
        else:
            stale_epochs += 1
        if not improved and stale_epochs >= patience:
            print(f"Early stopping triggered at epoch {epoch_idx+1}")
            break
        model.train()

def evaluate(model, dataloader, device):
    """Print the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: classifier returning logits ``[batch, num_classes]``.
        dataloader: yields ``(X_batch, y_batch)`` pairs.
        device: device the batches are moved to.

    NaNs in the inputs are replaced via ``torch.nan_to_num``, matching what
    ``train`` does for its training and validation batches. Otherwise a single
    NaN feature makes every logit of that row NaN and distorts the accuracy.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = torch.nan_to_num(X_batch).to(device)
            y_batch = y_batch.to(device)
            outputs = model(X_batch)
            preds = outputs.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

    acc = 100. * correct / total
    print(f"Test Accuracy: {acc:.2f}%")
# Use the detected `device` for both calls. A hard-coded 'cuda' fails on CPU-only
# machines and could disagree with the device the model was moved to.
train(model, train_loader, test_loader, optimizer, loss_fn=criterion, device=device, epochs=10)
evaluate(model, test_loader, device)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# === Evaluate on Test Set ===
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    # The test labels are stored as `Y_test`; the lowercase `y_test` used here
    # before is never defined and raised NameError.
    for X_batch, y_batch in DataLoader(TensorDataset(X_test, Y_test), batch_size=512):
        X_batch = torch.nan_to_num(X_batch).to(device)
        y_batch = y_batch.to(device)

        outputs = model(X_batch)  # [B, num_classes]
        preds = outputs.argmax(dim=1)

        all_preds.append(preds.cpu().numpy())
        all_targets.append(y_batch.cpu().numpy())

# === Flatten Arrays ===
all_preds = np.concatenate(all_preds)
all_targets = np.concatenate(all_targets)

# === Confusion Matrix ===
cm = confusion_matrix(all_targets, all_preds)
labels = ['NO', 'BUY']  # Index order follows label_map: NO -> 0, BUY -> 1

# === Plot Confusion Matrix ===
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# === Optional: Print classification report ===
print(classification_report(all_targets, all_preds, target_names=labels))
