In [1]:
!python --version


Python 3.12.0


In [15]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from ta.momentum import RSIIndicator
from ta.trend import MACD, ADXIndicator
from ta.volatility import AverageTrueRange, BollingerBands
from ta.volume import OnBalanceVolumeIndicator
from sklearn.preprocessing import StandardScaler
import pickle
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt


In [10]:


def clean_price_column(col):
    """
    Clean a price column by removing '$' and converting to float.
    
    Args:
        col (pd.Series): Column to clean
    
    Returns:
        pd.Series: Cleaned numeric column
    """
    return col.str.replace('$', '', regex=False).astype(float)

def process_stock_data(df):
    """
    Process a single stock's data to create sequences and targets.
    
    Args:
        df (pd.DataFrame): Stock data with columns ['Date', 'Close/Last', 'Volume', 'Open', 'High', 'Low']
    
    Returns:
        tuple: (sequences, y, dates)
            - sequences: List of arrays with shape [timesteps, num_features]
            - y: List of percent changes (clipped to [-8, 8])
            - dates: List of dates for the last day in each sequence
    """
    # Clean and convert price columns to numeric
    price_cols = ['Close/Last', 'Open', 'High', 'Low']
    for col in price_cols:
        df[col] = clean_price_column(df[col])
    df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')
    
    # Take from row 29 (30th oldest, 0-based index) to end
    df = df.iloc[29:].copy()
    
    # Check if sufficient data is available for MACD (26 periods)
    if len(df) < 26:
        print(f"Insufficient data for stock: {len(df)} rows. Skipping.")
        return [], [], []
    
    # Calculate technical indicators
    df['RSI_7'] = RSIIndicator(df['Close/Last'], window=7).rsi()
    macd = MACD(df['Close/Last'], window_slow=26, window_fast=12)
    df['MACD'] = macd.macd()
    adx_window = min(14, len(df) - 1)
    adx = ADXIndicator(df['High'], df['Low'], df['Close/Last'], window=adx_window)
    df['ADX_14'] = adx.adx()
    atr = AverageTrueRange(df['High'], df['Low'], df['Close/Last'], window=14)
    df['ATR_14'] = atr.average_true_range()
    obv = OnBalanceVolumeIndicator(df['Close/Last'], df['Volume'])
    df['OBV'] = obv.on_balance_volume()
    bb = BollingerBands(df['Close/Last'], window=20)
    df['BB_upper'] = bb.bollinger_hband()
    df['BB_lower'] = bb.bollinger_lband()
    
    # Drop rows with NaN indicators
    df = df.dropna().reset_index(drop=True)
    
    # Skip another 20 rows if possible
    if len(df) < 20:
        print(f"Insufficient data after dropping NaNs: {len(df)} rows. Skipping stock.")
        return [], [], []
    df = df.iloc[20:].copy().reset_index(drop=True)
    
    # Define features
    features = ['Close/Last', 'Volume', 'Open', 'High', 'Low', 'RSI_7', 'MACD', 'ADX_14', 'ATR_14', 'OBV', 'BB_upper', 'BB_lower']
    num_features = len(features)
    rsi_index = features.index('RSI_7')
    
    # Create sequences
    sequences = []
    y = []
    dates = []
    for i in range(len(df) - 20):
        seq = df.iloc[i:i+20]
        seq_features = seq[features].values  # Shape: [timesteps, num_features]
        # Scale features except RSI_7 within the sequence
        scaler = StandardScaler()
        seq_scaled = scaler.fit_transform(seq_features)
        seq_scaled[:, rsi_index] = seq_features[:, rsi_index]  # Preserve RSI_7
        # Keep shape as [timesteps, num_features]
        sequences.append(seq_scaled)
        
        # Calculate y: percent change
        today_close = df.iloc[i+19]['Close/Last']
        tomorrow_close = df.iloc[i+20]['Close/Last']
        percent_change = 100 * (tomorrow_close - today_close) / today_close
        # Clip percent change to [-8, 8]
        percent_change = np.clip(percent_change, -8, 8)
        y.append(percent_change)
        
        # Store date of the last day in the sequence
        dates.append(df.iloc[i+19]['Date'])
    
    return sequences, y, dates

def load_and_process_stock_data():
    """
    Load and process stock data for multiple companies, aggregating sequences and targets.
    
    Returns:
        tuple: (X, y)
            - X: Array of shape [num_sequences, timesteps, num_features]
            - y: Array of shape [num_sequences], scaled with StandardScaler
    
    Saves:
        x_and_y.pkl: Pickle file containing (X, y)
        target_scaler.pkl: Pickle file containing the StandardScaler for y
    """
    stock_files = [
        "amzn_data.csv", "nflx_data.csv", "tsla_data.csv", "aapl_data.csv",
        "qcom_data.csv", "msft_data.csv", "sbux_data.csv", "csco_data.csv", "meta_data.csv"
    ]
    all_data = []
    
    for stock_file in stock_files:
        file_path = "./rawData/"+stock_file
        df = pd.read_csv(file_path)
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values('Date').reset_index(drop=True)
        sequences, y, dates = process_stock_data(df)
        if sequences:
            for seq, yy, dd in zip(sequences, y, dates):
                all_data.append((seq, yy, dd))
        else:
            print(f"No sequences generated for {stock_file}")
    
    if not all_data:
        raise ValueError("No valid sequences generated from any stock data.")
    
    # Sort by date to maintain temporal order
    all_data.sort(key=lambda x: x[2])
    
    # Extract X and y
    X = np.array([item[0] for item in all_data])  # Shape: [num_sequences, timesteps, num_features]
    y = np.array([item[1] for item in all_data])  # Shape: [num_sequences]
    
    # Scale the target values
    target_scaler = StandardScaler()
    y = target_scaler.fit_transform(y.reshape(-1, 1)).flatten()  # Reshape for scaler, then flatten back
    
    # Save X, y, and target scaler
    with open("x_and_y.pkl", "wb") as f:
        pickle.dump((X, y), f)
    with open("target_scaler.pkl", "wb") as f:
        pickle.dump(target_scaler, f)
    
    return X, y


In [4]:
X,y = load_and_process_stock_data()

Insufficient data for stock: 0 rows. Skipping.
No sequences generated for qcom_data.csv


In [8]:
X.shape,y.shape

((19376, 20, 12), (19376,))

In [16]:
# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)  # For NumPy (used by scikit-learn)
torch.manual_seed(SEED)  # For PyTorch
torch.cuda.manual_seed_all(SEED)  # For CUDA (if using GPU)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [17]:
# Split data with reproducible shuffling
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False, random_state=SEED)

def make_loader(X, y, bs=32, shuffle=False):
    ds = TensorDataset(torch.tensor(X, dtype=torch.float32).to(device),
                       torch.tensor(y, dtype=torch.float32).to(device))
    return DataLoader(ds, batch_size=bs, shuffle=shuffle)

# Create data loaders
train_loader = make_loader(X_train, y_train, shuffle=True)
val_loader = make_loader(X_val, y_val)
test_loader = make_loader(X_test, y_test)

In [18]:

# Collect all target values from test_loader
all_targets = []
for _, targets in test_loader:
    all_targets.append(targets.cpu().numpy())
all_targets = np.concatenate(all_targets)

# Compute statistics
mean = np.mean(all_targets)
median = np.median(all_targets)
std = np.std(all_targets)
min_val = np.min(all_targets)
max_val = np.max(all_targets)
num_values = len(all_targets)

# Print results
print("Test DataLoader Target Statistics:")
print(f"Number of Values: {num_values}")
print(f"Mean: {mean:.6f}")
print(f"Median: {median:.6f}")
print(f"Standard Deviation: {std:.6f}")
print(f"Minimum: {min_val:.6f}")
print(f"Maximum: {max_val:.6f}")

Test DataLoader Target Statistics:
Number of Values: 2907
Mean: -0.004569
Median: -0.000535
Standard Deviation: 0.956245
Minimum: -3.819506
Maximum: 3.733095


In [None]:
# Define LSTM_Linear model
class LSTM_Linear(nn.Module):
    """
    LSTM model with a linear output layer.
    
    Args:
        input_size: Number of input features
        hidden_size: Number of hidden units
        output_size: Number of output units
        num_layers: Number of LSTM layers
        dropout_rate: Dropout probability
    """
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout_rate=0.3):
        super(LSTM_Linear, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.linear = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])
        out = self.linear(out)
        return out

# Define LSTM_NdLinear model with multi-dimensional hidden layers
class LSTM_NdLinear(nn.Module):
    """
    LSTM model with an NdLinear output layer.
    
    Args:
        input_size: Number of input features
        hidden_size: Number of hidden units
        output_size: Number of output units
        nd_hidden: Hidden dimensions for NdLinear
        num_layers: Number of LSTM layers
        dropout_rate: Dropout probability
    """
    def __init__(self, input_size, hidden_size, output_size, nd_hidden=(64, 32), num_layers=2, dropout_rate=0.3):
        super(LSTM_NdLinear, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.ndlinear = NdLinear(input_dims=(hidden_size, 1), hidden_size=nd_hidden).to(device)
        self.linear = nn.Linear(np.prod(nd_hidden), output_size).to(device)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_out = lstm_out[:, -1, :].unsqueeze(-1)
        nd_out = self.ndlinear(last_out)
        nd_out = self.dropout(nd_out.view(nd_out.size(0), -1))
        out = self.linear(nd_out)
        return out

# Training and evaluation function
def train_and_evaluate(model, train_loader, val_loader, test_loader, criterion, optimizer, epochs=50):
    """
    Train and evaluate a model with MSE loss.
    
    Args:
        model: Model to train
        train_loader: Training data loader
        val_loader: Validation data loader
        test_loader: Test data loader
        criterion: Loss function (MSELoss)
        optimizer: Optimizer
        epochs: Number of training epochs
    
    Returns:
        Tuple of (train_losses, val_losses, test_loss)
    """
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                output = model(X_batch)
                loss = criterion(output.squeeze(), y_batch)
                val_loss += loss.item()
            val_loss /= len(val_loader)
            val_losses.append(val_loss)
        
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            test_loss += loss.item()
        test_loss /= len(test_loader)
    print(f"Test Loss: {test_loss:.4f}")
    return train_losses, val_losses, test_loss

# Initialize models
input_size = 12  # Number of features
hidden_size = 128
output_size = 1
nd_hidden = (64, 32)  # Multi-dimensional hidden layer for NdLinear

lstm_linear = LSTM_Linear(input_size, hidden_size, output_size).to(device)
lstm_ndlinear = LSTM_NdLinear(input_size, hidden_size, output_size, nd_hidden).to(device)

# Define loss and optimizers
criterion = nn.MSELoss().to(device)
optimizer_lstm_linear = optim.Adam(lstm_linear.parameters(), lr=0.001)
optimizer_lstm_ndlinear = optim.Adam(lstm_ndlinear.parameters(), lr=0.001)

# Train and evaluate LSTM_Linear
print("Training LSTM_Linear...")
train_losses_lstm_linear, val_losses_lstm_linear, test_floss_lstm_linear = train_and_evaluate(
    lstm_linear, train_loader, val_loader, test_loader, criterion, optimizer_lstm_linear
)

# Train and evaluate LSTM_NdLinear
print("Training LSTM_NdLinear...")
train_losses_lstm_ndlinear, val_losses_lstm_ndlinear, test_loss_lstm_ndlinear = train_and_evaluate(
    lstm_ndlinear, train_loader, val_loader, test_loader, criterion, optimizer_lstm_ndlinear
)

In [None]:
# Define RNN_Linear model
class RNN_Linear(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout_rate=0.3):
        super(RNN_Linear, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.linear = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, x):
        rnn_out, _ = self.rnn(x)
        out = self.dropout(rnn_out[:, -1, :])
        out = self.linear(out)
        return out

# Define RNN_NdLinear model with multi-dimensional hidden layers
class RNN_NdLinear(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, nd_hidden=(64, 32), num_layers=2, dropout_rate=0.3):
        super(RNN_NdLinear, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.ndlinear = NdLinear(input_dims=(hidden_size, 1), hidden_size=nd_hidden).to(device)
        self.linear = nn.Linear(np.prod(nd_hidden), output_size).to(device)

    def forward(self, x):
        rnn_out, _ = self.rnn(x)
        last_out = rnn_out[:, -1, :].unsqueeze(-1)
        nd_out = self.ndlinear(last_out)
        nd_out = self.dropout(nd_out.view(nd_out.size(0), -1))
        out = self.linear(nd_out)
        return out

# Training and evaluation function
def train_and_evaluate(model, train_loader, val_loader, test_loader, criterion, optimizer, epochs=50):
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                output = model(X_batch)
                loss = criterion(output.squeeze(), y_batch)
                val_loss += loss.item()
            val_loss /= len(val_loader)
            val_losses.append(val_loss)
        
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            test_loss += loss.item()
        test_loss /= len(test_loader)
    print(f"Test Loss: {test_loss:.4f}")
    return train_losses, val_losses, test_loss

# Initialize models
input_size = 12  # Number of features
hidden_size = 128
output_size = 1
nd_hidden = (64, 32)  # Multi-dimensional hidden layer for NdLinear

rnn_linear = RNN_Linear(input_size, hidden_size, output_size).to(device)
rnn_ndlinear = RNN_NdLinear(input_size, hidden_size, output_size, nd_hidden).to(device)

# Define loss and optimizers
criterion = nn.MSELoss().to(device)
optimizer_rnn_linear = optim.Adam(rnn_linear.parameters(), lr=0.001)
optimizer_rnn_ndlinear = optim.Adam(rnn_ndlinear.parameters(), lr=0.001)

# Train and evaluate RNN_Linear
print("Training RNN_Linear...")
train_losses_rnn_linear, val_losses_rnn_linear, test_loss_rnn_linear = train_and_evaluate(
    rnn_linear, train_loader, val_loader, test_loader, criterion, optimizer_rnn_linear
)

# Train and evaluate RNN_NdLinear
print("Training RNN_NdLinear...")
train_losses_rnn_ndlinear, val_losses_rnn_ndlinear, test_loss_rnn_ndlinear = train_and_evaluate(
    rnn_ndlinear, train_loader, val_loader, test_loader, criterion, optimizer_rnn_ndlinear
)

In [None]:
# Define TCN block
class TCN(nn.Module):
    def __init__(self, input_size, hidden_size, kernel_size=3):
        super(TCN, self).__init__()
        padding = (kernel_size - 1) // 2
        self.conv = nn.Conv1d(input_size, hidden_size, kernel_size=kernel_size, padding=padding).to(device)
        self.relu = nn.ReLU().to(device)

    def forward(self, x):
        x = x.transpose(1, 2)  # [batch, features, timesteps]
        out = self.conv(x)
        out = self.relu(out)
        out = out.transpose(1, 2)  # [batch, timesteps, hidden_size]
        return out

# Define TCN_Linear model
class TCN_Linear(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.3):
        super(TCN_Linear, self).__init__()
        self.tcn = TCN(input_size, hidden_size).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.linear = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, x):
        tcn_out = self.tcn(x)
        out = self.dropout(tcn_out[:, -1, :])
        out = self.linear(out)
        return out

# Define TCN_NdLinear model with multi-dimensional hidden layers
class TCN_NdLinear(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, nd_hidden=(64, 32), dropout_rate=0.3):
        super(TCN_NdLinear, self).__init__()
        self.tcn = TCN(input_size, hidden_size).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.ndlinear = NdLinear(input_dims=(hidden_size, 1), hidden_size=nd_hidden).to(device)
        self.linear = nn.Linear(np.prod(nd_hidden), output_size).to(device)

    def forward(self, x):
        tcn_out = self.tcn(x)
        last_out = tcn_out[:, -1, :].unsqueeze(-1)
        nd_out = self.ndlinear(last_out)
        nd_out = self.dropout(nd_out.view(nd_out.size(0), -1))
        out = self.linear(nd_out)
        return out

# Training and evaluation function
def train_and_evaluate(model, train_loader, val_loader, test_loader, criterion, optimizer, epochs=50):
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                output = model(X_batch)
                loss = criterion(output.squeeze(), y_batch)
                val_loss += loss.item()
            val_loss /= len(val_loader)
            val_losses.append(val_loss)
        
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            test_loss += loss.item()
        test_loss /= len(test_loader)
    print(f"Test Loss: {test_loss:.4f}")
    return train_losses, val_losses, test_loss

# Initialize models
input_size = 12  # Number of features
hidden_size = 128
output_size = 1
nd_hidden = (64, 32)  # Multi-dimensional hidden layer for NdLinear

tcn_linear = TCN_Linear(input_size, hidden_size, output_size).to(device)
tcn_ndlinear = TCN_NdLinear(input_size, hidden_size, output_size, nd_hidden).to(device)

# Define loss and optimizers
criterion = nn.MSELoss().to(device)
optimizer_tcn_linear = optim.Adam(tcn_linear.parameters(), lr=0.001)
optimizer_tcn_ndlinear = optim.Adam(tcn_ndlinear.parameters(), lr=0.001)

# Train and evaluate TCN_Linear
print("Training TCN_Linear...")
train_losses_tcn_linear, val_losses_tcn_linear, test_loss_tcn_linear = train_and_evaluate(
    tcn_linear, train_loader, val_loader, test_loader, criterion, optimizer_tcn_linear
)

# Train and evaluate TCN_NdLinear
print("Training TCN_NdLinear...")
train_losses_tcn_ndlinear, val_losses_tcn_ndlinear, test_loss_tcn_ndlinear = train_and_evaluate(
    tcn_ndlinear, train_loader, val_loader, test_loader, criterion, optimizer_tcn_ndlinear
)

In [None]:
# NdLinear-based Model
class NdFeedForward(nn.Module):
    """
    NdLinear-based feedforward network for transformer, adapted from ts_forecast.py.
    
    Args:
        input_dim: Input dimension
        hidden_dim: Multi-dimensional hidden size for NdLinear (e.g., (256, 1))
        dropout: Dropout rate
        activation: Activation function
    """
    def __init__(self, input_dim, hidden_dim=(128, 128), dropout=0.1, activation='gelu'):
        super(NdFeedForward, self).__init__()
        activations = {
            'relu': nn.ReLU().to(device),
            'tanh': nn.Tanh().to(device),
            'sigmoid': nn.Sigmoid().to(device),
            'gelu': nn.GELU().to(device),
        }
        if activation not in activations:
            raise ValueError(f"Unsupported activation function: {activation}")
        
        self.layer1 = NdLinear((input_dim, 1), hidden_dim).to(device)
        self.activation = activations[activation]
        self.dropout = nn.Dropout(dropout).to(device)
        self.layer2 = NdLinear(hidden_dim, (input_dim, 1)).to(device)

    def forward(self, x):
        x_dims = list(x.shape)
        x = x.reshape(x_dims[0] * x_dims[1], x_dims[2], 1)
        x = self.layer1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = x.reshape(x_dims[0], x_dims[1], x_dims[2])
        return x

class NdTransformerEncoderLayer(nn.TransformerEncoderLayer):
    """
    Transformer encoder layer with NdLinear feedforward, adapted from ts_forecast.py.
    
    Args:
        d_model: Model dimension
        nhead: Number of attention heads
        custom_ffn: NdFeedForward instance
    """
    def __init__(self, d_model, nhead, custom_ffn, **kwargs):
        kwargs['batch_first'] = True
        super(NdTransformerEncoderLayer, self).__init__(d_model, nhead, **kwargs)
        self.nd_ffn = custom_ffn.to(device)

    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.nd_ffn(x)
        return x

class Transformer_NdLinear(nn.Module):
    """
    Transformer model with NdLinear feedforward for stock price prediction.
    
    Args:
        input_dim: Number of input features (12)
        model_dim: Transformer model dimension
        num_heads: Number of attention heads
        num_layers: Number of transformer layers
        hidden_dim: Hidden dimension for NdFeedForward
        dropout: Dropout rate
        activation: Activation function
    """
    def __init__(self, input_dim, model_dim=64, num_heads=2, num_layers=3, hidden_dim=(128, 128), dropout=0.1, activation='gelu'):
        super(Transformer_NdLinear, self).__init__()
        ff_layer = NdFeedForward(model_dim, hidden_dim=hidden_dim, dropout=dropout, activation=activation).to(device)
        encoder_layer = NdTransformerEncoderLayer(model_dim, num_heads, custom_ffn=ff_layer, dropout=dropout).to(device)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers).to(device)
        self.embedding = nn.Linear(input_dim, model_dim).to(device)
        self.fc_out = nn.Linear(model_dim, 1).to(device)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, src):
        src = self.embedding(src)
        output = self.transformer_encoder(src)
        output = output[:, -1, :]  # Take the last timestep
        output = self.dropout(output)
        output = self.fc_out(output)
        return output

# Standard Linear-based Model
class LinearFeedForward(nn.Module):
    """
    Standard feedforward network with Linear layers.
    
    Args:
        input_dim: Input dimension
        hidden_dim: Hidden dimension for Linear layers
        dropout: Dropout rate
        activation: Activation function
    """
    def __init__(self, input_dim, hidden_dim=256, dropout=0.1, activation='gelu'):
        super(LinearFeedForward, self).__init__()
        activations = {
            'relu': nn.ReLU().to(device),
            'tanh': nn.Tanh().to(device),
            'sigmoid': nn.Sigmoid().to(device),
            'gelu': nn.GELU().to(device),
        }
        if activation not in activations:
            raise ValueError(f"Unsupported activation function: {activation}")
        
        self.layer1 = nn.Linear(input_dim, hidden_dim).to(device)
        self.activation = activations[activation]
        self.dropout = nn.Dropout(dropout).to(device)
        self.layer2 = nn.Linear(hidden_dim, input_dim).to(device)

    def forward(self, x):
        x = self.layer1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

class Transformer_Linear(nn.Module):
    """
    Transformer model with standard Linear feedforward for stock price prediction.
    
    Args:
        input_dim: Number of input features (12)
        model_dim: Transformer model dimension
        num_heads: Number of attention heads
        num_layers: Number of transformer layers
        hidden_dim: Hidden dimension for FeedForward
        dropout: Dropout rate
        activation: Activation function
    """
    def __init__(self, input_dim, model_dim=64, num_heads=2, num_layers=3, hidden_dim=256, dropout=0.1, activation='gelu'):
        super(Transformer_Linear, self).__init__()
        ff_layer = LinearFeedForward(model_dim, hidden_dim=hidden_dim, dropout=dropout, activation=activation).to(device)
        encoder_layer = nn.TransformerEncoderLayer(model_dim, num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True).to(device)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers).to(device)
        self.embedding = nn.Linear(input_dim, model_dim).to(device)
        self.fc_out = nn.Linear(model_dim, 1).to(device)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, src):
        src = self.embedding(src)
        output = self.transformer_encoder(src)
        output = output[:, -1, :]  # Take the last timestep
        output = self.dropout(output)
        output = self.fc_out(output)
        return output

# Training and evaluation function
def train_and_evaluate(model, train_loader, val_loader, test_loader, criterion, optimizer, epochs=50, model_name="Model"):
    """
    Train and evaluate the model with MSE loss and mixed precision.
    
    Args:
        model: Transformer model (NdLinear or Linear)
        train_loader: Training data loader
        val_loader: Validation data loader
        test_loader: Test data loader
        criterion: Loss function (MSELoss)
        optimizer: Optimizer
        epochs: Number of training epochs
        model_name: Name of the model for logging
    
    Returns:
        Tuple of (train_losses, val_losses, test_loss)
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    scaler = torch.cuda.amp.GradScaler()
    
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for inputs, targets in tqdm(train_loader, desc=f'Training {model_name} Epoch {epoch+1}/{epochs}', leave=False):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        model.eval()
        val_loss = 0
        val_outputs = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), targets)
                val_loss += loss.item()
                val_outputs.append(outputs.squeeze().cpu().numpy())
            val_loss /= len(val_loader)
            val_outputs = np.concatenate(val_outputs)
            output_std = np.std(val_outputs)
            print(f"{model_name} Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
                  f"Val Output Std: {output_std:.6f}, Min Val Pred: {val_outputs.min():.6f}, Max Val Pred: {val_outputs.max():.6f}")
        
        val_losses.append(val_loss)
    
    model.eval()
    test_loss = 0
    test_outputs = []
    test_targets = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            test_loss += loss.item()
            test_outputs.append(outputs.squeeze().cpu().numpy())
            test_targets.append(targets.cpu().numpy())
        test_loss /= len(test_loader)
        test_outputs = np.concatenate(test_outputs)
        test_targets = np.concatenate(test_targets)
        print(f"{model_name} Test Loss: {test_loss:.4f}")
        print(f"{model_name} Test Outputs: Min: {test_outputs.min():.6f}, Max: {test_outputs.max():.6f}, Std: {np.std(test_outputs):.6f}")
        print(f"{model_name} Test Targets: Min: {test_targets.min():.6f}, Max: {test_targets.max():.6f}, Std: {np.std(test_targets):.6f}")
    
    return train_losses, val_losses, test_loss

# Initialize and train NdLinear model
input_dim = 12  # Number of features
model_ndlinear = Transformer_NdLinear(input_dim, model_dim=64, num_heads=2, num_layers=3, hidden_dim=(128,128), dropout=0.1, activation='gelu').to(device)
optimizer_ndlinear = optim.Adam(model_ndlinear.parameters(), lr=0.0001)
criterion = nn.MSELoss().to(device)
print("Training Transformer_NdLinear...")
train_losses_ndlinear, val_losses_ndlinear, test_loss_ndlinear = train_and_evaluate(
    model_ndlinear, train_loader, val_loader, test_loader, criterion, optimizer_ndlinear, epochs=50, model_name="Transformer_NdLinear"
)

# Save NdLinear model
torch.save(model_ndlinear.state_dict(), 'transformer_ndlinear.pth')

# Initialize and train Linear model
model_linear = Transformer_Linear(input_dim, model_dim=64, num_heads=2, num_layers=3, hidden_dim=256, dropout=0.1, activation='gelu').to(device)
optimizer_linear = optim.Adam(model_linear.parameters(), lr=0.0001)
print("Training Transformer_Linear...")
train_losses_linear, val_losses_linear, test_loss_linear = train_and_evaluate(
    model_linear, train_loader, val_loader, test_loader, criterion, optimizer_linear, epochs=50, model_name="Transformer_Linear"
)

# Save Linear model
torch.save(model_linear.state_dict(), 'transformer_linear.pth')

In [None]:
def simulate_trading(model, dataloader, target_scaler_path="./rawData/target_scaler.pkl", initial_cash=10000, transaction_cost=0.001):
    """
    Simulate trading with scaled allocation for positive predictions and inverse positions for negative predictions.
    For negative predictions, sells long position before buying inverse position. Handles scaled targets by descaling.
    
    Args:
        model: Trained model (LSTM_Linear, LSTM_NdLinear, RNN, TCN, or Transformer_NdLinear)
        dataloader: PyTorch DataLoader with input data [batch_size, timesteps, num_features] and scaled targets [batch_size]
        target_scaler_path: Path to the saved StandardScaler for targets (default: 'target_scaler.pkl')
        initial_cash: Starting capital (default: $10,000)
        transaction_cost: Cost per trade as a fraction (default: 0.1%)
    
    Returns:
        dict: Contains portfolio_values, returns, and sharpe_ratio
    """
    # Load the target scaler
    with open(target_scaler_path, "rb") as f:
        target_scaler = pickle.load(f)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    cash = initial_cash
    position = 0  # Number of units held (positive for long or inverse positions)
    is_inverse = False  # Flag to track if position is inverse
    portfolio_values = [initial_cash]
    returns = []
    
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)  # inputs: [batch_size, timesteps, num_features], targets: [batch_size]
            
            # Get model predictions for the batch
            preds = model(inputs).squeeze()  # Shape: [batch_size]
            
            # Inverse-transform predictions and targets to percent changes
            preds_np = preds.cpu().numpy().reshape(-1, 1)
            targets_np = targets.cpu().numpy().reshape(-1, 1)
            preds_percent = target_scaler.inverse_transform(preds_np).flatten()  # Shape: [batch_size]
            targets_percent = target_scaler.inverse_transform(targets_np).flatten()  # Shape: [batch_size]
            
            # Process each sample in the batch
            for pred, actual_pct in zip(preds_percent, targets_percent):
                # Trading logic
                allocation = 0.0
                if pred >= 1:
                    # Positive predictions: Linear scaling between 1% and 2%
                    abs_pred = abs(pred)
                    if abs_pred >= 2:
                        allocation = 1.0  # 100% allocation for predictions >= 2%
                    else:
                        allocation = 0.5 * (abs_pred - 1) + 0.5  # 1% -> 50%, 2% -> 100%
                    target_is_inverse = False
                elif pred < 0:
                    # Negative predictions: First sell any existing position
                    if position > 0:
                        if is_inverse:
                            # Sell inverse position
                            position_value = position / (1 + actual_pct / 100)  # Inverse position gains when price falls
                            cash += position_value * (1 - transaction_cost)
                        else:
                            # Sell long position
                            cash += position * (1 + actual_pct / 100) * (1 - transaction_cost)
                        position = 0
                        is_inverse = False
                    
                    # Then check for inverse position if prediction is below -0.5%
                    if pred < -0.5:
                        abs_pred = abs(pred)
                        if abs_pred >= 1.5:
                            allocation = 1.0  # 100% allocation for predictions <= -1.5%
                        elif abs_pred >= 0.5:
                            # Linear scaling: -0.5% -> 0%, -1% -> 50%, -1.5% -> 100%
                            allocation = 0.5 * (abs_pred - 0.5) / (1.5 - 0.5)
                        target_is_inverse = True
                    else:
                        target_is_inverse = False
                else:
                    target_is_inverse = False
                
                if pred > 0 and cash > 0 and allocation > 0:
                    # Buy long position: Invest allocation fraction of cash
                    invest_amount = cash * allocation
                    shares_to_buy = invest_amount / (1 + actual_pct / 100) / (1 + transaction_cost)
                    position = shares_to_buy
                    cash -= invest_amount
                    is_inverse = False
                elif pred < -0.5 and cash > 0 and allocation > 0:
                    # Buy inverse position: Invest allocation fraction of cash
                    invest_amount = cash * allocation
                    units_to_buy = invest_amount * (1 + actual_pct / 100) / (1 + transaction_cost)  # Inverse units scale with price
                    position = units_to_buy
                    cash -= invest_amount
                    is_inverse = True
                
                # Update portfolio value
                if is_inverse:
                    # Inverse position: Gains when price falls
                    position_value = position / (1 + actual_pct / 100)
                else:
                    # Long position: Gains when price rises
                    position_value = position * (1 + actual_pct / 100)
                portfolio_value = cash + position_value
                # Prevent negative portfolio value
                portfolio_value = max(0, portfolio_value)
                portfolio_values.append(portfolio_value)
                
                # Calculate daily return
                daily_return = (portfolio_value - portfolio_values[-2]) / portfolio_values[-2]
                returns.append(daily_return)
    
    # Calculate performance metrics
    returns = np.array(returns)
    total_return = (portfolio_values[-1] - initial_cash) / initial_cash
    sharpe_ratio = np.mean(returns) / np.std(returns) * np.sqrt(252) if np.std(returns) > 0 else 0
    
    return {
        'portfolio_values': portfolio_values,
        'total_return': total_return,
        'sharpe_ratio': sharpe_ratio
    }

# Example usage with trained models
# Assume lstm_linear, lstm_ndlinear, rnn_linear, rnn_ndlinear, tcn_linear, tcn_ndlinear, model_ndlinear, model_linear are trained
# Use test_loader created from X_test and y_test (scaled targets)
results = {}

# Simulate trading for LSTM_Linear
print("Simulating trading for LSTM_Linear...")
results['LSTM_Linear'] = simulate_trading(lstm_linear.to(device), test_loader)

# Simulate trading for LSTM_NdLinear
print("Simulating trading for LSTM_NdLinear...")
results['LSTM_NdLinear'] = simulate_trading(lstm_ndlinear.to(device), test_loader)

# Simulate trading for RNN_Linear
print("Simulating trading for RNN_Linear...")
results['RNN_Linear'] = simulate_trading(rnn_linear.to(device), test_loader)

# Simulate trading for RNN_NdLinear
print("Simulating trading for RNN_NdLinear...")
results['RNN_NdLinear'] = simulate_trading(rnn_ndlinear.to(device), test_loader)

# Simulate trading for TCN_Linear
print("Simulating trading for TCN_Linear...")
results['TCN_Linear'] = simulate_trading(tcn_linear.to(device), test_loader)

# Simulate trading for TCN_NdLinear
print("Simulating trading for TCN_NdLinear...")
results['TCN_NdLinear'] = simulate_trading(tcn_ndlinear.to(device), test_loader)

# Simulate trading for Transformer_NdLinear
print("Simulating trading for Transformer_NdLinear...")
results['Transformer_NdLinear'] = simulate_trading(model_ndlinear.to(device), test_loader)

# Simulate trading for Transformer_Linear
print("Simulating trading for Transformer_Linear...")
results['Transformer_Linear'] = simulate_trading(model_linear.to(device), test_loader)

# Plot results
plt.figure(figsize=(10, 6))
for name, res in results.items():
    plt.plot(res['portfolio_values'], label=f"{name} (Return: {res['total_return']:.2%}, Sharpe: {res['sharpe_ratio']:.2f})")
plt.title("Portfolio Value Over Time (Stock Trading)")
plt.xlabel("Time Step")
plt.ylabel("Portfolio Value ($)")
plt.legend()
plt.grid(True)
plt.savefig('trading_results.png')
plt.close()
# Print summary
for name, res in results.items():
    print(f"{name}: Final Value: ${res['portfolio_values'][-1]:.2f}, Total Return: {res['total_return']:.2%}, Sharpe Ratio: {res['sharpe_ratio']:.2f}")