In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import models as model

In [12]:
# Load the trade data. The CSV is in wide format: a two-level column header
# and the dates in the first column, which becomes the index.
# Alternative input file, read the same way:
# df_wide = pd.read_csv('data/market_data.csv', header=[0,1], index_col=0)
df_wide = pd.read_csv('data/trade_df.csv', header=[0, 1], index_col=0)

# The index holds the dates as strings; parse them into datetimes.
df_wide.index = pd.to_datetime(df_wide.index)

# Reshape wide -> long. Stacking the first column level (the tickers) gives
# one row per (date, ticker) pair; reset_index() then turns both index levels
# into ordinary columns, which are renamed to 'Date' and 'Ticker'.
df = (
    df_wide
    .stack(level=0)
    .reset_index()
    .rename(columns={'Timestamp': 'Date', 'level_1': 'Ticker'})
)

# Sanity check: list the columns of the long frame
# ('Date', 'Ticker', followed by the data variables found in the file).
print(df.columns)

Index(['Date', 'Ticker', 'Trade Close', 'Trade High', 'Trade Low',
       'Trade Open', 'Trade Volume'],
      dtype='object')


  df = df_wide.stack(level=0).reset_index()


In [None]:
# Data cleaning and sorting, done as one chain:
#   1. remove exact duplicate rows,
#   2. order the rows by ticker and, within a ticker, by date,
#   3. drop rows with any missing value (this happens if there was one data
#      point on the first day of a given ticker but not the rest of the values).
df = (
    df
    .drop_duplicates()
    .sort_values(['Ticker', 'Date'])
    .dropna()
)

Date            0
Ticker          0
Trade Close     0
Trade High      0
Trade Low       0
Trade Open      0
Trade Volume    0
dtype: int64
(1032572, 7)


Unnamed: 0,Date,Ticker,Trade Close,Trade High,Trade Low,Trade Open,Trade Volume
1032674,2001-01-02,AAB.CO,2719.270469,2876.151458,2666.976806,2771.564132,106.131407
1032498,2001-01-03,AAB.CO,2771.564132,2876.151458,2614.683143,2614.683143,18.013655
1032309,2001-01-04,AAB.CO,2717.178723,2771.564132,2614.683143,2614.683143,15.585062
1032115,2001-01-05,AAB.CO,2719.270469,2719.270469,2666.976806,2666.976806,3.824555
1031918,2001-01-08,AAB.CO,2666.976806,2666.976806,2666.976806,2666.976806,11.836998
...,...,...,...,...,...,...,...
586,2025-02-27,ZELA.CO,695.000000,703.000000,685.000000,692.500000,156528.000000
467,2025-02-28,ZELA.CO,663.000000,692.000000,657.000000,688.000000,316329.000000
350,2025-03-03,ZELA.CO,610.000000,685.000000,610.000000,664.500000,424168.000000
233,2025-03-04,ZELA.CO,566.500000,625.000000,557.000000,610.000000,595943.000000


In [17]:
# 3. Create Momentum Features
# Momentum is measured on the 'Trade Close' column as the percentage change
# over a lookback window. Window lengths are in rows (trading days) and
# approximate calendar horizons: 1m ~ 21, 3m ~ 63, 6m ~ 126, 12m ~ 252.
momentum_periods = {'mom_1m': 21, 'mom_3m': 63, 'mom_6m': 126, 'mom_12m': 252}

# Add one column per horizon. The change is computed inside each ticker's own
# group, so a lookback window never reaches into another ticker's rows.
for feature_name, period in momentum_periods.items():
    close_by_ticker = df.groupby('Ticker')['Trade Close']
    df[feature_name] = close_by_ticker.pct_change(periods=period)

# The first `period` rows of every ticker have no value for that horizon;
# keep only rows where all momentum features are defined.
df = df.dropna(subset=list(momentum_periods))


In [None]:
# Define the target variable.
# For testing we define the target as the next-day return: the percentage
# change of 'Trade Close' from the current row to the following row of the
# same ticker.
#
# Both the percentage change and the one-row look-ahead shift are applied
# inside each ticker's group. Shifting the whole column instead would only be
# correct while the rows stay contiguous by ticker, because the value pulled
# across a ticker boundary would then come from a different stock.
# Within a group the rows are used in their current order, so the frame is
# expected to be sorted by date within each ticker at this point.
df['target'] = (
    df.groupby('Ticker')['Trade Close']
    .transform(lambda close: close.pct_change().shift(-1))
)

# The last row of every ticker has no following day, hence no target.
df.dropna(subset=['target'], inplace=True)

In [None]:
# -----------------------------
# 5. Prepare Dataset for LSTM
# -----------------------------
# The model inputs are the momentum columns. A custom dataset groups the rows
# by ticker so that each stock's history becomes one sequence.
feature_cols = list(momentum_periods)

class StockDataset(Dataset):
    """Dataset holding one variable-length sequence per group (per ticker by default).

    Each item is a pair ``(features, targets)`` of float32 tensors built from
    the rows of one group, ordered by ``date_col``:

    * ``features`` has shape ``(sequence_length, len(feature_cols))``
    * ``targets`` has shape ``(sequence_length,)``

    Items are stored in the order in which ``DataFrame.groupby`` yields the
    groups.

    Args:
        df: Long-format DataFrame containing ``group_col``, ``date_col``,
            ``target_col`` and every column named in ``feature_cols``.
        feature_cols: List of column names used as model inputs.
        group_col: Column whose distinct values define the sequences.
        target_col: Column holding the value to predict.
        date_col: Column used to order the rows inside each group.

    Raises:
        KeyError: If one of the named columns is missing from ``df``.
    """

    def __init__(self, df, feature_cols, group_col='Ticker',
                 target_col='target', date_col='Date'):
        self.sequences = []
        self.targets = []
        for _, group in df.groupby(group_col):
            # Order each group chronologically so index t is time step t.
            group = group.sort_values(date_col)
            features = group[feature_cols].to_numpy()  # (sequence_length, num_features)
            target = group[target_col].to_numpy()      # (sequence_length,)
            self.sequences.append(torch.tensor(features, dtype=torch.float))
            self.targets.append(torch.tensor(target, dtype=torch.float))

    def __len__(self):
        """Return the number of sequences (one per group)."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(features, targets)`` tensors of sequence ``idx``."""
        return self.sequences[idx], self.targets[idx]


# Define a collate function to pad sequences to the same length in a batch.
def collate_fn(batch):
    """Combine ``(features, targets)`` pairs of differing lengths into one batch.

    Args:
        batch: Sequence of ``(features, targets)`` tensor pairs, as returned
            by the dataset's ``__getitem__``.

    Returns:
        A tuple ``(features_padded, targets_padded, lengths)``. Both tensors
        are zero-padded along the time axis up to the longest sequence in the
        batch, with the batch dimension first. ``lengths`` is a Python list
        with the unpadded length of every feature sequence, in batch order.
    """
    feature_seqs, target_seqs = zip(*batch)
    padded_features = pad_sequence(feature_seqs, batch_first=True)
    padded_targets = pad_sequence(target_seqs, batch_first=True)
    seq_lengths = list(map(len, feature_seqs))
    return padded_features, padded_targets, seq_lengths


In [None]:
class LSTMModel(nn.Module):
    """LSTM encoder with a linear head that emits one output vector per sequence.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Size of the output vector produced for each sequence.
        dropout: Dropout probability applied between stacked LSTM layers.
            Ignored when ``num_layers`` is 1, because there is no layer
            boundary to apply it to.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout=0.2):
        super().__init__()
        # nn.LSTM only applies dropout between layers; passing a non-zero
        # value with a single layer has no effect and makes PyTorch warn.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        """Encode a batch of padded sequences.

        Args:
            x: Padded input of shape ``(batch, max_len, input_dim)``.
            lengths: Unpadded length of every sequence in ``x`` (list of ints
                or 1-D integer tensor), in batch order.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # Ensure x is on the same device as the model parameters.
        x = x.to(next(self.parameters()).device)

        # pack_padded_sequence needs the lengths on the CPU, whatever device
        # the inputs live on.
        lengths = torch.as_tensor(lengths, dtype=torch.int64, device='cpu')

        # Pack the padded sequences so that the LSTM ignores the padding.
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)

        # hn[-1] is the final hidden state of the top layer for each sequence,
        # taken at that sequence's true last step; use it as the representation.
        return self.fc(hn[-1])

In [None]:
# Set device (MPS if available, otherwise CPU).
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f'Using device: {device}')

# Set hyperparameters.
input_dim = len(feature_cols)   # Number of momentum features.
hidden_dim = 12
num_layers = 2
output_dim = 1
num_epochs = 50
learning_rate = 0.001

# Fix the seed so weight initialisation and batch shuffling are repeatable
# from one "Restart & Run All" to the next.
torch.manual_seed(42)

# Instantiate the model, loss function, and optimizer.
# NOTE(review): this rebinds the name `model`, which the import cell binds to
# the `models` module (`import models as model`); the module is no longer
# reachable under that name after this line.
model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create dataset and DataLoader (one sequence per ticker, padded per batch).
dataset = StockDataset(df, feature_cols=feature_cols)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)

# Training loop.
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for sequences, targets, lengths in dataloader:
        optimizer.zero_grad()

        # Move the padded batch to the training device in one transfer each.
        sequences = sequences.to(device)
        targets = targets.to(device)

        # Forward pass: one prediction per sequence, shape (batch, output_dim).
        outputs = model(sequences, lengths)

        # Pick, for every sequence, the target at its last unpadded step.
        last_step = torch.as_tensor(lengths, device=device) - 1
        batch_index = torch.arange(targets.size(0), device=device)
        last_targets = targets[batch_index, last_step]

        # squeeze(-1) removes only the output dimension. A bare squeeze()
        # would also remove the batch dimension when the final batch holds a
        # single sequence, leaving prediction and target shapes mismatched.
        loss = criterion(outputs.squeeze(-1), last_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean batch loss for the epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader):.4f}')