In [20]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [21]:
# -----------------------------
# 1. Read and Transform the Data
# -----------------------------
# The CSV file is wide with a MultiIndex on the columns.
# We assume the file's first column is the date (although unnamed) and is used as the index.
# Read the CSV file with a two-level header and set the first column as index.
df_wide = pd.read_csv('data/market_data.csv', header=[0,1], index_col=0)

# Convert the index to datetime (the index holds the dates)
df_wide.index = pd.to_datetime(df_wide.index)

# Transform wide format to long format by stacking the first level of the columns (i.e., tickers)
# After stacking, each row will correspond to a unique date and ticker.
df = df_wide.stack(level=0).reset_index()

# Rename the resulting columns to have a proper 'Date' and 'Ticker' columns.
df.rename(columns={'Timestamp': 'Date', 'level_1': 'Ticker'}, inplace=True)

# For illustration, assume that the only data variable is "Price".
# If there are more variables, adjust feature selection accordingly.
# Check resulting columns:
print(df.columns)  # Expected: ['Date', 'Ticker', 'Price', ...]

  df_wide = pd.read_csv('data/market_data.csv', header=[0,1], index_col=0)
  df = df_wide.stack(level=0).reset_index()


Index(['Date', 'Ticker', 'Ask Close', 'Bid Close', 'PE Ratio', 'Trade Close',
       'Trade High', 'Trade Low', 'Trade Open', 'Trade Volume', 'Turnover',
       'PERATIO', 'BID', 'ASK', 'TRDPRC_1', 'TRNOVR_UNS'],
      dtype='object')


In [22]:
# -----------------------------
# 2. Data Cleaning and Sorting
# -----------------------------
# Remove any duplicate rows and sort by ticker and date.
df.drop_duplicates(inplace=True)
df.sort_values(['Ticker', 'Date'], inplace=True)

# remove columns if there is only missing values
df.dropna(axis=1, how='all', inplace=True)


In [25]:
# -----------------------------
# 3. Create Momentum Features
# -----------------------------
# We work with the "Price" column for momentum calculations.
# Define momentum lookback periods (approximations: 1m ~ 21 days, 3m ~ 63, 6m ~ 126, 12m ~ 252 trading days)
momentum_periods = {'mom_1m': 21, 'mom_3m': 63, 'mom_6m': 126, 'mom_12m': 252}

# Compute percentage change over each period for each ticker separately.
for feature_name, period in momentum_periods.items():
    df[feature_name] = df.groupby('Ticker')['Trade Close'].apply(lambda x: x.pct_change(periods=period))

# Drop rows where momentum features are not defined (if desired).
df.dropna(subset=list(momentum_periods.keys()), inplace=True)


  df[feature_name] = df.groupby('Ticker')['Trade Close'].apply(lambda x: x.pct_change(periods=period))


TypeError: incompatible index of inserted column with frame index

In [None]:
# -----------------------------
# 4. Define the Target Variable
# -----------------------------
# Here we define the target as the next-day return.
df['target'] = df.groupby('Ticker')['Price'].pct_change().shift(-1)
df.dropna(subset=['target'], inplace=True)

In [None]:
# -----------------------------
# 5. Prepare Dataset for LSTM
# -----------------------------
# We create a custom dataset that groups the data by ticker so that each stock's history is a sequence.
feature_cols = list(momentum_periods.keys())

class StockDataset(Dataset):
    def __init__(self, df, feature_cols, group_col='Ticker'):
        self.sequences = []
        self.targets = []
        # Group the data by each ticker
        grouped = df.groupby(group_col)
        for ticker, group in grouped:
            group = group.sort_values('Date')
            # Extract the momentum features and target as numpy arrays.
            features = group[feature_cols].values  # shape: (sequence_length, num_features)
            target = group['target'].values          # shape: (sequence_length,)
            # Convert to torch tensors.
            self.sequences.append(torch.tensor(features, dtype=torch.float))
            self.targets.append(torch.tensor(target, dtype=torch.float))
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]
    # Define a collate function to pad sequences to the same length in a batch.
def collate_fn(batch):
    sequences, targets = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    sequences_padded = pad_sequence(sequences, batch_first=True)
    targets_padded = pad_sequence(targets, batch_first=True)
    return sequences_padded, targets_padded, lengths


In [None]:
# -----------------------------
# 6. Build the LSTM Model in PyTorch
# -----------------------------
# The model will process each sequence and use the last valid hidden state to predict the next-day return.
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x, lengths):
        # Pack the padded sequences so that the LSTM ignores the padding.
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_out, (hn, cn) = self.lstm(packed)
        # Use the last hidden state of the final layer as the sequence representation.
        out = self.fc(hn[-1])
        return out


In [None]:
# -----------------------------
# 7. Training the Model
# -----------------------------
# Set hyperparameters.
input_dim = len(feature_cols)   # Number of momentum features.
hidden_dim = 64
num_layers = 2
output_dim = 1                  # Predicting next-day return (a continuous value).
num_epochs = 50
learning_rate = 0.001

# Instantiate the model, loss function, and optimizer.
model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create dataset and DataLoader.
dataset = StockDataset(df, feature_cols=feature_cols)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)

# Training loop.
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for sequences, targets, lengths in dataloader:
        optimizer.zero_grad()
        # Forward pass: get one prediction per sequence.
        outputs = model(sequences, lengths)
        # Use the last valid target value from each sequence.
        last_targets = torch.stack([target[length-1] for target, length in zip(targets, lengths)])
        loss = criterion(outputs.squeeze(), last_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader):.4f}')