### Import libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from models import *

### Import data

In [2]:
# read data
# df_wide = pd.read_csv('data/market_data.csv', header=[0,1], index_col=0)
df_wide = pd.read_csv('data/trade_df.csv', header=[0,1], index_col=0)

# Convert the index to datetime (the index holds the dates)
df_wide.index = pd.to_datetime(df_wide.index)

# Transform wide format to long format by stacking the first level of the columns (i.e., tickers)
# After stacking, each row will correspond to a unique date and ticker.
df = df_wide.stack(level=0).reset_index()

# Rename the resulting columns to have a proper 'Date' and 'Ticker' columns.
df.rename(columns={'Timestamp': 'Date', 'level_1': 'Ticker'}, inplace=True)

  df = df_wide.stack(level=0).reset_index()


### Preprocess data (should be moved to a separate file)

In [3]:
# Data Cleaning and Sorting
# Remove any duplicate rows and sort by ticker and date.
df.drop_duplicates(inplace=True)
df.sort_values(['Ticker', 'Date'], inplace=True)

# drop rows with missing values # should be moved to cleaning at some point when we start to combine datasets
df.dropna(inplace=True) # happens if there was one data point the first day of a given ticker but not the rest of the values

In [4]:
# 3. Create Momentum Features # should be moved to feature engineering at some point when we start to combine datasets
momentum_periods = {'mom_1m': 1, 'mom_3m': 3, 'mom_6m': 3, 'mom_12m': 12}

# compute percentage change over each period for each ticker separately.
for feature_name, period in momentum_periods.items():
    df[feature_name] = df.groupby('Ticker')['Trade Close'].transform(lambda x: x.pct_change(periods=period))

# drop rows where momentum features are not defined (if desired).
df.dropna(subset=list(momentum_periods.keys()), inplace=True)


In [5]:
# define the target variable
# for testing we define the target as the next-day return.
df['target'] = df.groupby('Ticker')['Trade Close'].pct_change().shift(-1)
# drop na and reset index
df.dropna(subset=['target'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [6]:
display(df)

Unnamed: 0,Date,Ticker,Trade Close,Trade High,Trade Low,Trade Open,Trade Volume,mom_1m,mom_3m,mom_6m,mom_12m,target
0,2002-01-31,AAB.CO,2614.683143,2771.564132,2562.389480,2614.683143,1.063991e+02,0.000000,-0.074074,-0.074074,-0.350649,0.120000
1,2002-02-28,AAB.CO,2928.445121,2928.445121,2562.389480,2719.270469,1.944595e+02,0.120000,0.076923,0.076923,-0.222222,0.125000
2,2002-03-31,AAB.CO,3294.500761,3399.088086,2928.445121,2928.445121,1.694852e+02,0.125000,0.260000,0.260000,-0.059701,-0.063492
3,2002-04-30,AAB.CO,3085.326109,3189.913435,2928.445121,3189.913435,1.992593e+02,-0.063492,0.180000,0.180000,-0.032787,-0.169492
4,2002-05-31,AAB.CO,2562.389480,2928.445121,2510.095818,2928.445121,6.842129e+01,-0.169492,-0.125000,-0.125000,-0.109091,-0.040816
...,...,...,...,...,...,...,...,...,...,...,...,...
45781,2024-10-31,ZELA.CO,790.500000,851.500000,752.000000,813.500000,3.267692e+06,-0.027675,-0.150000,-0.150000,1.699795,-0.076534
45782,2024-11-30,ZELA.CO,730.000000,893.000000,672.500000,789.500000,4.650485e+06,-0.076534,-0.176537,-0.176537,1.172619,-0.019863
45783,2024-12-31,ZELA.CO,715.500000,819.000000,587.000000,728.000000,4.138727e+06,-0.019863,-0.119926,-0.119926,0.917203,0.027952
45784,2025-01-31,ZELA.CO,735.500000,808.000000,682.500000,718.500000,3.538390e+06,0.027952,-0.069576,-0.069576,0.557603,-0.098572


### Prepare data for training

In [7]:
feature_cols = [col for col in df.columns if col not in ['Date','Ticker','target']]
df_norm = df.dropna(subset=feature_cols)

# normalize inputs
X_values = df_norm[feature_cols].values.astype('float32')

# simple normalization: subtract mean, divide std
X_mean, X_std = X_values.mean(axis=0), X_values.std(axis=0)
X_values = (X_values - X_mean) / (X_std + 1e-8) # adding a small epsilon to avoid division by zero

y_values = df_norm['target'].values.astype('float32')

# train-validation-test split:
# set data before 2019 as training data, 2020 as validation and data after 2020 as testing data
train_indices = df_norm['Date'] < '2019-01-01'
val_indices = (df_norm['Date'] >= '2019-01-01') & (df_norm['Date'] < '2020-01-01')
test_indices = df_norm['Date'] >= '2020-01-01'
X_train, y_train = X_values[train_indices], y_values[train_indices]
X_val, y_val = X_values[val_indices], y_values[val_indices]
X_test, y_test = X_values[test_indices], y_values[test_indices]


In [8]:
# Create dataset objects for training and validation
train_dataset = MLPdataset(X_train, y_train)
val_dataset = MLPdataset(X_val, y_val)

# DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

### Train the model

In [None]:
# moving to metal or CUDA GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

Using device: mps


In [10]:
input_dim = X_train.shape[1]
model = MLPModel(input_dim).to(device)

# Set regularization parameters (adjust as needed)
lambda_l1 = 1e-4
lambda_l2 = 1e-4

optimizer = optim.Adam(model.parameters(), lr=0.001)  # Note: weight_decay can also add L2 reg.
criterion = nn.MSELoss()

In [11]:
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch_X, batch_y in train_loader:
        # Move data to the device
        batch_X, batch_y = batch_X.to(device), batch_y.to(device).unsqueeze(1)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Add L1 and L2 regularization losses
        loss += l1_regularization(model, lambda_l1)
        loss += l2_regularization(model, lambda_l2)
        
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * batch_X.size(0)
    train_loss /= len(train_dataset)
    
    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device).unsqueeze(1)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            # Optionally, add reg losses to validation loss if you want to monitor them:
            loss += l1_regularization(model, lambda_l1)
            loss += l2_regularization(model, lambda_l2)
            val_loss += loss.item() * batch_X.size(0)
    val_loss /= len(val_dataset)
    
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

Epoch 1/5 - Train Loss: 0.0355, Validation Loss: 0.0165
Epoch 2/5 - Train Loss: 0.0253, Validation Loss: 0.0114
Epoch 3/5 - Train Loss: 0.0222, Validation Loss: 0.0098
Epoch 4/5 - Train Loss: 0.0213, Validation Loss: 0.0093
Epoch 5/5 - Train Loss: 0.0211, Validation Loss: 0.0092
