In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

df = pd.read_csv("test.csv")

df['duration'] = df['close_timestamp'] - df['open_timestamp']
# Approximate candle volume as mean price * contracts traded
df['volumeUsd'] = df['volume'] * (df['close'] + df['open']) / 2 
df['t'] = pd.to_datetime(df['open_timestamp'], unit='ms')
df = df[['t', 'duration', 'close', 'transactions', 'volumeUsd', 'symbol']]
df.head()

Unnamed: 0,t,duration,close,transactions,volumeUsd,symbol
0,2024-12-31 15:45:00,899999,95440.0,2024,18921880.0,BTC
1,2024-12-31 16:00:00,899999,95507.0,2399,21217180.0,BTC
2,2024-12-31 16:15:00,899999,95020.0,1705,21550680.0,BTC
3,2024-12-31 16:30:00,899999,94901.0,2254,18986600.0,BTC
4,2024-12-31 16:45:00,899999,94902.0,981,10469990.0,BTC


## lstm

The purpose of this notebook is to fit a LSTM model to predict the future returns of a futures contract (ENA), given information about the market (volume and technicals) and the indexes (BTC, ETH).

We will be formatting the data accordingly before fitting an LSTM model to some training set of data, and then evaluating the predictive power of this model on a testing set of data to validate whether it is useful.

This is simply a training exercise, we don't recommend using this code for anything more than this.

In [2]:
# Group the data by symbol to calculate the required technical indicators
grouped = df.groupby('symbol')
timeframes = [1, 2, 4, 8, 16]

def compute_indicators(group):
    for tf in timeframes:
        group[f'return_{tf}'] = group['close'].pct_change(periods=tf)
        group[f'volume_change_{tf}'] = group['volumeUsd'].diff(periods=tf)
        if tf != 1:
            group[f'ema_volume_{tf}'] = group['volumeUsd'].ewm(span=tf, adjust=False).mean()
    
    return group

tdf = grouped.apply(compute_indicators)

# tdf.reset_index(inplace=True)

In [78]:
try:
    tdf.drop(['duration', 'close', 'transactions'], axis=1, inplace=True)
except:
    pass
    
btc = tdf.loc['BTC']
eth = tdf.loc['ETH']
ena = tdf.loc['ENA']

# Take 60% of the data for training
train_split_index = 3 * ena.shape[0] // 5
train_split_time = ena.iloc[train_split_index]['t']

# Rename all columns to add asset prefix
btc = btc.rename(columns=lambda col: f'btc_{col}' if col != 't' else 't')
eth = eth.rename(columns=lambda col: f'eth_{col}' if col != 't' else 't')
ena = ena.rename(columns=lambda col: f'ena_{col}' if col != 't' else 't')

df = btc.merge(eth, on='t', how='outer').merge(ena, on='t', how='outer')

# Specify explicitly the feature which is our Y dependent variable
df['Y'] = df['ena_return_2'].shift(-2)

# Perform a train-test split of the data on the given split time
df_train, df_test = df.loc[df.t <= train_split_time], df.loc[df.t > train_split_time]

In [79]:
features = (
    [f'{asset}_return_{i}' for asset in ['btc', 'eth', 'ena'] for i in timeframes] + 
    [f'{asset}_volume_change_{i}' for asset in ['ena'] for i in timeframes] + 
    [f'{asset}_ema_volume_{i}' for asset in ['ena'] for i in timeframes if i != 1]
)

df_train = df_train[['t'] + features + ['Y']]

df_train.dropna(inplace=True)

In [80]:
sequence_length = 20
X_ = df_train[features].values
Y_ = df_train["Y"].values

X, y = [], []
num_samples = (len(df_train) - sequence_length)

for i in range(0, num_samples):
    X.append(X_[i : i + sequence_length])
    y.append(Y_[i + sequence_length])

# Cast X and Y to numpy objects
X = np.array(X)
y = np.array(y)

In [81]:
X_tensor = torch.tensor(X, dtype=torch.float32)  # Convert to PyTorch tensor
y_tensor = torch.tensor(y, dtype=torch.float32)  # Convert to PyTorch tensor

print(X_tensor.shape, y_tensor.shape)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, shuffle=False)

print(X_train.shape, X_val.shape)  # Confirm shapes

from torch.utils.data import TensorDataset, DataLoader

batch_size = 64  # Define batch size

# Create dataset
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# Create DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("DataLoaders created successfully.")

torch.Size([2965, 20, 24]) torch.Size([2965])
torch.Size([2372, 20, 24]) torch.Size([593, 20, 24])
DataLoaders created successfully.


In [82]:
import torch.nn as nn

class LSTMForecast(nn.Module):
    def __init__(self, input_size, hidden_size=50, num_layers=2, dropout=0.2):
        super(LSTMForecast, self).__init__()
        
        self.lstm = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           batch_first=True,
                            dropout=dropout)
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        
        last_out = lstm_out[:, -1, :]
        
        out = self.fc(last_out)
        
        return out

In [83]:
# Instantiate the model
input_dim = X_train.shape[2]   # number of features (should be 4 in our case)
model = LSTMForecast(input_size=input_dim, hidden_size=10, num_layers=2, dropout=0.2)

# Define loss function and optimizer
criterion = nn.MSELoss()                          # Mean Squared Error loss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [84]:
num_epochs = 50
for epoch in range(num_epochs):
    model.train()  # set model to training mode
    epoch_loss = 0.0
    
    for batch_X, batch_y in train_loader:
        # Forward pass: compute prediction
        pred = model(batch_X)             # pred shape: (batch, 1)
        pred = pred.squeeze(1)            # squeeze to (batch,) to match batch_y shape
        
        loss = criterion(pred, batch_y)   # MSE loss between predicted and actual returns
        
        # Backpropagation and optimization step
        optimizer.zero_grad()             # reset gradients from previous step
        loss.backward()                   # compute gradients of loss w.r.t. model parameters
        optimizer.step()                  # update parameters
        
        epoch_loss += loss.item()
    
    epoch_loss /= len(train_loader)       # average loss over all batches
    
    # Print loss every 10 epochs (or every epoch if desired)
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Training MSE: {epoch_loss:.6f}")


Epoch 10/50, Training MSE: 0.000271
Epoch 20/50, Training MSE: 0.000185
Epoch 30/50, Training MSE: 0.000167
Epoch 40/50, Training MSE: 0.000165
Epoch 50/50, Training MSE: 0.000167


In [85]:
model.eval()  # switch to evaluation mode (disables dropout, etc.)
with torch.no_grad():  # no_grad disables gradient computations, for speed
    # Predict on training data (to assess fit) and test data
    train_pred = model(X_train).squeeze(1).numpy()
    val_pred = model(X_val).squeeze(1).numpy()
    Y_train_true = y_train.numpy()
    Y_val_true = y_val.numpy()

# Calculate evaluation metrics
def r2_score(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred)**2)
    ss_tot = np.sum((y_true - np.mean(y_true))**2)
    return 1 - ss_res/ss_tot

train_mse = np.mean((Y_train_true - train_pred)**2)
val_mse  = np.mean((Y_val_true - val_pred)**2)
train_rmse = np.sqrt(train_mse)
val_rmse  = np.sqrt(val_mse)
val_r2 = r2_score(Y_val_true, val_pred)

print(f"Train MSE: {train_mse:.6f}, Train RMSE: {train_rmse:.6f}")
print(f"Validation MSE: {val_mse:.6f}, Validation RMSE: {val_rmse:.6f}, Validation R^2: {val_r2:.3f}")

Train MSE: 0.000157, Train RMSE: 0.012522
Validation MSE: 0.000149, Validation RMSE: 0.012225, Validation R^2: -0.039


In [86]:
np.where(Y_val_true * val_pred > 0, 1, 0).sum() / len(Y_val_true)

0.5075885328836425

In [87]:
sequence_length = 20

df_test.dropna(inplace=True)

X_ = df_test[features].values
Y_ = df_test["Y"].values

X, y = [], []
num_samples = (len(df_test) - sequence_length)

for i in range(0, num_samples):
    X.append(X_[i : i + sequence_length])
    y.append(Y_[i + sequence_length])

# Cast X and Y to numpy objects
X = np.array(X)
y = np.array(y)

X_tensor = torch.tensor(X, dtype=torch.float32)  # Convert to PyTorch tensor
y_tensor = torch.tensor(y, dtype=torch.float32)  # Convert to PyTorch tensor

model.eval()  # switch to evaluation mode (disables dropout, etc.)
with torch.no_grad():  # no_grad disables gradient computations, for speed
    # Predict on training data (to assess fit) and test data
    test_pred = model(X_tensor).squeeze(1).numpy()
    Y_test_true = y_tensor.numpy()

test_mse  = np.mean((Y_test_true - test_pred)**2)
test_rmse  = np.sqrt(test_mse)
test_r2 = r2_score(Y_test_true, test_pred)

print(f"Test MSE: {test_mse:.6e}, Test RMSE: {test_rmse:.6f}, Test R^2: {test_r2:.3f}")

Test MSE: 1.933912e-04, Test RMSE: 0.013907, Test R^2: -0.056


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.dropna(inplace=True)


In [88]:
test_pred * 

array([-0.00085968, -0.0045571 , -0.00128107, ...,  0.00175717,
        0.00066167,  0.00079842], dtype=float32)