In [None]:
import numpy as np
import os
import sys
# Put the parent directory on sys.path so the project-local `models` and
# `losses` packages imported below can be resolved.
# NOTE(review): assumes the kernel's working directory sits one level below
# those packages — confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt 
from models.gru import GRU
from losses.correlation_loss import CorrelationLoss
from losses.hybrid_loss import HybridLoss
from models.attention_gru import AttentionGRU
import gc

In [None]:
# Two pipe-separated order-book snapshots, one per calendar day.
train_path = "../datasets/btc-2024-11-24.csv"
test_path = "../datasets/btc-2024-11-25.csv"
# Alternative location of the same files:
# train_path = "/pool/btcavax/binance-usdtfutures/depth-25/btc/2024-11-24.csv"
# test_path = "/pool/btcavax/binance-usdtfutures/depth-25/btc/2024-11-25.csv"

# Read both days and stack them in file order; the per-file frames are never
# bound to a name, so nothing has to be deleted afterwards.
data_df = pd.concat([pd.read_csv(csv_path, sep="|") for csv_path in (train_path, test_path)])

In [None]:
# Positional split: the first 80% of rows train the models, the rest is held out.
# Copies are taken so later column assignments do not touch `data_df`.
split_idx = int(len(data_df) * 0.8)
train_df, test_df = (
    part.copy() for part in (data_df.iloc[:split_idx], data_df.iloc[split_idx:])
)

In [None]:
# Index each split by timestamp and derive a top-of-book order-flow-imbalance
# (OFI) column, then collapse everything onto a regular 1-second grid.
for frame in (train_df, test_df):
    # "time" is interpreted as microseconds since the epoch.
    frame["datetime"] = pd.to_datetime(frame["time"], unit="us")
    frame.set_index("datetime", inplace=True)

    # Row-to-row change of the best bid / best ask price.
    bid_move = frame["bid_0_price"].diff()
    ask_move = frame["ask_0_price"].diff()

    # NOTE(review): the ask-side signs and the strict inequalities differ from
    # the commonly published OFI definition — confirm this is intentional.
    frame["ofi"] = (
        (bid_move > 0) * frame["bid_0_size"]
        - (bid_move < 0) * frame["bid_0_size"].shift(1)
        + (ask_move < 0) * frame["ask_0_size"]
        - (ask_move > 0) * frame["ask_0_size"].shift(1)
    )

# Every column keeps its last value within a second, except OFI which is summed.
agg_dict = dict.fromkeys(train_df.columns, "last")
agg_dict["ofi"] = "sum"

# Forward-fill because there are some seconds where no updates occur.
train_df, test_df = (
    frame.resample("1s").agg(agg_dict).ffill() for frame in (train_df, test_df)
)

In [None]:
import itertools

# Per-second features derived from the 25-level order book, plus the
# next-second log return used as the prediction target.
for dataset in [train_df, test_df]:
    dataset["midprice"] = (dataset["bid_0_price"] + dataset["ask_0_price"]) / 2
    dataset["bid_volume"] = dataset[[f"bid_{i}_size" for i in range(25)]].sum(axis=1)
    dataset["ask_volume"] = dataset[[f"ask_{i}_size" for i in range(25)]].sum(axis=1)
    dataset["total_volume"] = dataset["bid_volume"] + dataset["ask_volume"]
    dataset["spread"] = (dataset["ask_0_price"] - dataset["bid_0_price"]) / dataset["ask_0_price"]
    dataset["log_return"] = np.log(dataset["midprice"]) - np.log(dataset["midprice"].shift(1))
    dataset["ob_imb"] = (dataset["bid_volume"] - dataset["ask_volume"]) / (dataset["total_volume"])
    # Running volume-weighted midprice, accumulated from the first row of each split.
    dataset["vwap"] = (dataset["midprice"] * dataset["total_volume"]).cumsum() / dataset["total_volume"].cumsum()

    # Target: the log return realised over the following second.
    dataset["target_log_return"] = dataset["log_return"].shift(-1)

lagged_features = ["log_return", "vwap", "spread", "midprice"]
lags = [1, 10, 60]

# Build the column names exactly once. Previously the names were appended
# inside a loop that also iterated over both DataFrames, so every name was
# listed twice and the model input contained each lagged column in duplicate.
lagged_col_names = [
    f"{feature}_lag_{lag}" for lag, feature in itertools.product(lags, lagged_features)
]

for dataset in [train_df, test_df]:
    for lag, feature in itertools.product(lags, lagged_features):
        dataset[f"{feature}_lag_{lag}"] = dataset[feature].shift(lag)
    # Drop rows left with NaNs by the shifts above.
    dataset.dropna(inplace=True)

In [None]:
# Model inputs: aggregate book features, the two best levels on each side
# (sizes first, then prices), and finally the lagged columns.
base_cols = ["midprice", "bid_volume", "ask_volume", "total_volume", "spread", "log_return", "ob_imb", "vwap", "ofi"]
book_cols = [
    f"{side}_{level}_{field}"
    for side in ("bid", "ask")
    for field in ("size", "price")
    for level in range(2)
]
feature_cols = base_cols + book_cols + lagged_col_names

# Single regression target.
target_cols = ["target_log_return"]

In [None]:
class BtcDataset(Dataset):
    """Map-style dataset pairing each feature entry with the target at the same index."""

    def __init__(self, features, targets):
        # Both containers are stored as given; they must support len() and
        # integer indexing (this notebook passes tensors).
        self.features, self.targets = features, targets

    def __len__(self):
        """Number of samples, taken from the feature container."""
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

In [None]:
# Create sequences for AttentionGRU
def create_sequences(features, targets, seq_length=5):
    """Slice a time-ordered tensor into overlapping windows with aligned targets.

    Window ``i`` holds ``features[i : i + seq_length]`` and is paired with
    ``targets[i + seq_length]``, i.e. the target of the row right after the
    window.

    Args:
        features: Tensor whose first dimension is time.
        targets: Tensor indexed along the same time axis as ``features``.
        seq_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n, seq_length, *features.shape[1:])``
        and ``y`` has shape ``(n, *targets.shape[1:])`` with
        ``n = len(features) - seq_length``. When the input is too short to
        form a single window, ``n`` is 0 and empty tensors of those shapes are
        returned (previously ``torch.stack`` raised on the empty list).
    """
    n_windows = len(features) - seq_length
    if n_windows <= 0:
        # Keep dtype/device of the inputs so downstream code sees consistent tensors.
        empty_X = features.new_empty((0, seq_length) + tuple(features.shape[1:]))
        empty_y = targets.new_empty((0,) + tuple(targets.shape[1:]))
        return empty_X, empty_y

    X = torch.stack([features[i:i + seq_length] for i in range(n_windows)])
    y = torch.stack([targets[i + seq_length] for i in range(n_windows)])
    return X, y

In [None]:
# Hyper-parameters shared by both models trained in this notebook.
PARAMS = dict(
    sequence_length=20,  # rows per input window
    hidden_dim=64,
    epochs=20,
    batch_size=32,
    lr=0.001,
)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Column subsets for inputs and targets of each split.
train_features_df, train_targets_df = train_df[feature_cols], train_df[target_cols]
test_features_df, test_targets_df = test_df[feature_cols], test_df[target_cols]


def _as_device_tensor(frame):
    """Convert a DataFrame to a float32 tensor placed on ``device``."""
    return torch.tensor(frame.values, dtype=torch.float32).to(device)


# Turn the flat per-second rows into overlapping windows of
# PARAMS["sequence_length"] rows, each paired with the target that follows it.
train_features, train_targets = create_sequences(
    _as_device_tensor(train_features_df),
    _as_device_tensor(train_targets_df),
    PARAMS["sequence_length"],
)
test_features, test_targets = create_sequences(
    _as_device_tensor(test_features_df),
    _as_device_tensor(test_targets_df),
    PARAMS["sequence_length"],
)

In [None]:
# Wrap the windowed tensors so they can be batched.
train_dataset = BtcDataset(train_features, train_targets)
test_dataset = BtcDataset(test_features, test_targets)

# Both loaders iterate in stored order (no shuffling) with the same batch size.
loader_kwargs = {"batch_size": PARAMS["batch_size"], "shuffle": False}
train_loader = DataLoader(train_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [None]:
# Baseline GRU trained with the project's HybridLoss and Adam.
gru_model = GRU(
    input_size=len(feature_cols),
    hidden_size=PARAMS["hidden_dim"],
    output_size=len(target_cols),
    device=device,
).to(device)
criterion = HybridLoss()
optimizer = torch.optim.Adam(gru_model.parameters(), lr=PARAMS["lr"])

n_epochs = PARAMS["epochs"]
for epoch in range(n_epochs):
    gru_model.train()  # Set the gru_model to training mode
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.float(), targets.float()

        optimizer.zero_grad()

        outputs = gru_model(inputs)

        loss = criterion(outputs, targets)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = running_loss / len(train_loader)
    # The epoch count is read into a local first: nesting PARAMS["epochs"]
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    print(f"Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_train_loss:.4f}")

In [None]:
# Attention-based GRU trained with the same loss/optimizer setup, plus
# gradient-norm clipping before every optimizer step.
attention_gru = AttentionGRU(
    len(feature_cols),
    PARAMS["hidden_dim"],
    len(feature_cols),
    len(target_cols),
    device,
).to(device)
criterion = HybridLoss()
optimizer = torch.optim.Adam(attention_gru.parameters(), lr=PARAMS["lr"])

for epoch in range(PARAMS["epochs"]):
    attention_gru.train()
    batch_losses = []

    for seq_batch, target_batch in train_loader:
        optimizer.zero_grad()

        # The same window is passed as both positional inputs; only the first
        # returned value is used, and only its last time step is scored.
        outputs, _ = attention_gru(seq_batch, seq_batch)
        last_step = outputs[:, -1, :]

        loss = criterion(last_step, target_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(attention_gru.parameters(), max_norm=1.0)
        optimizer.step()

        batch_losses.append(loss.item())

    epoch_loss = sum(batch_losses)
    print(f"epoch: {epoch+1}, loss: {epoch_loss/len(train_loader)}")

In [None]:
def _last_step_prediction(model_output, batch_targets):
    """Reduce a raw model output to a tensor with the same rank as ``batch_targets``."""
    # Some models return ``(predictions, extra)``; keep only the predictions.
    if isinstance(model_output, (tuple, list)):
        model_output = model_output[0]
    # One extra dimension compared with the targets means per-time-step
    # predictions: score only the last step.
    if model_output.dim() == batch_targets.dim() + 1:
        model_output = model_output[:, -1]
    return model_output


def evaluate_model(model, test_loader, device, loss_fn=None, forward_fn=None):
    """Run ``model`` over ``test_loader`` without gradients and collect results.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        test_loader: Iterable of ``(batch_features, batch_targets)`` tensor pairs.
        device: Device each batch is moved to before the forward pass.
        loss_fn: Callable ``(predictions, targets) -> scalar tensor``. When
            omitted, the notebook-level ``criterion`` is used, as before.
        forward_fn: Optional callable ``(model, batch_features) -> output`` for
            models that are not called as ``model(batch_features)``, e.g.
            ``lambda m, x: m(x, x)``.
            NOTE(review): the AttentionGRU training loop calls the model with
            two inputs — confirm whether it needs ``forward_fn`` here.

    Returns:
        ``(predictions, targets, mean_batch_loss)`` where the first two are
        NumPy arrays concatenated over all batches.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    if loss_fn is None:
        loss_fn = criterion  # notebook-level loss, kept for backward compatibility

    model.eval()
    total_loss = 0.0
    n_batches = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            # Move data to device
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)

            # Get model predictions
            if forward_fn is None:
                raw_output = model(batch_features)
            else:
                raw_output = forward_fn(model, batch_features)
            predictions = _last_step_prediction(raw_output, batch_targets)

            loss = loss_fn(predictions, batch_targets)
            total_loss += loss.item()
            n_batches += 1

            # Collect predictions and targets
            all_predictions.append(predictions.cpu().numpy())
            all_targets.append(batch_targets.cpu().numpy())

    if n_batches == 0:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Concatenate all predictions and targets
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)

    return all_predictions, all_targets, total_loss / n_batches

def plot_results(y_true, y_pred, feature_name, feature_index):
    """Overlay actual and predicted values of one output column as markers.

    Args:
        y_true: 2-D array of observed values; column ``feature_index`` is plotted.
        y_pred: 2-D array of predictions, indexed the same way.
        feature_name: Label used in the title and on the y-axis.
        feature_index: Column to plot from both arrays.
    """
    plt.figure(figsize=(12, 6))

    # Markers only (no connecting line): circles for actuals, crosses for predictions.
    series = ((y_true, "Actual", 'o'), (y_pred, "Predicted", 'x'))
    for values, legend_label, marker_style in series:
        plt.plot(values[:, feature_index], label=legend_label, linestyle='', marker=marker_style)

    plt.title(f"Actual vs Predicted {feature_name}", fontsize=16)
    plt.xlabel("Timestep", fontsize=14)
    plt.ylabel(feature_name, fontsize=14)
    plt.legend(fontsize=14)
    plt.grid(True)
    plt.show()

In [None]:
# Score the trained AttentionGRU on the test loader; `predictions` and
# `targets` come back as NumPy arrays, `loss` as the mean per-batch loss.
predictions, targets, loss = evaluate_model(
    model=attention_gru,
    test_loader=test_loader,
    device=device,
)
print("loss:", loss)

In [None]:
# Element-wise prediction error (actual minus predicted).
residuals = np.subtract(targets, predictions)

In [None]:
# Correlation-based loss between actual and predicted returns; the NumPy
# arrays are wrapped as tensors before being handed to the loss module.
corr_loss = CorrelationLoss()
targets_t = torch.from_numpy(targets)
predictions_t = torch.from_numpy(predictions)
corr_loss(targets_t, predictions_t)

In [None]:
# Residuals drawn as isolated dots (no connecting line).
plt.plot(residuals, marker='.', linestyle='')

In [None]:
# `targets` and `predictions` are NumPy arrays at this point, while the loss
# is fed tensors everywhere else in this notebook, so convert before calling.
corr_loss(torch.from_numpy(targets), torch.from_numpy(predictions))

In [None]:
# Spread of the prediction errors (population standard deviation).
np.std(residuals)

In [None]:
# Visual comparison for the only target column (index 0).
plot_results(y_true=targets, y_pred=predictions, feature_name="log return", feature_index=0)