# LSTM prediction

## Access the file in Google Drive

In [1]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/Quantitative_trading_project/hourly_data_prediction'
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Quantitative_trading_project/hourly_data_prediction


In [2]:
# Define device
import torch
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print('Device:', device)

Device: cpu


## Import Packages

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import matplotlib.pyplot as plt

## Data Pre-processing

In [4]:
# Load data
btc_spot = pd.read_csv('data/btc_usdt_spot_data.csv')
btc_futures = pd.read_csv('data/btc_usdt_futures_data.csv')
funding_rate = pd.read_csv('data/btc_usdt_funding_rate.csv')
indicators = pd.read_csv('data/indicators.csv')
true_label = pd.read_csv('data/period_labels.csv')

In [5]:
btc_spot.head()
# print(btc_spot.columns)

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2021-01-01 00:00:00,28923.63,29031.34,28690.17,28995.13,2311.811445,1609463000000.0,66768830.0,58389.0,1215.359238,35103540.0,0.0
1,2021-01-01 01:00:00,28995.13,29470.0,28960.35,29409.99,5403.068471,1609466000000.0,158357800.0,103896.0,3160.041701,92613990.0,0.0
2,2021-01-01 02:00:00,29410.0,29465.26,29120.03,29194.65,2384.23156,1609470000000.0,69842650.0,57646.0,1203.433506,35252750.0,0.0
3,2021-01-01 03:00:00,29195.25,29367.0,29150.02,29278.4,1461.345077,1609474000000.0,42760780.0,42510.0,775.915666,22705550.0,0.0
4,2021-01-01 04:00:00,29278.41,29395.0,29029.4,29220.31,2038.046803,1609477000000.0,59614640.0,55414.0,1003.342834,29346380.0,0.0


In [6]:
# Drop unnecessary columns
columns_to_drop = ['close_time', 'ignore']
btc_spot.drop(columns=columns_to_drop, inplace=True)
btc_futures.drop(columns=columns_to_drop, inplace=True)

In [7]:
# btc_spot.head()
# print(btc_spot.columns)

In [8]:
# Set timestamp as index and convert to datetime
btc_spot['timestamp'] = pd.to_datetime(btc_spot['timestamp'])
btc_spot.set_index('timestamp', inplace=True)
btc_futures['timestamp'] = pd.to_datetime(btc_futures['timestamp'])
btc_futures.set_index('timestamp', inplace=True)
funding_rate['fundingTime'] = pd.to_datetime(funding_rate['fundingTime'])
funding_rate.set_index('fundingTime', inplace=True)
indicators['timestamp'] = pd.to_datetime(indicators['timestamp'])
indicators.set_index('timestamp', inplace=True)
true_label['time'] = pd.to_datetime(true_label['time'])
true_label.set_index('time', inplace=True)

In [9]:
def rename_col(df, name):
  for column in df.columns:
    df.rename(columns={column: f'{name}_{column}'}, inplace=True)
  return df

In [10]:
btc_spot = rename_col(btc_spot, 'btc_spot')
btc_futures = rename_col(btc_futures, 'btc_futures')

In [11]:
# shift because we aim to predict the label for next K bar
true_label = true_label.shift(-1)

# Merge DataFrames on the timestamp index
df = pd.concat([true_label, btc_spot, btc_futures, funding_rate, indicators], axis=1)
df.head()

Unnamed: 0,label,btc_spot_open,btc_spot_high,btc_spot_low,btc_spot_close,btc_spot_volume,btc_spot_quote_asset_volume,btc_spot_number_of_trades,btc_spot_taker_buy_base_asset_volume,btc_spot_taker_buy_quote_asset_volume,...,obv,cmf,williams,parabolic_sar,vwap,fibonacci_0.236,fibonacci_0.382,fibonacci_0.5,fibonacci_0.618,fibonacci_0.764
2021-01-01 00:00:00,,28923.63,29031.34,28690.17,28995.13,2311.811445,66768830.0,58389.0,1215.359238,35103540.0,...,8037.588,,,29015.0,,,,,,
2021-01-01 01:00:00,,28995.13,29470.0,28960.35,29409.99,5403.068471,158357800.0,103896.0,3160.041701,92613990.0,...,27580.989,,,29448.4,,,,,,
2021-01-01 02:00:00,,29410.0,29465.26,29120.03,29194.65,2384.23156,69842650.0,57646.0,1203.433506,35252750.0,...,17531.269,,,29055.0,,,,,,
2021-01-01 03:00:00,,29195.25,29367.0,29150.02,29278.4,1461.345077,42760780.0,42510.0,775.915666,22705550.0,...,23110.721,,,29159.16,,,,,,
2021-01-01 04:00:00,,29278.41,29395.0,29029.4,29220.31,2038.046803,59614640.0,55414.0,1003.342834,29346380.0,...,14759.038,,,29379.41,,,,,,


In [12]:
df.shape

(30265, 48)

In [13]:
# Number of previous hours to include
n_hours = 24

# Create a list to hold the lagged DataFrames
lagged_data = []

# Add the original columns with a suffix indicating the lag (0 for the original values)
for i in range(1, n_hours + 1):
    lagged_df = df.drop(columns='label').shift(i).add_suffix(f'_lag_{i}')
    lagged_data.append(lagged_df)

# Concatenate all the lagged DataFrames along the columns
df_lagged = pd.concat(lagged_data, axis=1)
df = pd.concat([df, df_lagged], axis=1)

# Drop rows with any NaN values (since the first `n_hours` rows will have NaNs)
df.dropna(inplace=True)

In [14]:
df

Unnamed: 0,label,btc_spot_open,btc_spot_high,btc_spot_low,btc_spot_close,btc_spot_volume,btc_spot_quote_asset_volume,btc_spot_number_of_trades,btc_spot_taker_buy_base_asset_volume,btc_spot_taker_buy_quote_asset_volume,...,obv_lag_24,cmf_lag_24,williams_lag_24,parabolic_sar_lag_24,vwap_lag_24,fibonacci_0.236_lag_24,fibonacci_0.382_lag_24,fibonacci_0.5_lag_24,fibonacci_0.618_lag_24,fibonacci_0.764_lag_24
2021-01-03 09:00:00,1.0,34367.35,34588.88,33800.00,34190.55,6052.678972,2.067135e+08,114590.0,2746.050925,9.380894e+07,...,-27258.293,0.159393,-28.948036,29086.738342,29499.116442,29637.80032,29444.65984,29288.56,29132.46016,28939.31968
2021-01-03 10:00:00,1.0,34189.98,34350.00,33403.00,33877.96,6335.226642,2.146157e+08,120518.0,3038.836934,1.029609e+08,...,-14893.311,0.190169,-24.375857,29207.594974,29542.925789,29637.80032,29444.65984,29288.56,29132.46016,28939.31968
2021-01-03 11:00:00,1.0,33877.96,34450.00,33787.55,34413.53,4116.853141,1.405351e+08,92262.0,2255.037874,7.698679e+07,...,-9957.261,0.219656,-17.592966,29311.531678,29562.119123,29637.80032,29444.65984,29288.56,29132.46016,28939.31968
2021-01-03 12:00:00,1.0,34413.53,34600.00,33928.75,34103.72,4546.283481,1.558306e+08,89900.0,2266.261717,7.771550e+07,...,45968.186,0.283496,-14.980817,29400.917243,29866.657284,30409.44032,30068.83984,29793.56,29518.28016,29177.67968
2021-01-03 13:00:00,1.0,34103.73,34385.02,33800.00,33880.00,4373.738376,1.490315e+08,87957.0,2050.560801,6.989017e+07,...,86643.525,0.414243,-0.832283,29646.530000,30150.977263,30928.05880,30488.35060,30132.97,29777.58940,29337.88120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-08 19:00:00,0.0,69454.88,69490.00,69427.88,69460.01,249.356310,1.732177e+07,19643.0,124.499700,8.648333e+06,...,-2973368.200,0.007592,-76.405456,71808.239469,70495.484262,71261.59600,70715.70200,70274.50,69833.29800,69287.40400
2024-06-08 20:00:00,0.0,69460.00,69475.18,69313.31,69372.73,304.034710,2.109851e+07,21153.0,118.578180,8.228474e+06,...,-2965090.886,0.024501,-75.044129,71604.045101,70443.592525,71261.59600,70715.70200,70274.50,69833.29800,69287.40400
2024-06-08 21:00:00,1.0,69372.74,69424.47,69327.04,69404.91,179.287950,1.243825e+07,17777.0,90.065730,6.248395e+06,...,-2968656.371,0.013742,-76.603370,71412.102395,70411.067359,71261.59600,70715.70200,70274.50,69833.29800,69287.40400
2024-06-08 22:00:00,1.0,69404.90,69404.91,69290.28,69338.15,201.178050,1.395030e+07,21774.0,95.240690,6.604005e+06,...,-2964128.017,0.013680,-69.700455,71231.676251,70382.359238,71261.59600,70715.70200,70274.50,69833.29800,69287.40400


In [15]:
df.shape

(28227, 1176)

In [16]:
# Split data into training and testing sets by date 2023-06-30
training_data = df.loc[df.index <= '2023-06-30']
testing_data = df.loc[df.index > '2023-06-30']

# Print shapes to verify the split
print(f'Training data shape: {training_data.shape}')
print(f'Testing data shape: {testing_data.shape}')

Training data shape: (20345, 1176)
Testing data shape: (7882, 1176)


In [17]:
print(training_data)
print(testing_data)

                     label  btc_spot_open  btc_spot_high  btc_spot_low  \
2021-01-03 09:00:00    1.0       34367.35       34588.88      33800.00   
2021-01-03 10:00:00    1.0       34189.98       34350.00      33403.00   
2021-01-03 11:00:00    1.0       33877.96       34450.00      33787.55   
2021-01-03 12:00:00    1.0       34413.53       34600.00      33928.75   
2021-01-03 13:00:00    1.0       34103.73       34385.02      33800.00   
...                    ...            ...            ...           ...   
2023-06-29 20:00:00    1.0       30584.99       30639.90      30250.00   
2023-06-29 21:00:00    1.0       30398.97       30454.76      30366.84   
2023-06-29 22:00:00    1.0       30417.33       30460.00      30395.03   
2023-06-29 23:00:00    1.0       30409.35       30526.42      30392.78   
2023-06-30 00:00:00    1.0       30447.31       30511.38      30400.00   

                     btc_spot_close  btc_spot_volume  \
2021-01-03 09:00:00        34190.55      6052.678972   

In [18]:
# Normalize the dataset
scaler = MinMaxScaler()
training_data = scaler.fit_transform(training_data)
testing_data = scaler.transform(testing_data)

training_data

array([[1.        , 0.35327833, 0.35503642, ..., 0.26036164, 0.26024129,
        0.25985995],
       [1.        , 0.3499309 , 0.35052972, ..., 0.26036164, 0.26024129,
        0.25985995],
       [1.        , 0.34404227, 0.35241631, ..., 0.26036164, 0.26024129,
        0.25985995],
       ...,
       [1.        , 0.27873121, 0.27714114, ..., 0.2791978 , 0.28009434,
        0.28096488],
       [1.        , 0.27858061, 0.27839421, ..., 0.27902177, 0.27995927,
        0.28088104],
       [1.        , 0.27929701, 0.27811047, ..., 0.27862618, 0.27965571,
        0.28069265]])

### Prepared data for LSTM model

In [19]:
# Remove the first column (true labels)
X_train = training_data[:, 1:]
y_train = training_data[:, 0]
X_test = testing_data[:, 1:]
y_test = testing_data[:, 0]

# Reshape to [samples, time steps, features]
X_train = X_train.reshape(X_train.shape[0], 25, -1)
y_train = y_train.reshape(y_train.shape[0], 1)
X_test = X_test.reshape(X_test.shape[0], 25, -1)
y_test = y_test.reshape(y_test.shape[0], 1)

In [20]:
print(X_train)
print(X_train.shape)
print(y_train)
print(y_train.shape)
print(X_test)
print(X_test.shape)
print(y_test)
print(y_test.shape)

[[[0.35327833 0.35503642 0.34589777 ... 0.31593947 0.3068507  0.29523181]
  [0.35487174 0.35657456 0.34509154 ... 0.31593947 0.3068507  0.29523181]
  [0.34572514 0.35860642 0.34471212 ... 0.31593947 0.3068507  0.29523181]
  ...
  [0.26480663 0.26320754 0.26704557 ... 0.26036164 0.26024129 0.25985995]
  [0.26354915 0.26411812 0.2642352  ... 0.26036164 0.26024129 0.25985995]
  [0.2679208  0.26543316 0.26620103 ... 0.26036164 0.26024129 0.25985995]]

 [[0.3499309  0.35052972 0.33840369 ... 0.3174429  0.30871709 0.29754968]
  [0.35327833 0.35503642 0.34589777 ... 0.31593947 0.3068507  0.29523181]
  [0.35487174 0.35657456 0.34509154 ... 0.31593947 0.3068507  0.29523181]
  ...
  [0.26623114 0.28401291 0.26928436 ... 0.27013043 0.26773741 0.26451222]
  [0.26480663 0.26320754 0.26704557 ... 0.26036164 0.26024129 0.25985995]
  [0.26354915 0.26411812 0.2642352  ... 0.26036164 0.26024129 0.25985995]]

 [[0.34404227 0.35241631 0.34566275 ... 0.31853729 0.3100757  0.29923693]
  [0.3499309  0.350529

In [21]:
# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

print(X_train)
print(X_train.shape)

tensor([[[0.3533, 0.3550, 0.3459,  ..., 0.3159, 0.3069, 0.2952],
         [0.3549, 0.3566, 0.3451,  ..., 0.3159, 0.3069, 0.2952],
         [0.3457, 0.3586, 0.3447,  ..., 0.3159, 0.3069, 0.2952],
         ...,
         [0.2648, 0.2632, 0.2670,  ..., 0.2604, 0.2602, 0.2599],
         [0.2635, 0.2641, 0.2642,  ..., 0.2604, 0.2602, 0.2599],
         [0.2679, 0.2654, 0.2662,  ..., 0.2604, 0.2602, 0.2599]],

        [[0.3499, 0.3505, 0.3384,  ..., 0.3174, 0.3087, 0.2975],
         [0.3533, 0.3550, 0.3459,  ..., 0.3159, 0.3069, 0.2952],
         [0.3549, 0.3566, 0.3451,  ..., 0.3159, 0.3069, 0.2952],
         ...,
         [0.2662, 0.2840, 0.2693,  ..., 0.2701, 0.2677, 0.2645],
         [0.2648, 0.2632, 0.2670,  ..., 0.2604, 0.2602, 0.2599],
         [0.2635, 0.2641, 0.2642,  ..., 0.2604, 0.2602, 0.2599]],

        [[0.3440, 0.3524, 0.3457,  ..., 0.3185, 0.3101, 0.2992],
         [0.3499, 0.3505, 0.3384,  ..., 0.3174, 0.3087, 0.2975],
         [0.3533, 0.3550, 0.3459,  ..., 0.3159, 0.3069, 0.

In [22]:
# Create DataLoader
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

## Build LSTM Model

In [23]:
class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=50, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.drop_out = nn.Dropout(p=0.5)
        self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first=True)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation function

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = lstm_out[:, -1, :]
        out = self.drop_out(out)
        linear_out = self.linear(out)
        predictions = self.sigmoid(linear_out)  # Apply sigmoid activation
        return predictions

In [24]:
# hyperparameters
input_size = X_train.shape[2]
hidden_layer_size = 256
output_size = 1 # Binary classification
learning_rate = 0.001

In [25]:
# Initialize model and move it to the GPU
model = LSTM(input_size, hidden_layer_size, output_size).to(device)
loss_function = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training & Testing

In [26]:
# recording losses and accuracies for each epoch
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []

In [None]:
epochs = 50

# Training model
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = loss_function(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # Calculate accuracy
        predicted = (y_pred > 0.5).float()
        correct_train += (predicted == y_batch).sum().item()
        total_train += y_batch.size(0)

    train_losses.append(train_loss / len(train_loader))
    train_accuracy = 100 * correct_train / total_train
    train_accuracies.append(train_accuracy)

    model.eval()
    test_loss = 0.0
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = loss_function(y_pred, y_batch)
            test_loss += loss.item()

            # Calculate accuracy
            predicted = (y_pred > 0.5).float()
            correct_test += (predicted == y_batch).sum().item()
            total_test += y_batch.size(0)

    test_losses.append(test_loss / len(test_loader))
    test_accuracy = 100 * correct_test / total_test
    test_accuracies.append(test_accuracy)

    print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_losses[-1]}, Testing Loss: {test_losses[-1]}, Training Accuracy: {train_accuracy}%, Testing Accuracy: {test_accuracy}%')

Epoch 1/50, Training Loss: 0.6599026514486697, Testing Loss: 0.6600583914803108, Training Accuracy: 63.76996805111821%, Testing Accuracy: 62.91550367926922%
Epoch 2/50, Training Loss: 0.6569106391859505, Testing Loss: 0.6602994408684704, Training Accuracy: 63.819120176947656%, Testing Accuracy: 62.91550367926922%
Epoch 3/50, Training Loss: 0.65577911182582, Testing Loss: 0.6604303910664702, Training Accuracy: 63.819120176947656%, Testing Accuracy: 62.91550367926922%
Epoch 4/50, Training Loss: 0.6559351222039019, Testing Loss: 0.6607177023462921, Training Accuracy: 63.819120176947656%, Testing Accuracy: 62.91550367926922%
Epoch 5/50, Training Loss: 0.6554162576794624, Testing Loss: 0.6606287062168121, Training Accuracy: 63.819120176947656%, Testing Accuracy: 62.91550367926922%
Epoch 6/50, Training Loss: 0.6552746849333715, Testing Loss: 0.6604710595327833, Training Accuracy: 63.819120176947656%, Testing Accuracy: 62.91550367926922%
Epoch 7/50, Training Loss: 0.6553628141962508, Testing 

### Visualize data

In [None]:
# Plotting the training loss
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.show()

In [None]:
# Plotting the training accuracy
plt.figure(figsize=(10, 5))
plt.plot(train_accuracies, label='Training Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Training Accuracy Over Epochs')
plt.legend()
plt.show()

In [None]:
# Plotting the testing loss
plt.figure(figsize=(10, 5))
plt.plot(test_losses, label='Testing Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Testing Loss Over Epochs')
plt.legend()
plt.show()

In [None]:
# Plotting the testing accuracy
plt.figure(figsize=(10, 5))
plt.plot(test_accuracies, label='Testing Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Testing Accuracy Over Epochs')
plt.legend()
plt.show()

## Output result

In [None]:
# Save testing results to CSV
df_pred = pd.DataFrame({'prediction': predictions})
df_pred = df_pred['prediction'].apply(lambda x: x[0])
df_pred.index = df.loc[df.index > '2023-06-30'].index
df_pred

In [None]:
df_pred.to_csv('data/testing_results.csv', index=True)
print('Testing results saved to data/testing_results.csv')