In [None]:
import os
import random
from datetime import datetime as dt, timedelta

import matplotlib.pyplot as plt
import mplfinance as mpf
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pandas import DataFrame as df
from torch.utils.data import IterableDataset, DataLoader, Subset

#### Check device and assign device

In [None]:
# Prefer the GPU when PyTorch reports a usable CUDA runtime, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


#### Assign Seed

In [None]:
# One seed shared by every random number generator the notebook relies on.
seed = 42  # choose any seed you prefer

# PyTorch (CPU generator), then NumPy's legacy global RNG, then the stdlib RNG.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Seed every visible GPU as well when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

##### Build Custom Dataset Class

In [None]:
class PriceDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one CSV file (one calendar day) per index.

    Files are looked up as ``{root_dir}/{item}/{item}-{timespan}-{YYYY-MM-DD}.csv``
    for every day in the inclusive range ``start_date_str`` .. ``end_date_str``;
    days without a file are skipped silently, so ``len(dataset)`` is the number
    of files that exist, not the number of days in the range.

    The class implements ``__len__``/``__getitem__`` (random access), which is
    the map-style protocol, so it derives from ``Dataset`` rather than
    ``IterableDataset`` (that base expects ``__iter__``).

    Args:
        item: Symbol name, used both as sub-directory and as file-name prefix.
        timespan: Candle interval string embedded in the file name (e.g. '1m').
        start_date_str: First day of the range, formatted ``%Y-%m-%d``.
        end_date_str: Last day of the range (inclusive), formatted ``%Y-%m-%d``.
        root_dir: Directory that contains the per-item folders.
        dtype: Torch dtype of the returned tensors.

    Raises:
        ValueError: If a date string does not match ``%Y-%m-%d``.
    """

    def __init__(self, item, timespan, start_date_str, end_date_str,
                 root_dir='csvfiles', dtype=torch.float32):
        self.directory = f'{root_dir}/{item}'
        self.item = item
        self.timespan = timespan
        self.dtype = dtype
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()
        self.dates = [day.strftime("%Y-%m-%d") for day in self.daterange(start_date, end_date)]
        # Positions of the CSV columns that are kept (the files have no header row).
        self.columns = [0, 1, 2, 3, 4, 7]
        self.filenames = self.get_filenames()

    def daterange(self, start_date, end_date):
        """Yield every date from ``start_date`` to ``end_date``, both inclusive."""
        for offset in range((end_date - start_date).days + 1):
            yield start_date + timedelta(offset)

    def get_filenames(self):
        """Return the paths of the per-day CSV files that exist, in date order."""
        candidates = (
            f"{self.directory}/{self.item}-{self.timespan}-{date}.csv"
            for date in self.dates
        )
        return [path for path in candidates if os.path.exists(path)]

    def __len__(self):
        """Number of days for which a CSV file was found."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Load the ``idx``-th available day as a ``(rows, 6)`` tensor.

        NOTE(review): column 0 is later parsed as an epoch-millisecond
        timestamp; float32 has a 24-bit mantissa and cannot represent such
        values exactly, so timestamps are rounded when ``dtype`` is float32.
        Pass ``dtype=torch.float64`` if exact timestamps matter.
        """
        frame = pd.read_csv(self.filenames[idx], usecols=self.columns, header=None)
        return torch.tensor(frame.values, dtype=self.dtype)

# Expected layout: dataset[day_index] is a tensor of shape (rows in that day's CSV file, 6),
# where 6 is the number of CSV columns we keep; len(dataset) is the number of days (files) found.


In [None]:
# Build a dataset over the BTCUSDT 1-minute files and show the second available day.
dataset1 = PriceDataset(
    item='BTCUSDT',
    timespan='1m',
    start_date_str='2021-03-01',
    end_date_str='2023-04-30',
)

print(dataset1[1])



def plot_chart(data):
    """Draw a line chart with a volume panel for one day of candles.

    Args:
        data: 2-D array-like or ``torch.Tensor`` with six columns in the order
            open time (epoch milliseconds), open, high, low, close, volume.
    """
    # Hand pandas a plain NumPy array instead of relying on how it iterates a tensor.
    if isinstance(data, torch.Tensor):
        data = data.detach().cpu().numpy()

    candles = pd.DataFrame(data, columns=['Open time', 'Open', 'High', 'Low', 'Close', 'Volume'])

    # mplfinance needs a DatetimeIndex: parse the millisecond timestamps and index by them.
    candles['Open time'] = pd.to_datetime(candles['Open time'], unit='ms')
    candles = candles.set_index('Open time')

    # The market colours only take effect when they are handed to the style.
    market_colors = mpf.make_marketcolors(up='tab:blue', down='tab:red', inherit=True)
    s = mpf.make_mpf_style(
        base_mpf_style='charles',
        marketcolors=market_colors,
        rc={'figure.facecolor': 'white'},
    )

    mpf.plot(candles, type='line', style=s, volume=True)

# Fetch the tensor for the second available day ...
data = dataset1[1]

# ... and render it.
plot_chart(data)



##### Set Dataset and DataLoader
* Send Dataset and DataLoader to GPU for faster Calculation
* Make Batch for Dataloader
* Divide the data into a training set and a test set

In [None]:
def sliding_window_fn(batch, window_size=30):
    """Collate function: cut every tensor in ``batch`` into overlapping windows.

    For each tensor, every run of ``window_size`` consecutive rows (stride 1)
    becomes one window; windows from all tensors are stacked along a new
    leading dimension. Tensors with fewer than ``window_size`` rows contribute
    no windows.

    Args:
        batch: Iterable of tensors whose first dimension is the row (time) axis
            and whose remaining dimensions agree.
        window_size: Number of rows per window (defaults to 30).

    Returns:
        Tensor of shape ``(total_windows, window_size, *tensor.shape[1:])``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        RuntimeError: If no tensor in ``batch`` is long enough to yield a window.
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')

    windows = [
        tensor[start:start + window_size]
        for tensor in batch
        for start in range(tensor.shape[0] - window_size + 1)
    ]
    if not windows:
        # torch.stack would fail here too; say why instead.
        raise RuntimeError(
            f'no tensor in the batch has at least {window_size} rows; cannot build any window'
        )
    return torch.stack(windows)



# Dataset covering every available BTCUSDT 1-minute day in the range.
dataset = PriceDataset(
    item='BTCUSDT',
    timespan='1m',
    start_date_str='2021-03-01',
    end_date_str='2023-04-30',
)

# Random permutation of the day indices (driven by the seeded stdlib RNG).
indices = list(range(len(dataset)))
random.shuffle(indices)

# First 80% of the shuffled days train the model, the remainder is held out.
split_idx = int(0.8 * len(indices))
train_indices = indices[:split_idx]
test_indices = indices[split_idx:]

# Views onto the dataset restricted to each index list.
train_data, test_data = Subset(dataset, train_indices), Subset(dataset, test_indices)

# One day per batch; the collate function expands that day into sliding windows.
train_loader, test_loader = (
    DataLoader(subset, batch_size=1, collate_fn=sliding_window_fn, shuffle=False, drop_last=True)
    for subset in (train_data, test_data)
)

In [None]:
# Peek at the first batch of each loader (training first, then test) and report its shape.
for loader in (train_loader, test_loader):
    for batch_idx, batch in enumerate(loader):
        print(f'Batch index: {batch_idx}, Batch tensor shape: {batch.shape}')
        break  # one batch per loader is enough; do not iterate the whole dataset





##### Build LSTM Model

In [None]:
class StackedLSTM(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last few time steps.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Hidden size of every LSTM layer.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of values emitted per predicted time step.
        pred_steps: How many trailing time steps are passed through the linear
            head (defaults to 5).

    Raises:
        ValueError: If ``pred_steps`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, pred_steps=5):
        super().__init__()
        if pred_steps < 1:
            raise ValueError(f'pred_steps must be >= 1, got {pred_steps}')
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.pred_steps = pred_steps

        # Recurrent backbone; inputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Linear head mapping each hidden state to the output features.
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Softmax across the output features of each predicted step.
        self.probability = nn.Softmax(dim=-1)

    def forward(self, x):
        """Run the network on ``x`` of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tuple ``(out, probs)``. ``out`` has shape
            ``(batch, min(pred_steps, seq_len), output_dim)`` and holds the
            linear-head outputs for the trailing time steps; ``probs`` is the
            softmax of ``out`` over its last dimension.
        """
        # Zero initial states created directly on x's device and in x's dtype,
        # so no host-to-device copy is needed and non-float32 models work.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)

        # Per-step hidden states of the top layer; the final (h, c) pair is not needed.
        hidden_seq, _ = self.lstm(x, (h0, c0))

        # Keep only the trailing `pred_steps` steps and project them.
        out = self.fc(hidden_seq[:, -self.pred_steps:, :])

        probs = self.probability(out)

        return out, probs



#### Train LSTM Model

In [None]:
# Model, loss, and optimizer
input_dim = 5   # Number of input features (excluding time)
hidden_dim = 150   # Number of hidden neurons in the LSTM layers
layer_dim = 5   # Number of stacked LSTM layers
output_dim = 5  # Number of output features (excluding time)

model = StackedLSTM(input_dim, hidden_dim, layer_dim, output_dim)
model = model.to(device)

criterion = nn.MSELoss()  # Mean squared error: this is a regression problem
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
epochs = 1

# Make sure the model is in training mode even if this cell is re-run after
# the evaluation cell switched it to eval mode.
model.train()

for epoch in range(epochs):

    # Each batch holds all sliding windows of one day: (windows, window_len, 6).
    for i, batch in enumerate(train_loader):
        # Drop the time column (index 0) and move the windows to the device.
        windows = batch[:, :, 1:].to(device)

        # One-step-ahead setup: the model sees every row of the window except
        # the last one, and its output at step t is scored against row t + 1.
        # Scoring against rows that are also fed in as input (as before) lets
        # the network minimise the loss by copying its input.
        features = windows[:, :-1, :]
        targets = windows[:, -5:, :]  # rows that follow the model's last 5 input steps

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: outputs for the last 5 steps of the shortened window.
        output, probs = model(features)

        loss = criterion(output, targets)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Progress report every 100 batches.
        if i % 100 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


#### Model Evaluation

In [None]:
# Inference only from here on: switch the model to evaluation mode.
model.eval()

# Take the first test day, drop the time column, then add a leading batch dimension.
sample = test_data[0][:, 1:].unsqueeze(0).to(device)

# Run the forward pass without tracking gradients.
with torch.no_grad():
    predictions, probabilities = model(sample)


#### Evaluation function using plt

In [None]:
def evaluate_model(model, dataset, indices, window_size=30):
    """Plot actual vs. model-predicted candles for the selected days.

    For every row ``i >= window_size`` of a day, the ``window_size`` rows
    before it are fed to the model and the model's last output step replaces
    row ``i`` in the "predicted" frame. Rows before ``window_size`` keep their
    actual values. Two figures (actual, predicted) are shown per day.

    Args:
        model: Callable returning ``(outputs, probs)`` where ``outputs`` has
            shape ``(batch, steps, 5)``.
        dataset: Indexable collection of ``(rows, 6)`` tensors whose column 0
            is an epoch-millisecond timestamp.
        indices: Indices into ``dataset`` to evaluate.
        window_size: Number of preceding rows given to the model.
    """
    for index in indices:
        # One day of data; column 0 is the timestamp, the rest are features.
        data = dataset[index]

        df_actual = pd.DataFrame(data[:, 1:5].numpy(), columns=['Open', 'High', 'Low', 'Close'])
        df_actual['Volume'] = data[:, -1].numpy()
        df_actual.index = pd.to_datetime(data[:, 0].numpy(), unit='ms')

        # Start from the actual values; predicted rows are overwritten below.
        df_predicted = df_actual.copy()

        features = data[:, 1:].float()
        n_rows = features.shape[0]
        if n_rows > window_size:
            # Build every window at once and run a single forward pass without
            # autograd instead of one tracked model call per row.
            windows = torch.stack(
                [features[end - window_size:end] for end in range(window_size, n_rows)]
            )
            with torch.no_grad():
                predictions, _ = model(windows.to(device))
            # The last output step of each window is the prediction for its next row.
            df_predicted.iloc[window_size:, :5] = predictions[:, -1, :].cpu().numpy()

        for label, frame in (('Actual', df_actual), ('Predicted', df_predicted)):
            fig, axes = plt.subplots(
                nrows=2, ncols=1, sharex=True, gridspec_kw={'height_ratios': [2, 1]}
            )
            mpf.plot(frame, type='line', ax=axes[0], volume=axes[1])
            # Title names the evaluated day (not the inner row counter) and is
            # set on the figure we own.
            fig.suptitle(f'{label} prices for day {index}')
            plt.show()
            plt.close(fig)  # release the figure; many may be created in a loop

# Pick up to 5 random days from the test split. `evaluate_model` indexes the
# full `dataset`, so sample the held-out dataset indices themselves rather
# than positions inside `test_data` (which would mostly hit training days).
indices = random.sample(test_indices, min(5, len(test_indices)))

# Evaluate the model
evaluate_model(model, dataset, indices)



#### Evaluation function using mpf

In [None]:
def evaluate_model(model, dataset, indices, window_size=30):
    """Overlay actual and predicted close prices on one mplfinance chart per day.

    For every row ``i >= window_size`` of a day, the ``window_size`` rows
    before it are fed to the model and the 'Close' component of the model's
    last output step is taken as the prediction for row ``i``. The first
    ``window_size`` rows have no prediction and are left blank (NaN) in both
    overlay lines.

    Args:
        model: Callable returning ``(outputs, probs)`` where ``outputs`` has
            shape ``(batch, steps, 5)`` in the order open, high, low, close, volume.
        dataset: Object exposing ``filenames`` (paths ending in
            ``-YYYY-MM-DD.csv``) and returning a ``(rows, 6)`` tensor per index,
            column 0 being an epoch-millisecond timestamp.
        indices: Indices into ``dataset`` to evaluate.
        window_size: Number of preceding rows given to the model.
    """
    # The market colours only take effect when they are handed to the style.
    market_colors = mpf.make_marketcolors(up='tab:blue', down='tab:red', inherit=True)
    s = mpf.make_mpf_style(
        base_mpf_style='charles',
        marketcolors=market_colors,
        rc={'figure.facecolor': 'white'},
    )

    for index in indices:
        # Take the day from the file name: the dataset skips missing days, so
        # "first date + index days" would drift after the first gap.
        file_stem = os.path.splitext(os.path.basename(dataset.filenames[index]))[0]
        day_label = file_stem[-10:]  # trailing 'YYYY-MM-DD'

        data = dataset[index]

        df_actual = pd.DataFrame(data[:, 1:5].numpy(), columns=['Open', 'High', 'Low', 'Close'])
        df_actual['Volume'] = data[:, -1].numpy()
        df_actual.index = pd.to_datetime(data[:, 0].numpy(), unit='ms')

        features = data[:, 1:].float()
        n_rows = features.shape[0]
        if n_rows <= window_size:
            print(f'Day {day_label}: only {n_rows} rows, need more than {window_size}; skipped')
            continue

        # All windows in one batch, one forward pass, no autograd bookkeeping.
        windows = torch.stack(
            [features[end - window_size:end] for end in range(window_size, n_rows)]
        )
        with torch.no_grad():
            predictions, _ = model(windows.to(device))
        close_idx = 3  # feature order: open, high, low, close, volume
        predicted_tail = predictions[:, -1, close_idx].cpu().numpy()

        # Overlay series must be as long as the main frame, so pad the first
        # `window_size` rows with NaN instead of slicing them off. Filling is
        # positional because the timestamp index may contain duplicates.
        predicted_close = np.full(n_rows, np.nan)
        predicted_close[window_size:] = predicted_tail
        actual_close = df_actual['Close'].to_numpy(dtype=float).copy()
        actual_close[:window_size] = np.nan

        plot_actual = pd.Series(actual_close, index=df_actual.index)
        plot_predicted = pd.Series(predicted_close, index=df_actual.index)

        addplot = [mpf.make_addplot(plot_actual, panel=0, color='b', secondary_y=False),
                   mpf.make_addplot(plot_predicted, panel=0, color='r', secondary_y=False)]

        mpf.plot(df_actual, type='line', style=s, volume=True, title=f'Day {day_label}',
                 ylabel='Price', addplot=addplot)
        
# Pick up to 5 random days from the test split. `evaluate_model` indexes the
# full `dataset`, so sample the held-out dataset indices themselves rather
# than positions inside `test_data` (which would mostly hit training days).
indices = random.sample(test_indices, min(5, len(test_indices)))

# Evaluate the model
evaluate_model(model, dataset, indices)
