In [1]:
import torch
from torch.utils.data import DataLoader
from datetime import datetime as dt, timedelta
import pandas as pd
import os
import random
import numpy as np
import torch.nn as nn


# check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 42  # choose any seed you prefer
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

cuda


In [2]:
# Dataset parameters and Lstm hyperparameters
window_size = 100 # lstm input size

input_window_size = 100

target_window_size = 10 # lstm output size

hidden_size = 1000

num_layers = 4

dropout = 0.1

In [3]:
class PriceDatasetInput(torch.utils.data.Dataset):
    def __init__(self, item, timespan, start_date_str, end_date_str):
        self.directory = f'C:/Github/PricePrediction/csvfiles/{item}'
        self.item = item
        self.timespan = timespan
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()
        self.dates = [single_date.strftime("%Y-%m-%d") for single_date in self.daterange(start_date, end_date)]
        self.columns = [1, 4]  # Selecting open and close prices
        self.filenames = self.get_filenames()

    def daterange(self, start_date, end_date):
        for n in range(int((end_date - start_date).days) + 1):
            yield start_date + timedelta(n)

    def get_filenames(self):
        filenames = []
        for date in self.dates:
            filename = f"{self.directory}/{self.item}-{self.timespan}-{date}.csv"
            if os.path.exists(filename):
                filenames.append(filename)
        return filenames

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        filename = self.filenames[idx]
        df = pd.read_csv(filename, usecols=self.columns, header=None)
        tensor = torch.tensor(df.values, dtype=torch.float)  # Return open and close prices
        if tensor.size(0) > input_window_size:  # If the tensor is long enough
            tensor = tensor[:-input_window_size]  # Remove the last window_size elements
        return tensor

class PriceDatasetTarget(torch.utils.data.Dataset):
    def __init__(self, item, timespan, start_date_str, end_date_str):
        self.directory = f'C:/Github/PricePrediction/csvfiles/{item}'
        self.item = item
        self.timespan = timespan
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()
        self.dates = [single_date.strftime("%Y-%m-%d") for single_date in self.daterange(start_date, end_date)]
        self.columns = [1, 4]  # Selecting open and close prices
        self.filenames = self.get_filenames()

    def daterange(self, start_date, end_date):
        for n in range(int((end_date - start_date).days) + 1):
            yield start_date + timedelta(n)

    def get_filenames(self):
        filenames = []
        for date in self.dates:
            filename = f"{self.directory}/{self.item}-{self.timespan}-{date}.csv"
            if os.path.exists(filename):
                filenames.append(filename)
        return filenames

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        filename = self.filenames[idx]
        df = pd.read_csv(filename, usecols=self.columns, header=None)
        tensor = torch.tensor(df.values, dtype=torch.float)  # Return open and close prices
        if tensor.size(0) > input_window_size:  # If the tensor is long enough
            tensor = tensor[input_window_size:]  # Remove the last window_size elements
        return tensor



def sliding_window_percentage_input(batch):
    windows_percentage = []
    for tensor in batch:
        for i in range(tensor.shape[0] - input_window_size + 1):  # Create windows of size window_size
            window = tensor[i:i+input_window_size]
            pct_change = ((window[:, 1] - window[:, 0]) * 100 / window[:, 0])
            windows_percentage.append(pct_change)
    output_percentage = torch.stack(windows_percentage)

    return output_percentage

def sliding_window_binary_input(batch):
    windows_binary = []
    for tensor in batch:
        for i in range(tensor.shape[0] - input_window_size + 1):  # Create windows of size window_size
            window = tensor[i:i+input_window_size]
            binary_change = (window[:, 1] > window[:, 0]).float()  # Calculate the binary change
            windows_binary.append(binary_change)
    output_binary = torch.stack(windows_binary)

    return output_binary


def sliding_window_percentage_target(batch):
    windows_percentage = []
    for tensor in batch:
        for i in range(tensor.shape[0] - target_window_size + 1):  # Create windows of size window_size
            window = tensor[i:i+target_window_size]
            pct_change = ((window[:, 1] - window[:, 0]) * 100 / window[:, 0])
            windows_percentage.append(pct_change)
    output_percentage = torch.stack(windows_percentage)

    return output_percentage

def sliding_window_binary_target(batch):
    windows_binary = []
    for tensor in batch:
        for i in range(tensor.shape[0] - target_window_size + 1):  # Create windows of size window_size
            window = tensor[i:i+target_window_size]
            binary_change = (window[:, 1] > window[:, 0]).float()  # Calculate the binary change
            windows_binary.append(binary_change)
    output_binary = torch.stack(windows_binary)

    return output_binary


# 인덱스 맞춰봐야함

In [4]:
train_dataset_input = PriceDatasetInput('BTCUSDT', '1m', '2021-03-01', '2023-04-30')
test_dataset_input = PriceDatasetInput('ETHUSDT', '1m', '2021-03-01', '2023-04-30')


percentage_train_loader_input = DataLoader(train_dataset_input, batch_size=1, collate_fn=sliding_window_percentage_input, shuffle=False, drop_last=True)
percentage_test_loader_input = DataLoader(test_dataset_input, batch_size=1, collate_fn=sliding_window_percentage_input, shuffle=False, drop_last=True)

binary_train_loader_input = DataLoader(train_dataset_input, batch_size=1, collate_fn=sliding_window_binary_input, shuffle=False, drop_last=True)
binary_test_loader_input = DataLoader(test_dataset_input, batch_size=1, collate_fn=sliding_window_binary_input, shuffle=False, drop_last=True)



train_dataset_target = PriceDatasetTarget('BTCUSDT', '1m', '2021-03-01', '2023-04-30')
test_dataset_target = PriceDatasetTarget('ETHUSDT', '1m', '2021-03-01', '2023-04-30')


percentage_train_loader_target = DataLoader(train_dataset_target, batch_size=1, collate_fn=sliding_window_percentage_target, shuffle=False, drop_last=True)
percentage_test_loader_target = DataLoader(test_dataset_target, batch_size=1, collate_fn=sliding_window_percentage_target, shuffle=False, drop_last=True)

binary_train_loader_target = DataLoader(train_dataset_target, batch_size=1, collate_fn=sliding_window_binary_target, shuffle=False, drop_last=True)
binary_test_loader_target = DataLoader(test_dataset_target, batch_size=1, collate_fn=sliding_window_binary_target, shuffle=False, drop_last=True)

In [5]:
class PercentagePrediction(nn.Module):
    def __init__(self):
        super(PercentagePrediction, self).__init__()
        self.lstm_pct = nn.LSTM(input_size = input_window_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.fc_pct = nn.Linear(in_features = hidden_size, out_features = target_window_size, dtype=torch.float)  # output layer for percentage prediction

    def forward(self, x, hidden):
        out, hidden = self.lstm_pct(x, hidden)
        out_pct = self.fc_pct(out)  # output for percentage prediction
        return out_pct, hidden

class BinaryPrediction(nn.Module):
    def __init__(self):
        super(BinaryPrediction, self).__init__()
        self.lstm_pct = nn.LSTM(input_size = input_window_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.fc_pct = nn.Linear(in_features = hidden_size, out_features = target_window_size, dtype=torch.float)  # output layer for binary prediction

    def forward(self, x, hidden):
        out, hidden = self.lstm_binary(x, hidden)
        out_binary = self.fc_binary(out)  # output for binary prediction
        return out_binary, hidden


In [6]:
def train(model, train_loader_input, train_loader_target, criterion, optimizer, device):
    model.train()
    total_batches = min(len(train_loader_input), len(train_loader_target))
    loss_sum = 0
    
    # Initialize hidden state
    h0 = torch.zeros(num_layers, hidden_size).to(device)
    c0 = torch.zeros(num_layers, hidden_size).to(device)
    hidden = (h0, c0)

    for i, (batch_input, batch_target) in enumerate(zip(train_loader_input, train_loader_target)):
        inputs = batch_input.to(device)
        targets = batch_target.to(device)

        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)  # Pass the hidden state to the model
        hidden = (hidden[0].detach(), hidden[1].detach())  # Detach the hidden state from its history
        outputs = outputs.squeeze()  # Remove the extra dimension from outputs

        loss = criterion(outputs, targets)
        loss_sum += loss.item()

        loss.backward()
        optimizer.step()

        if (i + 1) % 200 == 0:  # Print after every 200 batches
            avg_loss = loss_sum / (i+1)
            print(f"Training progress: [{i + 1}/{total_batches} Batches], Avg Loss: {avg_loss:.4f}")

    return loss_sum / total_batches




def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    total_batches = len(test_loader)  # Total number of batches
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            batch = batch.to(device)
            inputs = batch[:, :-1, :]
            percentage_targets = batch[:, 1:, 0]  # Get the percentage change targets
            binary_targets = batch[:, 1:, 1]  # Get the binary change targets

            percentage_targets = percentage_targets.reshape(-1)
            binary_targets = binary_targets.reshape(-1)

            percentage_outputs, binary_outputs = model(inputs)  # Get the two outputs
            loss = criterion(percentage_outputs, binary_outputs, percentage_targets, binary_targets)
            total_loss += loss.item()

            if (i + 1) % 200 == 0:  # Print after every 200 batches
                print(f"Testing progress: [{i + 1}/{total_batches} Batches]")
    return total_loss / len(test_loader)  # Return average loss

def train_and_evaluate(model, modelname, train_loader, test_loader, criterion, optimizer, epochs, device):
    best_val_loss = float('inf')

    for epoch in range(epochs):
        train(model, train_loader, criterion, optimizer, device)
        val_loss = evaluate(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} \t {modelname} \t Validation Loss: {val_loss:.10f}")

        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_val_loss:
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'models/{str(modelname)}.pth')
            best_val_loss = val_loss

In [7]:
model = PercentagePrediction().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

criterion = nn.MSELoss()

epochs = 2

for i in range(epochs):
    train(model = model, train_loader_input=percentage_train_loader_input, train_loader_target=percentage_train_loader_target, criterion=criterion, optimizer=optimizer, device = device)

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (1241) must match the size of tensor b (1331) at non-singleton dimension 0

In [None]:
model = PriceChangePrediction().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = CustomCriterion().to(device)

try: 
    # Load the saved models and optimizers
    checkpoint = torch.load('C:/Github/PricePrediction/docker/models/combined_model.pth')

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

except FileNotFoundError:
    print("No checkpoint found. Starting from scratch.")
    hidden = (torch.zeros(num_layers, hidden_size).to(device), torch.zeros(num_layers, hidden_size).to(device))  # need to write code for initializing hidden state tensor

epochs = 1  # or any other number you prefer

train_and_evaluate(model, 'combined_model', train_loader, test_loader, criterion, optimizer, epochs, device)


# ------ Printing Tensors -------

In [7]:
"""
# Size of the Dataset
print(f'Train dataset size: {train_dataset.__getitem__(1)}')
print(f'Test dataset size: {len(train_dataset)}')

# Size of the DataLoader (i.e., number of batches)
print(f'Train dataloader size: {len(binary_train_loader)}')
print(f'Test dataloader size: {len(binary_test_loader)}')

# Size of the tensor output by the Dataset
sample_tensor = train_dataset[0]
print(f'Shape of the tensor output by train_dataset: {sample_tensor.dtype}')
"""
# Size of the tensor output by the DataLoader
for batch in binary_train_loader_input:
    print(f'Shape of the tensor output by train_loader: {batch.size(0)}')
    print(batch[103, :])
    break  # we break after the first batch


for batch in binary_train_loader_target:
    print(f'Shape of the tensor output by train_loader: {batch.dtype}')
    print(batch[3, :])
    break  # we break after the first batch


Shape of the tensor output by train_loader: 1241
tensor([0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1.,
        0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0.,
        1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 1., 1., 1., 1., 0., 0., 0., 0.])
Shape of the tensor output by train_loader: torch.float32
tensor([0., 1., 1., 0., 1., 1., 0., 1., 0., 1.])


# ------ Old Codes ------

In [None]:
def train(model, train_loader, criterion, optimizer, device, feature):
    model.train()
    total_batches = len(train_loader)
    for i, batch in enumerate(train_loader):
        for j in range(batch.shape[0]): # iterate through the first dimension
            inputs = batch[j, :, feature].unsqueeze(0).to(device) # add an extra dimension to match the model's expected input shape
            percentage_targets = batch[j, 1:, 0].reshape(-1) # Get the percentage change targets
            binary_targets = batch[j, 1:, 1].reshape(-1) # Get the binary change targets

            optimizer.zero_grad()
            percentage_outputs, binary_outputs = model(inputs)
            loss = criterion(percentage_outputs, binary_outputs, percentage_targets, binary_targets)
            loss.backward()
            optimizer.step()

        if (i + 1) % 200 == 0:
            print(f"Training progress: [{i + 1}/{total_batches} Batches]")

In [None]:
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_batches = len(train_loader)  # Total number of batches
    for i, batch in enumerate(train_loader):  # Use enumerate to get the index (i)
        for j in range(batch.size(0)):
            batch = batch.to(device)
            inputs = batch[j, :]
            percentage_targets = batch[:, 1:, 0]  # Get the percentage change targets
            binary_targets = batch[:, 1:, 1]  # Get the binary change targets

            percentage_targets = percentage_targets.reshape(-1)
            binary_targets = binary_targets.reshape(-1)

            optimizer.zero_grad()
            percentage_outputs, binary_outputs = model(inputs)  # Get the two outputs
            loss = criterion(percentage_outputs, binary_outputs, percentage_targets, binary_targets)
            loss.backward()
            optimizer.step()

        if (i + 1) % 200 == 0:  # Print after every 200 batches
            print(f"Training progress: [{i + 1}/{total_batches} Batches]")

def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            inputs = batch[:, :-1, :]
            percentage_targets = batch[:, 1:, 0]  # Get the percentage change targets
            binary_targets = batch[:, 1:, 1]  # Get the binary change targets

            percentage_targets = percentage_targets.reshape(-1)
            binary_targets = binary_targets.reshape(-1)

            percentage_outputs, binary_outputs = model(inputs)  # Get the two outputs
            loss = criterion(percentage_outputs, binary_outputs, percentage_targets, binary_targets)
            total_loss += loss.item()
    return total_loss / len(test_loader)  # Return average loss

def train_and_evaluate(model, modelname, train_loader, test_loader, criterion, optimizer, epochs, device):
    best_val_loss = float('inf')

    for epoch in range(epochs):
        train(model, train_loader, criterion, optimizer, device)
        val_loss = evaluate(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} \t {modelname} \t Validation Loss: {val_loss:.10f}")

        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_val_loss:
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'models/{str(modelname)}.pth')
            best_val_loss = val_loss

In [None]:
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_batches = len(train_loader)  # Total number of batches
    for i, batch in enumerate(train_loader):  # Use enumerate to get the index (i)
        for j in range(batch.size(0)):
            batch = batch.to(device)
            inputs = batch[j, :]
            targets = batch[j+window_size, :]  # Get the targets

            optimizer.zero_grad()
            outputs = model(inputs)  # Get the two outputs
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        if (i + 1) % 200 == 0:  # Print after every 200 batches
            print(f"Training progress: [{i + 1}/{total_batches} Batches]")

def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    total_batches = len(test_loader)  # Total number of batches
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            batch = batch.to(device)
            inputs = batch[:, :-1, :]
            percentage_targets = batch[:, 1:, 0]  # Get the percentage change targets
            binary_targets = batch[:, 1:, 1]  # Get the binary change targets

            percentage_targets = percentage_targets.reshape(-1)
            binary_targets = binary_targets.reshape(-1)

            percentage_outputs, binary_outputs = model(inputs)  # Get the two outputs
            loss = criterion(percentage_outputs, binary_outputs, percentage_targets, binary_targets)
            total_loss += loss.item()

            if (i + 1) % 200 == 0:  # Print after every 200 batches
                print(f"Testing progress: [{i + 1}/{total_batches} Batches]")
    return total_loss / len(test_loader)  # Return average loss

def train_and_evaluate(model, modelname, train_loader, test_loader, criterion, optimizer, epochs, device):
    best_val_loss = float('inf')

    for epoch in range(epochs):
        train(model, train_loader, criterion, optimizer, device)
        val_loss = evaluate(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} \t {modelname} \t Validation Loss: {val_loss:.10f}")

        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_val_loss:
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'models/{str(modelname)}.pth')
            best_val_loss = val_loss

In [None]:
class CustomCriterion(nn.Module):
    def __init__(self, weights=(1.0, 1.0)):
        super().__init__()
        self.loss_fn_pct = nn.MSELoss()
        self.loss_fn_binary = nn.BCEWithLogitsLoss()
        self.weights = weights

    def forward(self, percentage_outputs, binary_outputs, percentage_targets, binary_targets):
        loss_pct = self.loss_fn_pct(percentage_outputs, percentage_targets)
        loss_binary = self.loss_fn_binary(binary_outputs, binary_targets)
        return self.weights[0] * loss_pct + self.weights[1] * loss_binary


In [9]:
test = []
for i in range(100):
    test.append(i)

print(test)
print(test[80:])
print(test[:80])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
