In [19]:
import torch
from torch.utils.data import DataLoader
from datetime import datetime as dt, timedelta
import pandas as pd
import os
import random
import numpy as np
import torch.nn as nn


# check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 42  # choose any seed you prefer
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

cuda


In [20]:
# Dataset parameters and Lstm hyperparameters
window_size = 100 # lstm input size

input_window_size = 100

target_window_size = 10 # lstm output size

hidden_size = 1000

num_layers = 4

dropout = 0.1

input sequence: 100 <- 조절 가능하게 해야함. 그리고 input sequence는 가격 값을 input 받음

output sequence: 10 <- 이것 또한 조절 가능하게 해야함. 그리고 output sequence는 상승/하락 binary를 받음

dataset v2는 전체 csv file을 합친 후에 dataloader에 전달하려고 한 것인데, 메모리 문제로 load하지 못하는 문제가 있었음.

그래서 v3에서는 날짜마다 load 하되, 한 날의 마지막 부분에 data가 짤리는 경우에는 다음 날짜가 존재하는지 확인한 뒤에, 뒤의 data를 가져와서 dataloader에 보내는 것으로 결정.

가격 자체가 중요한 것이 아니기 때문에 가격 변동값을 input으로 넣을 것임.

In [21]:
class PriceDataset(torch.utils.data.Dataset):
    def __init__(self, item, timespan, start_date_str, end_date_str):
        self.directory = f'C:/Github/DL-FinalProject/csvfiles/{item}'
        self.item = item
        self.timespan = timespan
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()
        self.dates = [single_date.strftime("%Y-%m-%d") for single_date in self.daterange(start_date, end_date)]
        self.columns = [1, 4]  # Selecting open and close prices
        self.filenames = self.get_filenames()

    def daterange(self, start_date, end_date):
        for n in range(int((end_date - start_date).days) + 1):
            yield start_date + timedelta(n)

    def __len__(self):
        total_length = 0
        for filename in self.filenames:
            df = pd.read_csv(filename, usecols=self.columns, header=None)
            total_length += len(df)

        # Adjust for the fact that the last few entries in the dataset may not form a complete window
        return max(0, total_length - input_window_size - target_window_size + 1)
    
    def get_filenames(self):
        filenames = []
        for date in self.dates:
            filename = f"{self.directory}/{self.item}-{self.timespan}-{date}.csv"
            if os.path.exists(filename):
                filenames.append(filename)
        return filenames

    def __getitem__(self, idx):
        start_idx = max(0, idx - input_window_size - target_window_size + 1)
        end_idx = min(idx + 1, len(self.filenames))

        tensors = []
        for file_idx in range(start_idx, end_idx):
            filename = self.filenames[file_idx]
            df = pd.read_csv(filename, usecols=self.columns, header=None)
            tensor = torch.tensor(df.values, dtype=torch.float)
            tensors.append(tensor)

        combined_tensor = torch.cat(tensors, dim=0)
        return combined_tensor


def sliding_window_percentage(batch):
    windows_percentage = []
    for tensor in batch:
        total_length = tensor.shape[0]
        for i in range(total_length - input_window_size - target_window_size + 1):
            window = tensor[i:i + input_window_size + target_window_size]
            pct_change = ((window[-target_window_size:, 1] - window[:input_window_size, 0]) * 100 / window[:input_window_size, 0])
            windows_percentage.append(pct_change)

    output_percentage = torch.stack(windows_percentage)
    return output_percentage

In [22]:
train_dataset = PriceDataset('BTCUSDT', '1m', '2021-03-01', '2023-04-30')
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, drop_last=True)

test_dataset = PriceDataset('ETHUSDT', '1m', '2021-03-01', '2023-04-30')
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=True)

In [23]:
def count_total_windows(dataset):
    total_length = 0
    for filename in dataset.filenames:
        df = pd.read_csv(filename, usecols=dataset.columns, header=None)
        total_length += len(df)

    # Adjust for the fact that the last few entries in the dataset may not form a complete window
    total_windows = max(0, total_length - input_window_size - target_window_size + 1)
    return total_windows

# Example usage
total_train_windows = count_total_windows(train_dataset)
total_test_windows = count_total_windows(test_dataset)

print(f"Total windows in train dataset: {total_train_windows}")
print(f"Total windows in test dataset: {total_test_windows}")


Total windows in train dataset: 1137937
Total windows in test dataset: 1137937


In [28]:
# Assuming PriceDataset is defined and instantiated
# Example: price_dataset = PriceDataset('BTCUSDT', '1m', '2021-03-01', '2023-04-30')

for i in range(len(train_dataset)):
    data = train_dataset[i]
    print(f"Data at index {i}:\n", data)
    print(f"Length of index {i}:\n", len(data))

    # Optional: Break the loop after a few iterations to avoid too much output
    if i >= 10:  # for example, only print the first 10 entries
        break


Data at index 0:
 tensor([[45134.1094, 45260.7383],
        [45252.6719, 45356.0000],
        [45356.0000, 45128.5703],
        ...,
        [49622.9414, 49590.7695],
        [49590.7812, 49598.2891],
        [49600.0000, 49587.0312]])
Length of index 0:
 1440
Data at index 1:
 tensor([[45134.1094, 45260.7383],
        [45252.6719, 45356.0000],
        [45356.0000, 45128.5703],
        ...,
        [48271.4297, 48282.4688],
        [48284.5703, 48305.5898],
        [48305.5898, 48440.6484]])
Length of index 1:
 2880
Data at index 2:
 tensor([[45134.1094, 45260.7383],
        [45252.6719, 45356.0000],
        [45356.0000, 45128.5703],
        ...,
        [50499.6914, 50484.4219],
        [50484.4219, 50446.0117],
        [50441.3789, 50349.3711]])
Length of index 2:
 4320
Data at index 3:
 tensor([[45134.1094, 45260.7383],
        [45252.6719, 45356.0000],
        [45356.0000, 45128.5703],
        ...,
        [48536.7305, 48496.2617],
        [48496.2500, 48466.9492],
        [48466.9

In [30]:
# Assuming train_loader is a DataLoader object
for batch in train_loader:
    print(batch)
    break  # Exit after the first batch


RuntimeError: torch.cat(): expected a non-empty list of Tensors

In [24]:
# Creating a simple function to fetch and print the first batch of data from the DataLoader

def print_first_data_batch(data_loader):
    # Fetch the first batch of data
    for data in data_loader:
        print("First batch of data:", data)
        break  # We only want the first batch, so break after fetching it

# Using the function on the train_loader
print_first_data_batch(train_loader)



RuntimeError: torch.cat(): expected a non-empty list of Tensors