In [7]:
import pandas as pd
import torch
import os
from datetime import datetime as dt, timedelta
from torch.utils.data import DataLoader


class PriceDataset(torch.utils.data.Dataset):
    def __init__(self, item, timespan, start_date_str, end_date_str, input_window_size, target_window_size):
        self.directory = f'C:/Github/DL-FinalProject/csvfiles/{item}'
        self.item = item
        self.timespan = timespan
        self.input_window_size = input_window_size
        self.target_window_size = target_window_size
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()
        self.dates = [single_date.strftime("%Y-%m-%d") for single_date in self.daterange(start_date, end_date)]
        self.columns = [1, 4]  # Selecting open and close prices
        self.filenames = self.get_filenames()

    def daterange(self, start_date, end_date):
        for n in range(int((end_date - start_date).days) + 1):
            yield start_date + timedelta(n)

    def __len__(self):
        total_length = 0
        for filename in self.filenames:
            df = pd.read_csv(filename, usecols=self.columns, header=None)
            total_length += len(df)
        return max(0, total_length - self.input_window_size - self.target_window_size + 1)
    
    def get_filenames(self):
        filenames = []
        for date in self.dates:
            filename = f"{self.directory}/{self.item}-{self.timespan}-{date}.csv"
            if os.path.exists(filename):
                filenames.append(filename)
        return filenames

    def __getitem__(self, idx):
        start_idx = max(0, idx - self.input_window_size - self.target_window_size + 1)
        end_idx = min(idx + 1, len(self.filenames))

        tensors = []
        for file_idx in range(start_idx, end_idx):
            filename = self.filenames[file_idx]
            df = pd.read_csv(filename, usecols=self.columns, header=None)
            tensor = torch.tensor(df.values, dtype=torch.float)
            tensors.append(tensor)

        combined_tensor = torch.cat(tensors, dim=0)
        if combined_tensor.shape[0] >= self.input_window_size + self.target_window_size:
            window = combined_tensor[:self.input_window_size + self.target_window_size]
            pct_change = ((window[-self.target_window_size:, 1] - window[:self.input_window_size, 0]) * 100 / window[:self.input_window_size, 0])
            return pct_change
        else:
            return torch.tensor([])  # Return an empty tensor if there isn't enough data

dataset v4는 sliding window가 백분율을 계산하는 것이 아니라 데이터셋 자체가 백분율을 계산할 수 있게 만듦. 그러나 실패

그래서 완전히 v5으로 넘어가서 dataset을 갈아엎을 예정

In [8]:
# Dataset parameters and Lstm hyperparameters
window_size = 100 # lstm input size

input_window_size = 100

target_window_size = 10 # lstm output size

hidden_size = 1000

num_layers = 4

dropout = 0.1

In [9]:
train_dataset = PriceDataset('BTCUSDT', '1m', '2021-03-01', '2023-04-30', input_window_size=input_window_size, target_window_size=target_window_size)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, drop_last=True)

test_dataset = PriceDataset('ETHUSDT', '1m', '2021-03-01', '2023-04-30', input_window_size=input_window_size, target_window_size=target_window_size)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=True)

In [10]:
def count_total_windows(dataset):
    total_length = 0
    for filename in dataset.filenames:
        df = pd.read_csv(filename, usecols=dataset.columns, header=None)
        total_length += len(df)

    # Adjust for the fact that the last few entries in the dataset may not form a complete window
    total_windows = max(0, total_length - input_window_size - target_window_size + 1)
    return total_windows

# Example usage
total_train_windows = count_total_windows(train_dataset)
total_test_windows = count_total_windows(test_dataset)

print(f"Total windows in train dataset: {total_train_windows}")
print(f"Total windows in test dataset: {total_test_windows}")


Total windows in train dataset: 1137937
Total windows in test dataset: 1137937


In [21]:
print(train_dataset.__getitem__(idx=1))

RuntimeError: The size of tensor a (10) must match the size of tensor b (100) at non-singleton dimension 0

In [12]:
# Assuming PriceDataset is defined and instantiated
# Example: price_dataset = PriceDataset('BTCUSDT', '1m', '2021-03-01', '2023-04-30')

for i in range(len(train_dataset)):
    data = train_dataset[i]
    print(f"Data at index {i}:\n", data)
    print(f"Length of index {i}:\n", len(data))

    # Optional: Break the loop after a few iterations to avoid too much output
    if i >= 10:  # for example, only print the first 10 entries
        break


RuntimeError: The size of tensor a (10) must match the size of tensor b (100) at non-singleton dimension 0