In [1]:
import torch
import pandas as pd
from datetime import datetime as dt
from torch.utils.data import Dataset, DataLoader
import os

class PriceDataset(Dataset):
    """Sliding-window dataset of per-row open-to-close percentage changes.

    CSV files are read from ``<data_dir>/<item>/``. Each file name is expected
    to end in a ``YYYY-MM-DD`` date after its second ``-`` (for example
    ``BTCUSDT-1m-2021-03-01.csv``). Files whose date falls inside the requested
    range are concatenated in chronological order, and every row is reduced to
    ``(close - open) * 100 / open``.

    Item ``i`` is the pair ``(input, target)``: ``input`` holds the
    ``input_window_size`` percentage changes starting at row ``i`` and
    ``target`` holds the ``target_window_size`` changes that follow it.

    Args:
        item: Name of the sub-directory holding the CSV files.
        timespan: Stored on the instance as ``self.timespan``; it is not used
            to select files.
        start_date_str: First file date to include, ``'YYYY-MM-DD'`` (inclusive).
        end_date_str: Last file date to include, ``'YYYY-MM-DD'`` (inclusive).
        input_window_size: Number of rows in each input window.
        target_window_size: Number of rows in each target window.
        data_dir: Optional root directory containing the ``item`` folder. When
            omitted, the original hard-coded location is used.

    Raises:
        ValueError: If a date string is malformed or no file falls in the range.
    """

    def __init__(self, item, timespan, start_date_str, end_date_str,
                 input_window_size, target_window_size, data_dir=None):
        if data_dir is None:
            # Default kept for backward compatibility with existing callers.
            self.directory = f'C:/Github/DL-FinalProject/csvfiles/{item}/'
        else:
            self.directory = os.path.join(data_dir, item)
        self.timespan = timespan
        self.input_window_size = input_window_size
        self.target_window_size = target_window_size
        # Positional CSV columns to read; the first is treated as 'open' and
        # the second as 'close' (assumption about the file layout).
        self.columns = [1, 4]
        self.data = self.load_data(start_date_str, end_date_str)

        # Compute the percentage-change series once instead of re-slicing the
        # DataFrame and redoing the arithmetic on every __getitem__ call.
        open_prices = self.data.iloc[:, 0]
        close_prices = self.data.iloc[:, 1]
        self._pct_changes = ((close_prices - open_prices) * 100 / open_prices).values

    def load_data(self, start_date_str, end_date_str):
        """Read and concatenate all in-range CSV files in chronological order.

        Returns:
            A DataFrame with the two columns selected by ``self.columns``.

        Raises:
            ValueError: If a date string is malformed or no file name carries a
                date inside ``[start_date_str, end_date_str]``.
        """
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()

        dated_files = []
        for filename in os.listdir(self.directory):
            # Extract date from filename: everything after the second '-',
            # up to the first '.'.
            file_date_str = '-'.join(filename.split('-')[2:]).split('.')[0]
            try:
                file_date = dt.strptime(file_date_str, '%Y-%m-%d').date()
            except ValueError:
                # Not a dated price file (stray file, sub-directory, ...).
                continue
            if start_date <= file_date <= end_date:
                dated_files.append((file_date, filename))

        if not dated_files:
            raise ValueError(
                f"No files dated between {start_date_str} and {end_date_str} "
                f"found in {self.directory!r}"
            )

        # os.listdir gives no ordering guarantee; windows only make sense when
        # the rows are in chronological order.
        dated_files.sort()

        # NOTE(review): read_csv's default treats the first row of every file
        # as a header. If the files are header-less this drops one row per
        # file -- confirm against the actual data.
        frames = [
            pd.read_csv(os.path.join(self.directory, filename), usecols=self.columns)
            for _, filename in dated_files
        ]
        return pd.concat(frames, ignore_index=True)

    def __len__(self):
        """Number of complete (input, target) windows; never negative."""
        window = self.input_window_size + self.target_window_size
        return max(0, len(self.data) - window + 1)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` float32 tensors for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError("Index out of bounds")

        split = idx + self.input_window_size
        stop = split + self.target_window_size
        input_data = torch.tensor(self._pct_changes[idx:split], dtype=torch.float32)
        target_data = torch.tensor(self._pct_changes[split:stop], dtype=torch.float32)
        return input_data, target_data

In [2]:
# Window and batching settings used by the dataset and dataloader cells below
batch_size = 1    # samples per DataLoader batch
target_size = 5   # passed as target_window_size
input_size = 10   # passed as input_window_size

In [3]:
# Build the windowed dataset for 'BTCUSDT' covering 2021-03-01 through 2021-12-01
dataset = PriceDataset(
    item='BTCUSDT',
    timespan='1m',
    start_date_str='2021-03-01',
    end_date_str='2021-12-01',
    input_window_size=input_size,
    target_window_size=target_size,
)

# Wrap it in a DataLoader that yields batches in dataset order (no shuffling)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [5]:
import matplotlib.pyplot as plt

# # Visualizing the Data Distribution
# # Plotting the percentage changes histogram
# all_percentage_changes = []
# for data in dataset:
#     all_percentage_changes.extend(data.tolist())

# plt.figure(figsize=(10, 6))
# plt.hist(all_percentage_changes, bins=50, color='blue', alpha=0.7)
# plt.title('Distribution of Percentage Changes')
# plt.xlabel('Percentage Change')
# plt.ylabel('Frequency')
# plt.show()

# # Plotting the line chart of prices
# plt.figure(figsize=(10, 6))
# plt.plot(dataset.data.iloc[:, 1], label='Close Price')  # Assuming second column is 'close' prices
# plt.title('Close Prices Over Time')
# plt.xlabel('Time')
# plt.ylabel('Price')
# plt.legend()
# plt.show()


# Show the first 5 items yielded by iterating the dataset directly
print("Dataset Samples:")
for i, data in enumerate(dataset):
    if i == 5:
        break
    print(f"Sample {i}: {data}")

# Show the first 5 batches produced by the DataLoader
print("\nDataLoader Batches:")
for i, batch in enumerate(dataloader):
    if i == 5:
        break
    print(f"Batch {i}: {batch}")

Dataset Samples:
Sample 0: tensor([ 0.2283, -0.5014, -0.2021, -0.0092, -0.1460,  0.0345,  0.1332,  0.4663,
         0.0309, -0.1609,  0.1494,  0.0324, -0.0498,  0.4723,  0.2650])
Sample 1: tensor([-0.5014, -0.2021, -0.0092, -0.1460,  0.0345,  0.1332,  0.4663,  0.0309,
        -0.1609,  0.1494,  0.0324, -0.0498,  0.4723,  0.2650,  0.5988])
Sample 2: tensor([-0.2021, -0.0092, -0.1460,  0.0345,  0.1332,  0.4663,  0.0309, -0.1609,
         0.1494,  0.0324, -0.0498,  0.4723,  0.2650,  0.5988,  0.6240])
Sample 3: tensor([-0.0092, -0.1460,  0.0345,  0.1332,  0.4663,  0.0309, -0.1609,  0.1494,
         0.0324, -0.0498,  0.4723,  0.2650,  0.5988,  0.6240, -0.0081])
Sample 4: tensor([-0.1460,  0.0345,  0.1332,  0.4663,  0.0309, -0.1609,  0.1494,  0.0324,
        -0.0498,  0.4723,  0.2650,  0.5988,  0.6240, -0.0081,  0.0191])

DataLoader Batches:
Batch 0: tensor([[ 0.2283, -0.5014, -0.2021, -0.0092, -0.1460,  0.0345,  0.1332,  0.4663,
          0.0309, -0.1609,  0.1494,  0.0324, -0.0498,  0.4723,