In [4]:
#----------------------------------------------------------------------
import torch
from torch.utils.data import DataLoader
from datetime import datetime as dt, timedelta
import pandas as pd
import os
import random
import numpy as np
import torch.nn as nn


# check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 42  # choose any seed you prefer
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

#----------------------------------------------------------------------
# Dataset parameters and Lstm hyperparameters
window_size = 100 # lstm input size

input_window_size = 100

target_window_size = 10 # lstm output size

hidden_size = 1000

num_layers = 4

dropout = 0.1

#----------------------------------------------------------------------
class PriceDataset(torch.utils.data.Dataset):
    def __init__(self, item, timespan, start_date_str, end_date_str):
        self.directory = f'C:/Github/DL-FinalProject/csvfiles/{item}'
        self.item = item
        self.timespan = timespan
        start_date = dt.strptime(start_date_str, '%Y-%m-%d').date()
        end_date = dt.strptime(end_date_str, '%Y-%m-%d').date()
        self.dates = [single_date.strftime("%Y-%m-%d") for single_date in self.daterange(start_date, end_date)]
        self.columns = [1, 4]  # Selecting open and close prices
        self.filenames = self.get_filenames()

    def daterange(self, start_date, end_date):
        for n in range(int((end_date - start_date).days) + 1):
            yield start_date + timedelta(n)

    def __len__(self):
        total_length = 0
        for filename in self.filenames:
            df = pd.read_csv(filename, usecols=self.columns, header=None)
            total_length += len(df)

        # Adjust for the fact that the last few entries in the dataset may not form a complete window
        return max(0, total_length - input_window_size - target_window_size + 1)
    
    def get_filenames(self):
        filenames = []
        for date in self.dates:
            filename = f"{self.directory}/{self.item}-{self.timespan}-{date}.csv"
            if os.path.exists(filename):
                filenames.append(filename)
        return filenames

    def __getitem__(self, idx):
        # Ensure index is within bounds
        if idx < 0 or idx >= len(self):
            raise IndexError("Index out of bounds")

        # Calculate start and end indices for files to read
        start_idx = idx
        end_idx = idx + input_window_size + target_window_size

        # Adjust if the end index goes beyond the dataset
        if end_idx > len(self.filenames):
            end_idx = len(self.filenames)

        tensors = []
        for file_idx in range(start_idx, end_idx):
            filename = self.filenames[file_idx]
            if os.path.exists(filename):
                df = pd.read_csv(filename, usecols=self.columns, header=None)
                if not df.empty:
                    tensor = torch.tensor(df.values, dtype=torch.float)
                    tensors.append(tensor)

        # Check if tensors list is empty
        if not tensors:
            raise ValueError(f"No data found for index {idx}")

        combined_tensor = torch.cat(tensors, dim=0)

        # Here, ensure that combined_tensor has the expected shape
        # and adjust as necessary

        return combined_tensor


def sliding_window_percentage(batch):
    windows_percentage = []
    for tensor in batch:
        total_length = tensor.shape[0]
        for i in range(total_length - input_window_size - target_window_size + 1):
            window = tensor[i:i + input_window_size + target_window_size]
            pct_change = ((window[-target_window_size:, 1] - window[:input_window_size, 0]) * 100 / window[:input_window_size, 0])
            windows_percentage.append(pct_change)

    output_percentage = torch.stack(windows_percentage)
    return output_percentage

#----------------------------------------------------------------------
train_dataset = PriceDataset('BTCUSDT', '1m', '2021-03-01', '2023-04-30')
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, drop_last=True)

test_dataset = PriceDataset('ETHUSDT', '1m', '2021-03-01', '2023-04-30')
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=True)

#----------------------------------------------------------------------
def count_total_windows(dataset):
    total_length = 0
    for filename in dataset.filenames:
        df = pd.read_csv(filename, usecols=dataset.columns, header=None)
        total_length += len(df)

    # Adjust for the fact that the last few entries in the dataset may not form a complete window
    total_windows = max(0, total_length - input_window_size - target_window_size + 1)
    return total_windows

# Example usage
total_train_windows = count_total_windows(train_dataset)
total_test_windows = count_total_windows(test_dataset)

print(f"Total windows in train dataset: {total_train_windows}")
print(f"Total windows in test dataset: {total_test_windows}")




cuda
Total windows in train dataset: 1137937
Total windows in test dataset: 1137937


In [5]:
import torch
import torch.nn as nn
import math

# 포지셔널 인코딩 클래스
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# 시계열 트랜스포머 모델 클래스
class TimeSeriesTransformerModel(nn.Module):
    def __init__(self, num_features, d_model, n_heads, num_encoder_layers, d_ff, dropout_rate, lstm_hidden_size, num_lstm_layers):
        super(TimeSeriesTransformerModel, self).__init__()
        self.d_model = d_model

        # LSTM Layer
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=lstm_hidden_size, num_layers=num_lstm_layers, batch_first=True)
        
        # Linear layer to transform LSTM output to match Transformer d_model size
        self.linear1 = nn.Linear(lstm_hidden_size, d_model)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout_rate)
        
        # Transformer Encoder
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout_rate)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        
        # Output layer
        self.out = nn.Linear(d_model, 1)

    def forward(self, src):
        # LSTM layer
        lstm_out, _ = self.lstm(src)

        # Transform LSTM output to match Transformer d_model size
        src = self.linear1(lstm_out) * math.sqrt(self.d_model)

        # Positional Encoding and Transformer
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        
        # Using the last time step's output
        output = torch.sigmoid(self.out(output[-1]))
        
        return output


In [6]:
import torch.optim as optim

# Initialize the model
num_features = 2  # Based on your dataset, adjust accordingly
d_model = 512  # Transformer's feature size
n_heads = 8  # Number of heads in multi-head attention
num_encoder_layers = 6  # Number of encoder layers in the transformer
d_ff = 2048  # Dimension of the feedforward network
dropout_rate = 0.1
lstm_hidden_size = 1000
num_lstm_layers = 4

model = TimeSeriesTransformerModel(num_features, d_model, n_heads, num_encoder_layers, d_ff, dropout_rate, lstm_hidden_size, num_lstm_layers)
model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Loss Function
criterion = nn.MSELoss()

# Training Loop
def train(model, device, train_loader, optimizer, criterion, epoch):
    model.train()
    total_loss = 0
    for batch_idx, data in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

    avg_loss = total_loss / len(train_loader)
    print(f'====> Epoch: {epoch} Average loss: {avg_loss:.4f}')
    return avg_loss

# Testing Loop
def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            output = model(data)
            test_loss += criterion(output, data).item()

    test_loss /= len(test_loader)
    print(f'====> Test set loss: {test_loss:.4f}')
    return test_loss

# Training and Testing
num_epochs = 2
for epoch in range(1, num_epochs + 1):
    train_loss = train(model, device, train_loader, optimizer, criterion, epoch)
    test_loss = test(model, device, test_loader, criterion)



ValueError: No data found for index 460982