In [19]:
import torch
import torch.nn as nn

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
from copy import deepcopy

In [21]:
# Load the AMZN price CSV; the 'Date' column becomes a parsed datetime index.
df = pd.read_csv(
    'dataset/AMZN.csv',
    parse_dates=True,
    index_col='Date',
)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-01-03,47.47,47.85,46.25,47.58,7582127,AMZN
2006-01-04,47.48,47.73,46.69,47.25,7440914,AMZN
2006-01-05,47.16,48.2,47.11,47.65,5417258,AMZN
2006-01-06,47.97,48.58,47.32,47.87,6154285,AMZN
2006-01-09,46.55,47.1,46.4,47.08,8945056,AMZN


### ✨MinMaxScaler 적용

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Keep every column except the ticker label 'Name'.
# DataFrame.drop already returns a new frame, so no additional copy is made here.
raw_data = df.drop(columns=['Name'])

# Independent scalers for the inputs and for the target, each mapping to [-1, 1].
# NOTE(review): both scalers are fit on the full series before the
# train/validation split below, so validation-period min/max leak into scaling.
X_scaler = MinMaxScaler(feature_range=(-1, 1))
y_scaler = MinMaxScaler(feature_range=(-1, 1))

X_raw_data = X_scaler.fit_transform(raw_data)                        # Open, High, Low, Close, Volume
y_raw_data = y_scaler.fit_transform(raw_data[['Close']].to_numpy())  # Close only, as a single column
print(X_raw_data.shape, y_raw_data.shape)

(3019, 5) (3019, 1)


### ✨X_train 과 y_train 데이터 구성

In [23]:
def prepare_xy(X_raw_data, y_raw_data, lookback):
    """Slice a series into fixed-length input windows and their targets.

    Window ``i`` is ``X_raw_data[i:i + lookback]``; it is paired with
    ``y_raw_data[i + lookback]``, i.e. the row right after the window.

    Args:
        X_raw_data: Sliceable sequence of input rows.
        y_raw_data: Sliceable sequence of targets (assumed to be aligned
            row-for-row with ``X_raw_data``).
        lookback: Number of consecutive rows in each window.

    Returns:
        Tuple ``(windows, targets)`` where ``windows`` is an ``np.ndarray``
        built from the ``len(X_raw_data) - lookback`` windows and
        ``targets`` is ``y_raw_data[lookback:]``.
    """
    window_count = len(X_raw_data) - lookback
    windows = np.array(
        [X_raw_data[start:start + lookback] for start in range(window_count)]
    )
    return windows, y_raw_data[lookback:]

In [24]:
# Each sample is a window of `lookback` consecutive rows; its target is the
# scaled Close of the row immediately after the window.
lookback = 10

# Build the windows/targets and convert both NumPy arrays to float32 tensors.
X_train, y_train = (
    torch.from_numpy(array).float()
    for array in prepare_xy(X_raw_data, y_raw_data, lookback)
)
print(X_train.shape, y_train.shape)

torch.Size([3009, 10, 5]) torch.Size([3009, 1])


### ✨Custom Dataset
> raw data를 텐서로 만들어, 전체 데이터를 딥러닝 모델에 한번에 넣는 경우는 없으므로, 미니배치를 쉽게 구성하고, 반복문을 통해, 미니 배치 하나씩 가져오도록 하는 구성이 필요함. 이를 손쉽게 해주는 기능이 Subset, DataLoader 등의 기능이며, 이를 위해, raw data를 Dataset으로 만들어주어야 함

### ✨Custom Dataset class

- torch.utils.data.Dataset 클래스 상속
- 다음 세가지 메서드가 필요
    - `__init__(self)` : 입력 데이터(x)와 실제 값(y)을 선언해주는 메서드
    - `__len__(self)` : 입력 데이터(x)와 실제 값(y) 길이를 리턴해주는 메서드
    - `__getitem__(self, index)` : index번째 입력 데이터(x)와 실제 값(y)을 리턴해주는 메서드

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each input sample with its target.

    Args:
        X_data: Inputs, indexed along the first dimension. When omitted, the
            notebook-level ``X_train`` is used.
        y_data: Targets with the same length as ``X_data``. When omitted, the
            notebook-level ``y_train`` is used.

    Raises:
        ValueError: If inputs and targets differ in length.
    """

    def __init__(self, X_data=None, y_data=None):
        # Fall back to the notebook globals so the original no-argument call
        # ``CustomDataset()`` keeps working.
        self.X_data = X_train if X_data is None else X_data
        self.y_data = y_train if y_data is None else y_data
        if len(self.X_data) != len(self.y_data):
            raise ValueError(
                f"X_data and y_data must have the same length, "
                f"got {len(self.X_data)} and {len(self.y_data)}"
            )

    # These must be class-level methods: when nested inside __init__ they are
    # only local functions and the Dataset protocol is never implemented.
    def __len__(self):
        """Return the total number of samples."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

In [None]:
# Build the dataset from the notebook-level X_train / y_train tensors
# (CustomDataset reads them in __init__) and show the stored targets
# as the cell output.
train_rawdata = CustomDataset()
train_rawdata.y_data

### ✨TensorDataset
- TensorDataset은 Dataset을 상속한 클래스로 학습 데이터 X와 실제 값 Y를 묶어 놓은 Dataset
- TensorDataset으로 랩핑한 Dataset은 DataLoader로 미니배치를 쉽게 작성할 수 있음

In [25]:
# Wrap the input and target tensors in a TensorDataset. This rebinds
# train_rawdata, replacing the CustomDataset instance created earlier.
train_rawdata = TensorDataset(X_train, y_train)

In [26]:
# tensors[0] is X_train and tensors[1] is y_train; print one shape per line.
print(*(stored.shape for stored in train_rawdata.tensors), sep='\n')

torch.Size([3009, 10, 5])
torch.Size([3009, 1])


### ✨Generate Train & Validation Mini-batch

In [27]:
# Fraction of the samples held out for validation.
VALIDATION_RATE = 0.2

# Split chronologically (shuffle=False): the last VALIDATION_RATE of the
# windows becomes the validation set. Consecutive windows share all but one
# row, so a random split would place near-duplicates of validation samples in
# the training set and make the validation loss over-optimistic. A fixed,
# ordered split is also reproducible across kernel restarts.
train_indices, val_indices = train_test_split(
    range(len(train_rawdata)),  # sample indices into train_rawdata
    test_size=VALIDATION_RATE,  # share of samples assigned to validation
    shuffle=False,
)
train_dataset = Subset(train_rawdata, train_indices)
validation_dataset = Subset(train_rawdata, val_indices)

minibatch_size = 128

# Shuffle only the training batches; evaluation order does not need to be random.
train_batches = DataLoader(train_dataset, batch_size=minibatch_size, shuffle=True)
val_batches = DataLoader(validation_dataset, batch_size=minibatch_size, shuffle=False)

In [28]:
# Fetch a single mini-batch to sanity-check the tensor shapes.
# Dedicated names are used so the full X_train / y_train tensors are not
# overwritten; earlier cells (CustomDataset(), TensorDataset(X_train, y_train))
# read those globals and would otherwise wrap just one batch when re-run.
X_batch, y_batch = next(iter(train_batches))
print(X_batch.shape, y_batch.shape)
# X: (minibatch_size, sequence length, features)   y: (minibatch_size, predicted Close)

torch.Size([128, 10, 5]) torch.Size([128, 1])


### ✨모델 생성

In [29]:
class Net(nn.Module):
    """Recurrent regressor: stacked LSTM or GRU followed by a linear head.

    Args:
        feature_size: Number of features per time step.
        hidden_size: Hidden-state width of the recurrent layers.
        num_layers: Number of stacked recurrent layers.
        dropout_p: Dropout probability applied between recurrent layers.
            Forced to 0 when ``num_layers`` is 1, because there is no
            in-between layer to apply it to and PyTorch warns in that case.
        output_size: Number of values predicted per input sequence.
        model_type: ``'LSTM'`` (default) or ``'GRU'``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    # Supported recurrent layer types, keyed by the public model_type string.
    _RECURRENT_LAYERS = {'LSTM': nn.LSTM, 'GRU': nn.GRU}

    def __init__(self, feature_size, hidden_size, num_layers, dropout_p, output_size, model_type='LSTM'):
        super().__init__()
        # Fail at construction time instead of leaving the recurrent layer
        # undefined and crashing later inside forward().
        if model_type not in self._RECURRENT_LAYERS:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {sorted(self._RECURRENT_LAYERS)}"
            )
        recurrent_cls = self._RECURRENT_LAYERS[model_type]
        # The attribute name is kept as-is so state_dict keys stay unchanged.
        self.sequenceclassifier = recurrent_cls(
            input_size=feature_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_p if num_layers > 1 else 0,
        )
        self.fc = nn.Sequential(
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        """Predict from the recurrent output at the final time step.

        Args:
            x: Tensor of shape (batch, seq_len, feature_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        output, _ = self.sequenceclassifier(x)  # (batch, seq_len, hidden_size)
        last_step = output[:, -1, :]            # (batch, hidden_size)
        return self.fc(last_step)

### ✨input, output, loss, optimizer 설정

In [35]:
# Model hyperparameters.
feature_size = 5    # columns per time step: Open, High, Low, Close, Volume
hidden_size = 32    # width of the recurrent hidden state
num_layers = 2      # stacked recurrent layers
dropout_p = 0       # dropout between recurrent layers (disabled)
output_size = 1     # one predicted value per sequence

# NOTE(review): a saved "optimizer got an empty parameter list" error cannot
# come from a Net that registers its recurrent layer and fc head; it suggests
# the class cell was stale when this ran. Re-run the Net cell first.
model = Net(
    feature_size=feature_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout_p=dropout_p,
    output_size=output_size,
    model_type='GRU',
)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters())

ValueError: optimizer got an empty parameter list