<a href="https://colab.research.google.com/github/jiwoong2/deeplearning/blob/main/Short_term_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Module

In [None]:
!pip install yfinance
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import torch
from sklearn.preprocessing import StandardScaler
from torch import nn, optim
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
sys.path.append("/content/drive/MyDrive/Colab Notebooks")
import yfinance as yf
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters

In [None]:
# Loss function and training hyperparameters shared by the cells below.
criterion = nn.CrossEntropyLoss()
LR = 1e-7 # Author's note: with -1 (presumably an exponent of -1, i.e. 1e-1) training collapses at once into a local minimum that outputs only 0 or only 1.
EPOCH = 400
batch_size = 70 # With a small dataset, too large a batch size causes problems: watch for train/validation loss inversion, a smaller drop in loss, etc.
               # A larger batch size puts more data on the GPU at once, so training gets faster.
               # Batch size seems to play a very important role; the reason is still to be investigated.

# Short term model data

In [None]:
# Load 25 years of Apple (AAPL) price history and turn the index back into
# an ordinary column, leaving a fresh default index.
apple = yf.Ticker("AAPL").history(period="25y").reset_index()

In [None]:
# Discard the date, dividend and stock-split columns.
apple.drop(columns=['Date', 'Dividends', 'Stock Splits'], inplace=True)

In [None]:
# Convert the DataFrame to a NumPy array.
apple = apple.to_numpy()

In [None]:
# Display the shape of the converted array.
apple.shape

In [None]:
def data_generator(Size:int, data, interval:int):
    """Sample random fixed-length windows from ``data`` and label each one up/down.

    Args:
        Size: Number of windows to draw. Start rows are drawn without
            replacement, so this must not exceed ``len(data) - interval - 6``
            (``np.random.choice`` raises ``ValueError`` otherwise).
        data: The source 2-D array, indexed as ``data[row, column]``.
        interval: Length of each window in rows, e.g. 30 means 30 rows are
            looked at to predict the price move over the following 5 rows.

    Returns:
        ``(g_data, g_label)``: ``g_data`` is a float32 array holding the
        windows ``data[i:i+interval, :]``; ``g_label`` is an int64 array with
        1 where the percentage change of column 2 between row
        ``i+interval+1`` and row ``i+interval+6`` is >= 0, and 0 where it
        is < 0.

    Windows whose change is NaN (NaN input or 0/0) match neither class. They
    are skipped entirely so that ``g_data`` and ``g_label`` always have the
    same length; fewer than ``Size`` samples may therefore be returned.

    NOTE(review): the label reads column index 2 and starts one row after the
    first row that follows the window; confirm both are intended (which
    column holds the closing price depends on the caller's column order).
    """
    # The furthest row read for a start index i is i + interval + 6, so the
    # upper bound keeps every access inside the array.
    starts = np.random.choice(range(0, len(data)-interval-6), size=Size, replace=False)

    windows = []
    labels = []

    for start in starts:
        base = data[start+interval+1, 2]
        later = data[start+interval+6, 2]
        change = (later - base) / base * 100

        if change >= 0:
            label = 1
        elif change < 0:
            label = 0
        else:
            # NaN compares False both ways: drop the window instead of
            # appending data without a matching label.
            continue

        windows.append(data[start:start+interval, :])
        labels.append(label)

    g_data = np.array(windows).astype(np.float32)
    # Explicit int64 keeps the labels usable as class indices on every platform.
    g_label = np.array(labels, dtype=np.int64)

    return g_data, g_label

In [None]:
# Draw 5000 samples, each a 20-row window of the Apple array, with their labels.
short_term_data, short_term_label = data_generator(5000, apple, 20)

In [None]:
def data_scaler(data):
    """Standardize ``data`` in place, one sample at a time.

    Every ``data[i]`` is passed through its own freshly fitted
    ``StandardScaler`` and written back into the same slot. Author's
    rationale: stock prices grow exponentially, so each sample is scaled
    separately rather than with one global scaler.

    Args:
        data: Array indexed as ``data[sample, :, :]``; it is modified in place.

    Returns:
        None.
    """
    for index, window in enumerate(data):
        data[index, :, :] = StandardScaler().fit_transform(window)

In [None]:
# Standardize every generated sample in place.
data_scaler(short_term_data)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample with the label at the same index."""

    def __init__(self, data, labels):
        # Both containers are stored as given; nothing is copied or converted.
        self.data, self.labels = data, labels

    def __len__(self):
        """Return the number of samples, measured on ``data`` only."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the samples and their labels in a dataset instance.
short_term_dataset = CustomDataset(short_term_data, short_term_label)

# Split into train / validation / test subsets. Splitting the Dataset itself
# keeps every sample together with its label.
# Stage 1: 80% for training, the remaining 20% held out.
train_size = int(0.8 * len(short_term_dataset))
test_size = len(short_term_dataset) - train_size
short_term_train_dataset, short_term_test_dataset = random_split(
    short_term_dataset, [train_size, test_size]
)

# Stage 2: halve the held-out part into test (10%) and validation (10%).
test_size = len(short_term_test_dataset) // 2
val_size = len(short_term_test_dataset) - test_size
short_term_test_dataset, short_term_val_dataset = random_split(
    short_term_test_dataset, [test_size, val_size]
)

# One shuffling DataLoader per subset, all using the shared batch size.
short_term_train_DL, short_term_val_DL, short_term_test_DL = (
    DataLoader(subset, batch_size=batch_size, shuffle=True)
    for subset in (short_term_train_dataset, short_term_val_dataset, short_term_test_dataset)
)

# Peek at the first training batch to check the tensors coming out of the loader.
for features, labels in short_term_train_DL:
    print(features.shape, labels, sep="\n")
    break

In [None]:
# Report the number of samples in the train, test and validation subsets, in that order.
for loader in (short_term_train_DL, short_term_test_DL, short_term_val_DL):
    print(len(loader.dataset))

# Short term model

In [None]:
class ShortTermModel(nn.Module):
    """Four-stage convolutional classifier producing two logits per sample.

    Each convolution's kernel spans the full height of its input, so its
    output has height 1. Between stages the output channels are moved onto
    the height axis, which lets the next convolution look at all of them
    at once.

    The layer sizes are hard-coded: an input of shape ``(batch, 20, 5)``
    yields exactly the 640 features expected by the linear head
    (160 channels x width 4 after ``conv4``).
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Sequential(nn.Conv2d(1, 20, 5, stride=1),
                                   nn.BatchNorm2d(20),
                                   nn.GELU())

        self.conv2 = nn.Sequential(nn.Conv2d(1, 40, kernel_size=(20, 5), stride=1),
                                   nn.BatchNorm2d(40),
                                   nn.GELU())

        self.conv3 = nn.Sequential(nn.Conv2d(1, 80, kernel_size=(40, 5), stride=1),
                                   nn.BatchNorm2d(80),
                                   nn.GELU())

        self.conv4 = nn.Sequential(nn.Conv2d(1, 160, kernel_size=(80, 5), stride=1),
                                   nn.BatchNorm2d(160),
                                   nn.GELU())

        self.linear = nn.Sequential(nn.Linear(640,100), nn.GELU(),
                                    nn.Linear(100,2))

    @staticmethod
    def _channels_to_height(x):
        """Reshape ``(B, C, 1, W)`` to ``(B, 1, C, W)``.

        Only the height axis (dim 2) is squeezed. A bare ``squeeze()`` would
        also remove the batch axis when the batch holds a single sample and
        break the following convolution.
        """
        return x.squeeze(2).unsqueeze(1)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, rows, columns)`` to ``(batch, 2)`` logits."""
        # (B, rows, cols) -> (B, 1, cols, rows): one image channel, with the
        # original columns on the height axis and the rows on the width axis.
        x = x.permute(0,2,1).unsqueeze(1)

        x = self._channels_to_height(self.conv1(x))
        x = self._channels_to_height(self.conv2(x))
        x = self._channels_to_height(self.conv3(x))

        # No reshuffle is needed after the last stage: flattening (B, C, 1, W)
        # gives the same feature order as flattening (B, 1, C, W).
        x = self.conv4(x)
        x = torch.flatten(x, start_dim=1)

        return self.linear(x)

In [None]:
# Create the model on the selected device and an Adam optimizer over its parameters, using learning rate LR.
model = ShortTermModel().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr = LR)

# Trainer

In [None]:
def loss_epoch(model, DL, criterion, optimizer = None):
    """Run one pass over ``DL`` and return its loss and accuracy.

    Args:
        model: Network called on every batch.
        DL: Loader yielding ``(inputs, targets)`` batches; ``len(DL.dataset)``
            is used as the sample count for averaging.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: When given, the weights are updated after every batch;
            when ``None`` the pass only measures loss and accuracy.

    Returns:
        ``(epoch_loss, accuracy_percent, correct_count)``.
    """
    total = len(DL.dataset)
    loss_sum = 0
    hit_count = 0
    is_training = optimizer is not None

    for inputs, targets in DL:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

        # Forward pass and batch loss.
        logits = model(inputs)
        batch_loss = criterion(logits, targets)

        if is_training:
            optimizer.zero_grad()  # reset so gradients do not accumulate
            batch_loss.backward()
            optimizer.step()

        # Weight by the actual batch length: the last batch may be shorter
        # than the configured batch size.
        loss_sum += batch_loss.item() * inputs.shape[0]
        hit_count += (logits.argmax(dim=1) == targets).sum().item()

    return loss_sum / total, hit_count / total * 100, hit_count

def Train(model, train_DL, val_DL, criterion, optimizer, EPOCH):
    """Train for ``EPOCH`` epochs, validating after each one.

    Returns:
        ``(loss_history, acc_history)``: two dicts with keys ``"train"`` and
        ``"val"``, each holding one value per epoch.
    """
    phases = ("train", "val")
    loss_history = {phase: [] for phase in phases}
    acc_history = {phase: [] for phase in phases}

    for _epoch in tqdm(range(EPOCH), leave=False):
        # Training pass: passing the optimizer makes loss_epoch update the weights.
        model.train()
        train_loss, train_acc, _ = loss_epoch(model, train_DL, criterion, optimizer)
        loss_history["train"].append(train_loss)
        acc_history["train"].append(train_acc)

        # Validation pass: evaluation mode, no gradients, no optimizer.
        model.eval()
        with torch.no_grad():
            val_loss, val_acc, _ = loss_epoch(model, val_DL, criterion)
        loss_history["val"].append(val_loss)
        acc_history["val"].append(val_acc)

    return loss_history, acc_history

def Test(model, test_DL, criterion):
    """Evaluate ``model`` on ``test_DL``, print a short report and return the accuracy.

    Returns:
        The test accuracy in percent, rounded to one decimal place.
    """
    model.eval()
    with torch.no_grad():
        test_loss, test_acc, rcorrect = loss_epoch(model, test_DL, criterion)

    # A leading empty entry reproduces the blank line printed before the report.
    report = (
        "",
        f"Test loss: {round(test_loss,5)}",
        f"Test accuracy: {rcorrect}/{len(test_DL.dataset)} ({round(test_acc,1)} %)",
    )
    print(*report, sep="\n")
    return round(test_acc,1)

# Experiment

In [None]:
# Train the model and keep the per-epoch loss and accuracy histories for plotting.
loss, acc = Train(model, short_term_train_DL, short_term_val_DL, criterion, optimizer, EPOCH)

In [None]:
# Plot the training and validation loss for every epoch (x axis starts at 1).
for split, legend in (('train', 'Train Loss'), ('val', 'Validation Loss')):
    plt.plot(range(1, EPOCH + 1), loss[split], label=legend)

plt.xlabel('Epoch')
plt.ylabel('loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

In [None]:
# Plot the training and validation accuracy for every epoch (x axis starts at 1).
for split, legend in (('train', 'Train acc'), ('val', 'Validation acc')):
    plt.plot(range(1, EPOCH + 1), acc[split], label=legend)

plt.xlabel('Epoch')
plt.ylabel('acc')
plt.title('Training and Validation acc')
plt.legend()
plt.show()

In [None]:
# Evaluate the trained model on the test loader and print the report.
Test(model, short_term_test_DL, criterion)

# Model save

In [None]:
# Save the entire model object (not only its state_dict).
torch.save(model, '/content/drive/MyDrive/Colab Notebooks/results/short_term_model.pt')

# # Load the entire model later.
# # NOTE(review): the example path below differs from the path used for saving above.
# # NOTE(review): recent PyTorch releases may require weights_only=False to load a
# # whole pickled model -- confirm against the installed version.
# model = torch.load('model_complete.pth')
# model.eval()  # switch to inference mode