<a href="https://colab.research.google.com/github/jiwoong2/deeplearning/blob/main/Short_term_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Module

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from torch import nn, optim
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import yfinance as yf
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
import os
from torchsummary import summary
from contextlib import redirect_stdout
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Show which device string ("cuda" or "cpu") was selected for DEVICE.
print(DEVICE)

# Handle

In [None]:
LR = 1e-7 # NOTE: with "-1" (presumably a learning rate of 1e-1 — confirm) training immediately converges to a local minimum that outputs only 0 or 1.
EPOCH = 200
batch_size = 50 # A batch size that is too large for a small dataset causes problems: watch for train/validation loss inversion, a smaller loss decrease, etc.
               # A larger batch size puts more data on the GPU at once, so training gets faster.
               # Batch size seems to play a very important role; find out why.
graph_save_path = '/content/drive/MyDrive/Colab Notebooks/graph/1'

# Short term model data

In [None]:
def load_stock_data(Ticker, peri):
    """Download the price history of one ticker and return it as a NumPy array.

    Args:
        Ticker: Ticker symbol handed to ``yf.Ticker`` (e.g. ``"AAPL"``).
        peri: History length handed to ``history(period=...)`` (e.g. ``"25y"``).

    Returns:
        2-D array with one row per returned period and exactly the columns
        ``Open, High, Low, Close, Volume`` in that order.

    Raises:
        KeyError: Raised by pandas when any of the five columns is missing
            from the downloaded frame (e.g. when no data came back).
    """
    # Select the wanted columns by name instead of dropping a fixed list of
    # unwanted ones. Later code relies on the layout (column 3 as the price
    # used for labels, 5 features per row), so an unexpected extra column in
    # the download (e.g. 'Capital Gains', which yfinance can add for some
    # instruments) must not be allowed to change it.
    price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    history = yf.Ticker(Ticker).history(period=peri)
    return history[price_columns].to_numpy()

# Ticker symbols whose price history is downloaded.
tickers = ["AAPL", "MSFT", "AMZN", "GOOGL", "NVDA", "META", "NFLX", "AMD", "TSLA", "INTC", "PYPL", "QCOM", "V", "XOM", "DIS",
           "LRCX", "LULU", "AMGN", "MRNA", "BKNG", "SBUX", "ILMN", "ADBE", "PEP", "XEL", "MCHP", "TEAM", "EA", "COST", "GILD",
           "FANG", "ANSS", "CSGP", "PCAR", "KDP", "WBD", "WDAY"]

# Map every ticker symbol to its 25-year history array.
stock_data_dict = {symbol: load_stock_data(symbol, "25y") for symbol in tickers}

In [None]:
# 로그변환.
# logged_close_prices = np.log(apple[:, 0:3])

# # 원본 배열에 로그 변환된 'Close' 가격을 다시 넣음
# apple[:, 0:3] = logged_close_prices

In [None]:
def data_generator(Size:int, data, interval:int):
    """Randomly sample fixed-length windows together with a price-change class.

    For every sampled row index ``i`` the features are rows
    ``i-interval .. i-1`` (all columns). The label is derived from the
    percentage change of column 3 between row ``i+1`` and row ``i+6``:

        0: change <  -5 %
        1: -5 % <= change < 0 %
        2:  0 % <= change < 5 %
        3: change >= 5 %

    Column 3 is presumably the closing price when ``data`` comes from
    ``load_stock_data`` — confirm against the column order produced there.

    Args:
        Size: Number of row indices to draw (without replacement).
        data: 2-D array-like (rows, columns) with at least 4 columns.
        interval: Window length in rows, e.g. 20 -> each sample holds 20 rows.

    Returns:
        ``(g_data, g_label)``: a float32 array of windows and an integer array
        of class ids. Both always have the same length. Samples whose
        percentage change is not finite (NaN/inf, e.g. a missing or zero
        price) are dropped, so fewer than ``Size`` samples can be returned.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``Size`` exceeds the
            number of candidate indices (or there are no candidates).
    """
    data = np.asarray(data)

    # Candidate centre rows: a full window must fit before i and row i+6 must exist.
    candidates = np.arange(interval, len(data) - 6)
    idx = np.random.choice(candidates, size=Size, replace=False)

    start_price = data[idx + 1, 3].astype(np.float64)
    end_price = data[idx + 6, 3].astype(np.float64)
    with np.errstate(divide='ignore', invalid='ignore'):
        change = (end_price - start_price) / start_price * 100

    # A NaN change matches none of the class intervals. Drop such samples
    # (window and label together) so data and labels can never get out of step.
    usable = np.isfinite(change)
    idx = idx[usable]
    change = change[usable]

    g_data = np.array([data[i - interval:i, :] for i in idx]).astype(np.float32)

    # Bin edges [-5, 0, 5] with left-closed intervals reproduce the class table above.
    g_label = np.digitize(change, [-5, 0, 5])

    return g_data, g_label

In [None]:
# Sampled windows/labels per ticker, stored under the keys
# "<ticker>_short_term_data" and "<ticker>_short_term_label".
stock_data = {}

# Sample counts to attempt, largest first.
sample_sizes = [4000, 3000, 2000, 1000]

# Generate the samples for every ticker.
for ticker in tickers:
    # Fall back to the next smaller sample count until one succeeds.
    for size in sample_sizes:
        try:
            data, label = data_generator(size, stock_data_dict[ticker], 20)
        except ValueError as e:
            # np.random.choice raises ValueError when the history is too short
            # for `size` samples; any other exception is a real bug and is no
            # longer swallowed here.
            print(f"{ticker}: {size}개 샘플링 실패 - {e}")
            continue  # try the next sample count
        stock_data[ticker + "_short_term_data"] = data
        stock_data[ticker + "_short_term_label"] = label
        break  # samples generated, stop trying smaller counts
    else:
        # Every sample count failed: say so explicitly, because this ticker
        # ends up with no entries in stock_data.
        print(f"{ticker}: no samples generated - every sample size failed")

In [None]:
# Collect the per-ticker arrays and merge them into one data set.
short_term_data_list = []
short_term_label_list = []

for ticker in tickers:
    data_key = ticker + "_short_term_data"
    label_key = ticker + "_short_term_label"
    # A ticker has no entries when sampling failed for every sample size;
    # skip it instead of failing with a KeyError.
    if data_key not in stock_data or label_key not in stock_data:
        continue
    short_term_data_list.append(stock_data[data_key])
    short_term_label_list.append(stock_data[label_key])

short_term_data = np.concatenate(short_term_data_list, axis=0)
short_term_label = np.concatenate(short_term_label_list, axis=0)

In [None]:
# Number of samples labelled as class 0.
(short_term_label == 0).sum()

In [None]:
# Shapes of the merged arrays, then the number of class-3 samples.
for merged in (short_term_data, short_term_label):
    print(merged.shape)
print((short_term_label == 3).sum())

In [None]:
# Balance the classes by random under-sampling.
# X holds the samples (first axis = sample), y the matching class labels.
X, y = short_term_data, short_term_label

# Sample positions grouped by class label.
class_indices = dict((cls, np.flatnonzero(y == cls)) for cls in np.unique(y))

# Size of the rarest class: every class is cut down to this many samples.
min_size = min(map(len, class_indices.values()))

# Draw min_size positions per class (without replacement) and join them.
under_sampled_indices = np.concatenate(
    [np.random.choice(members, min_size, replace=False) for members in class_indices.values()]
)

# Keep only the selected samples.
short_term_data = X[under_sampled_indices]
short_term_label = y[under_sampled_indices]

In [None]:
# Data standardization.
# Stock prices grow exponentially, so every sample is standardized on its own.
def data_scaler(data):
    """Standardize each sample of ``data`` in place with its own StandardScaler.

    Args:
        data: Indexed as ``data[sample, :, :]``; assumed to be a 3-D NumPy
            array — TODO confirm for other callers.

    Returns:
        None. ``data`` itself is overwritten.
    """
    for window in data:
        # `window` is a view into `data`, so assigning into it updates the
        # caller's array.
        window[:, :] = StandardScaler().fit_transform(window)

data_scaler(short_term_data)

In [None]:
class CustomDataset(Dataset):
    """Dataset that pairs an indexable sample collection with its labels."""

    def __init__(self, data, labels):
        # Both collections are stored as given; no copy or conversion is made.
        self.data, self.labels = data, labels

    def __len__(self):
        # The size is taken from the sample collection alone.
        return len(self.data)

    def __getitem__(self, idx):
        # Index both collections with the same key so sample and label stay paired.
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the (under-sampled) samples and labels in a Dataset instance.
short_term_dataset = CustomDataset(short_term_data, short_term_label)

# Split into training, validation and test data. Splitting the Dataset
# instance keeps every sample together with its label.
# Step 1: 80 % training set, 20 % hold-out set.
train_size = int(0.8 * len(short_term_dataset))
test_size = len(short_term_dataset) - train_size

short_term_train_dataset, short_term_test_dataset = random_split(short_term_dataset, [train_size, test_size])

# Step 2: halve the hold-out set so that the test set and the validation set
# are each about 10 % of the full data. (The previous factor of 0.2 produced a
# 4 % / 16 % split, contradicting the intended 10 % / 10 %.)
test_size = int(0.5 * len(short_term_test_dataset))
val_size = len(short_term_test_dataset) - test_size

short_term_test_dataset, short_term_val_dataset = random_split(short_term_test_dataset, [test_size, val_size])

# Create the data loaders.
short_term_train_DL = DataLoader(short_term_train_dataset, batch_size=batch_size, shuffle=True)
short_term_val_DL = DataLoader(short_term_val_dataset, batch_size=batch_size, shuffle=True)
short_term_test_DL = DataLoader(short_term_test_dataset, batch_size=batch_size, shuffle=True)

# Inspect one batch coming out of the training loader.
for features, labels in short_term_train_DL:
    print(features.shape)
    print(labels)
    break

In [None]:
# Number of samples in the training, test and validation splits (in that order).
for loader in (short_term_train_DL, short_term_test_DL, short_term_val_DL):
    print(len(loader.dataset))

In [None]:
# Number of class-0 samples in the training split.
(short_term_train_DL.dataset[:][1] == 0).sum()

In [None]:
# Per-class weights (total / class count) computed from the training labels.
train_labels = short_term_train_DL.dataset[:][1]  # fetch the label array once
class_counts = [sum(train_labels == cls) for cls in range(4)]
total = sum(class_counts)
class_weights = torch.FloatTensor([total / count for count in class_counts])
print(class_weights)

# When the loss is weighted, the loss function has to be moved to the device as well.
criterion = nn.CrossEntropyLoss(weight=class_weights).to(DEVICE)

# Short term model

In [None]:
# class ShortTermModel(nn.Module):
#     def __init__(self):
#         super().__init__()

#         self.conv1 = nn.Sequential(nn.Conv2d(1, 30, 5, stride=1),
#                                    nn.BatchNorm2d(30),
#                                    nn.ReLU(),
#                                    nn.Dropout2d(0.7))

#         self.conv2 = nn.Sequential(nn.Conv2d(1, 60, kernel_size=(30, 5), stride=1),
#                                    nn.BatchNorm2d(60),
#                                    nn.ReLU(),
#                                    nn.Dropout2d(0.7))

#         self.conv3 = nn.Sequential(nn.Conv2d(1, 120, kernel_size=(60, 5), stride=1),
#                                    nn.BatchNorm2d(120),
#                                    nn.ReLU(),
#                                    nn.Dropout2d(0.7))

#         self.conv4 = nn.Sequential(nn.Conv2d(1, 240, kernel_size=(120, 5), stride=1),
#                                    nn.BatchNorm2d(240),
#                                    nn.ReLU(),
#                                    nn.Dropout2d(0.7))

#         self.linear = nn.Sequential(nn.Linear(960,100), nn.GELU(),
#                                     nn.Dropout(0.7),
#                                     nn.Linear(100,4))

#     def forward(self, x):

#         x = x.permute(0,2,1)
#         x = x.unsqueeze(1)
#         x = self.conv1(x)
#         x = x.squeeze().unsqueeze(1)
#         x = self.conv2(x)
#         x = x.squeeze().unsqueeze(1)
#         x = self.conv3(x)
#         x = x.squeeze().unsqueeze(1)
#         x = self.conv4(x)
#         x = x.squeeze().unsqueeze(1)
#         x = torch.flatten(x, start_dim=1)
#         x = self.linear(x)

#         return x

# model = ShortTermModel().to(DEVICE)
# optimizer = optim.Adam(model.parameters(), lr = LR)

In [None]:
# class ShortTermModel(nn.Module):
#     def __init__(self):
#         super().__init__()

#         self.conv1 = nn.Sequential(nn.Conv2d(1, 5, 5, stride=1),
#                                    nn.BatchNorm2d(5),
#                                    nn.GELU())

#         self.conv2 = nn.Sequential(nn.Conv2d(1, 10, kernel_size=(5, 5), stride=1),
#                                    nn.BatchNorm2d(10),
#                                    nn.GELU())

#         self.conv3 = nn.Sequential(nn.Conv2d(1, 20, kernel_size=(10, 5), stride=1),
#                                    nn.BatchNorm2d(20),
#                                    nn.GELU())

#         self.conv4 = nn.Sequential(nn.Conv2d(1, 40, kernel_size=(20, 5), stride=1),
#                                    nn.BatchNorm2d(40),
#                                    nn.GELU())

#         self.linear = nn.Sequential(nn.Linear(160,4))

#     def forward(self, x):

#         x = x.permute(0,2,1)
#         x = x.unsqueeze(1)
#         x = self.conv1(x)
#         x = x.squeeze().unsqueeze(1)
#         x = self.conv2(x)
#         x = x.squeeze().unsqueeze(1)
#         x = self.conv3(x)
#         x = x.squeeze().unsqueeze(1)
#         x = self.conv4(x)
#         x = x.squeeze().unsqueeze(1)
#         x = torch.flatten(x, start_dim=1)
#         x = self.linear(x)

#         return x

# model = ShortTermModel().to(DEVICE)
# optimizer = optim.Adam(model.parameters(), lr = LR)

In [None]:
# class RNN(nn.Module):
#     def __init__(self):
#         super().__init__()

#         self.rnn = nn.RNN(5,30,3,batch_first = True)
#         self.linear = nn.Linear(30,4)

#     def forward(self, x):
#         _, x = self.rnn(x)
#         x = x[-1,:,:]
#         # x = x.squeeze(0)
#         x = self.linear(x)
#         return x

# model = RNN().to(DEVICE)
# optimizer = optim.Adam(model.parameters(), lr = LR)

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer that emits 4 values per sample."""

    def __init__(self):
        super().__init__()

        # 5 input features, hidden size 30, 3 stacked layers, batch-first input.
        self.lstm = nn.LSTM(input_size=5, hidden_size=30, num_layers=3, batch_first=True)

        # Linear layer mapping the 30-dimensional hidden state to 4 outputs.
        self.linear = nn.Linear(in_features=30, out_features=4)

    def forward(self, x):
        # The LSTM returns the per-step outputs plus a (hidden, cell) tuple of
        # final states; only the final hidden state is used here.
        _, (hidden, _) = self.lstm(x)

        # `hidden` holds one entry per stacked layer; index -1 picks the last
        # layer, which is then passed through the linear layer.
        return self.linear(hidden[-1])


# Initial model/optimizer pair; the model is not moved to DEVICE here. The
# experiment cell further down rebinds both `model` and `optimizer`.
model = LSTM()

optimizer = optim.Adam(model.parameters(), lr= LR)

In [None]:
# for i in short_term_train_DL:
#     test_data = i[0]
#     break

# test_data.shape

In [None]:
# model = LSTM()
# a = model(test_data)
# print(a.shape)

# Trainer

In [None]:
# One epoch over a data loader.
def loss_epoch(model, DL, criterion, optimizer = None):
    """Run the model over every batch of ``DL`` once.

    Args:
        model: Module called on each input batch.
        DL: Data loader yielding ``(x_batch, y_batch)``; ``len(DL.dataset)``
            is used as the number of samples.
        criterion: Loss function called as ``criterion(y_hat, y_batch)``.
        optimizer: When given, a backward pass and an optimizer step are
            performed for every batch; when ``None`` the weights are left alone.

    Returns:
        ``(epoch_loss, accuracy_percent, n_correct)``.
    """
    n_samples = len(DL.dataset)
    update_weights = optimizer is not None
    loss_sum = 0
    n_correct = 0

    for x_batch, y_batch in DL:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

        # Forward pass and loss.
        y_hat = model(x_batch)
        loss = criterion(y_hat, y_batch)

        if update_weights:
            optimizer.zero_grad()  # reset so gradients do not accumulate
            loss.backward()
            optimizer.step()

        # Weight the batch loss by the actual batch length: the last batch can
        # be shorter than the configured batch size.
        loss_sum += loss.item() * x_batch.shape[0]

        # Count predictions that match the targets.
        n_correct += (y_hat.argmax(dim=1) == y_batch).sum().item()

    return loss_sum / n_samples, n_correct / n_samples * 100, n_correct

def Train(model, train_DL, val_DL, criterion, optimizer, EPOCH):
    """Train for ``EPOCH`` epochs, evaluating on ``val_DL`` after each one.

    Args:
        model: Module to train.
        train_DL: Loader used for the weight updates.
        val_DL: Loader used for the per-epoch evaluation (no weight updates).
        criterion: Loss function forwarded to ``loss_epoch``.
        optimizer: Optimizer forwarded to ``loss_epoch`` for the training pass.
        EPOCH: Number of epochs to run.

    Returns:
        ``(loss_history, acc_history)``: two dicts with the keys ``"train"``
        and ``"val"``, each holding one value per epoch.
    """
    loss_history = {"train": [], "val": []}
    acc_history = {"train": [], "val": []}

    for _epoch in tqdm(range(EPOCH), leave=False):

        # Training pass: train mode, weights are updated.
        model.train()
        train_loss, train_acc, _ = loss_epoch(model, train_DL, criterion, optimizer)
        loss_history["train"].append(train_loss)
        acc_history["train"].append(train_acc)

        # Validation pass: eval mode, no gradients, no optimizer.
        model.eval()
        with torch.no_grad():
            val_loss, val_acc, _ = loss_epoch(model, val_DL, criterion)
        loss_history["val"].append(val_loss)
        acc_history["val"].append(val_acc)

    return loss_history, acc_history

def Test(model, test_DL, criterion):
    """Evaluate ``model`` on ``test_DL``, print the result and return the accuracy.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        test_DL: Loader with the evaluation data.
        criterion: Loss function forwarded to ``loss_epoch``.

    Returns:
        The accuracy in percent, rounded to one decimal place.
    """
    model.eval()
    with torch.no_grad():
        test_loss, test_acc, rcorrect = loss_epoch(model, test_DL, criterion)

    report = (
        "",
        f"Test loss: {round(test_loss,5)}",
        f"Test accuracy: {rcorrect}/{len(test_DL.dataset)} ({round(test_acc,1)} %)",
    )
    for line in report:
        print(line)

    return round(test_acc,1)

def graph(loss, acc, EPOCH, batch_size, LR, graph_save_path=None):
    """Plot loss and accuracy curves side by side, then save or show the figure.

    Args:
        loss: Dict with the keys ``'train'`` and ``'val'``, one value per epoch.
        acc: Dict with the same keys holding the accuracy values.
        EPOCH: Number of epochs; the x axis runs from 1 to ``EPOCH``.
        batch_size: Shown in the plot titles only.
        LR: Shown in the plot titles only.
        graph_save_path: When truthy the figure is saved there and closed;
            otherwise it is shown.
    """
    # Two plots in one figure, arranged horizontally.
    fig, axs = plt.subplots(1, 2, figsize=(10, 4))
    epochs = range(1, EPOCH + 1)

    # (axes, history, train legend, validation legend, y label, title) per panel.
    panels = (
        (axs[0], loss, 'Train Loss', 'Validation Loss', 'Loss',
         f'LOSS GRAPH    LR:{LR}, EPOCH:{EPOCH}, Batch size:{batch_size}'),
        (axs[1], acc, 'Train acc', 'Validation acc', 'Accuracy',
         f'AC GRAPH    LR:{LR}, EPOCH:{EPOCH}, Batch size:{batch_size}'),
    )
    for ax, history, train_label, val_label, y_label, title in panels:
        ax.plot(epochs, history['train'], label=train_label)
        ax.plot(epochs, history['val'], label=val_label)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()

    # Adjust the spacing between the two plots.
    plt.tight_layout()

    if not graph_save_path:
        plt.show()
        return

    # Save the figure as an image file.
    plt.savefig(graph_save_path)
    plt.close()

def test_matrix(model, DL, matrix_save_path=None):
    """Draw the confusion matrix of ``model`` over ``DL`` and save or show it.

    Args:
        model: Module to evaluate. It is switched to eval mode but stays on
            the device it is already on.
        DL: Loader yielding ``(inputs, labels)`` batches.
        matrix_save_path: When truthy the heatmap is saved there and the
            figure is closed; otherwise it is shown.
    """
    model.eval()  # switch to evaluation mode
    y_pred = []
    y_true = []

    # Run inference on whatever device the model already lives on. Moving the
    # model to the CPU here would silently change the caller's model and break
    # later calls that feed it batches on another device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():  # no gradients needed for evaluation
        for inputs, labels in DL:
            outputs = model(inputs.to(device))
            predicted = outputs.argmax(dim=1)
            y_pred.extend(predicted.cpu().numpy())  # store predictions
            y_true.extend(labels.cpu().numpy())     # store true labels

    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.ylabel('lable')
    plt.xlabel('predict')

    # Save the plot as an image file, or show it when no path is given.
    if matrix_save_path:
        plt.savefig(matrix_save_path)
        plt.close()
    else:
        plt.show()

# Experiment

In [None]:
# Grid over learning rate, epoch count and batch size.
for LR in [5e-5]:
    for epoch in [500]:
        for batch in [150]:

            # Start every combination from a freshly initialized model.
            model = LSTM().to(DEVICE)

            # Rebuild the loaders with the batch size under test.
            short_term_train_DL = DataLoader(short_term_train_dataset, batch_size=batch, shuffle=True)
            short_term_val_DL = DataLoader(short_term_val_dataset, batch_size=batch, shuffle=True)
            short_term_test_DL = DataLoader(short_term_test_dataset, batch_size=batch, shuffle=True)

            # Per-class loss weights (total / class count) from the training labels.
            train_labels = short_term_train_DL.dataset[:][1]
            class_counts = [sum(train_labels == cls) for cls in range(4)]
            total = sum(class_counts)
            class_weights = torch.FloatTensor([total / count for count in class_counts])
            # When the loss is weighted, the loss function has to be moved to the device as well.
            criterion = nn.CrossEntropyLoss(weight=class_weights).to(DEVICE)

            # New optimizer bound to the new model's parameters.
            optimizer = optim.Adam(model.parameters(), lr=LR)

            # Training.
            loss, acc = Train(model, short_term_train_DL, short_term_val_DL, criterion, optimizer, epoch)

            # Output folder named after the hyper-parameter combination.
            folder_path = f'/content/drive/MyDrive/Colab Notebooks/graph/LSTMunder3{LR}{epoch}{batch}'
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)

            # Path for the model summary file.
            file_path = os.path.join(folder_path, 'model_summary.txt')

            # Redirect the model summary straight into the file.
            # LSTM layers cannot be summarized; the returned tuple seems to be the problem.
            # with open(file_path, 'w') as f:
            #     with redirect_stdout(f):
            #         summary(model, (30, 5))

            # Loss / accuracy curves.
            graph(loss, acc, epoch, batch, LR, f'{folder_path}/graph.png')

            # Confusion matrix on the test loader.
            test_matrix(model, short_term_test_DL, f'{folder_path}/matrix.png')

In [None]:
# Move the model (back) to DEVICE before evaluating: test_matrix moves it to
# the CPU, while Test feeds it batches placed on DEVICE.
model.to(DEVICE)
Test(model, short_term_test_DL, criterion)

In [None]:
# Share of each class (0 to 3) in the test split, in percent.
test_labels = short_term_test_DL.dataset[:][1]
for cls in range(4):
    print(sum(test_labels == cls) / test_labels.size * 100)

# Model save

In [None]:
#전체 모델 저장하기
#torch.save(model, '/content/drive/MyDrive/Colab Notebooks/results/short_term_model.pt')

# # 나중에 전체 모델 로드하기
# model = torch.load('model_complete.pth')
# model.eval()  # 추론 모드로 설정