### seq2seq with teacher forcing

In [None]:
import os
import pandas as pd             #데이터를 Dataframe으로 다루는 라이브러리
from pandas import DataFrame
import numpy as np              #데이터를 행열 또는 array 형태로 다루는 라이브러리
import random

from time import time
from datetime import datetime
import math
from copy import deepcopy

import matplotlib.pyplot as plt #데이터를 그래프 plot으로 보여주는 라이브러리
import seaborn as sns

import requests                 #네트워크 접근 라이브러리
from bs4 import BeautifulSoup   #웹사이트 접근하는 라이브러리

from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
torch.manual_seed(77)

from torch import nn as nn
from torch import optim
from torch.nn import functional as F
from torch import tensor

import warnings
warnings.filterwarnings('ignore')

In [None]:


target = '삼성전자'

# Look up the ticker code that belongs to the company name above; the code is
# later plugged into a Naver Finance (http://finance.naver.com) URL.
code_krx = pd.read_html(
    'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13',
    header=0,
    encoding='cp949',
)[0]
# Ticker codes are 6 digits wide; the zero-padding itself happens in a later step.
code_krx


In [None]:
# Zero-pad every ticker code to a fixed width of 6 characters.
code_krx['종목코드'] = code_krx['종목코드'].map('{:06d}'.format)
# Only the company name and the ticker code are needed; drop the other columns.
code_krx = code_krx[['회사명', '종목코드']]
# Keep the row(s) whose company name equals the requested target.
target_code = code_krx[code_krx['회사명'] == target]
target_code

In [None]:
# Build the Naver Finance `sise_day` page address for a ticker code.
def get_url(code):
    """Return the Naver Finance ``sise_day`` URL for ``code``.

    ``code`` is rendered as text and any leading whitespace is removed
    before it is appended as the ``code`` query parameter.
    """
    code_text = f'{code}'.lstrip()
    return 'https://finance.naver.com/item/sise_day.naver?code=' + code_text
# Take the ticker code of the first matching row and build its page URL.
code = target_code['종목코드'].iat[0]
url = get_url(code)
url

In [None]:
# Fetch page 1 of the target's Naver page and read the last page number from
# its pagination links.
url_page = url + '&page=1'
headers = {'User-agent':'Mozilla/5.0'}
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url_page, headers=headers, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
tags = soup.find_all('a')
# NOTE(review): the 12th anchor is assumed to be the "last page" link; this
# position-based lookup breaks if the page layout changes — confirm.
last_page = tags[11]["href"]
# The page number is the text that follows the second '=' in the href.
last_page = last_page.split('=')[2]
last_page
# print(f"target_code :회사명 == [{target_code['회사명'].iloc[0]}] 종목코드 == [{target_code['종목코드'].iloc[0]}] 마지막 페이지 == [{last_page}]")

In [None]:
last_page = 150  # temporary cap: only the first 150 pages are downloaded

# Download every page first and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on each
# iteration.
page_frames = []
for page in range(1, int(last_page) + 1):
    # Time out instead of hanging on a stalled connection, and fail loudly on
    # an HTTP error rather than parsing an error page.
    req = requests.get(f'{url}&page={page}', headers=headers, timeout=10)
    req.raise_for_status()
    page_frames.append(pd.read_html(req.text, encoding='euc-kr')[0])

# df_code holds the daily rows of all downloaded pages.
df_code = pd.concat(page_frames, ignore_index=True)

# Drop rows that contain missing values and renumber the index.
df_code = df_code.dropna()
df_code.reset_index(drop=True, inplace=True)

# Show the first 15 rows.
df_code.head(15)

In [None]:
# Rename the Korean column headers to English ones.
stock_data = df_code.rename(
    columns={
        '날짜': 'date',
        '종가': 'close',
        '전일비': 'diff',
        '시가': 'open',
        '고가': 'high',
        '저가': 'low',
        '거래량': 'volume',
    }
)
# Remove the direction words, thousands separators and spaces from 'diff' so
# that only the digits remain.
# NOTE(review): dropping the direction words leaves 'diff' without a sign —
# confirm that magnitudes are what the model should see.
stock_data['diff'] = (
    stock_data['diff']
    .str.replace('상승', '')
    .str.replace('하락', '')
    .str.replace('보합', '')
    .str.replace(',', '')
    .str.replace(' ', '')
    .str.strip()
)
# Cast the numeric columns to float.
stock_data[['close', 'diff', 'open', 'high', 'low', 'volume']] = (
    stock_data[['close', 'diff', 'open', 'high', 'low', 'volume']].astype(float)
)
# Parse the 'date' column into datetimes.
stock_data['date'] = pd.to_datetime(stock_data['date'])
# Sort chronologically (oldest first).
stock_data = stock_data.sort_values(by=['date'], ascending=True)
print(stock_data.shape)
stock_data.reset_index(drop=True, inplace=True)
# Show the first 15 rows.
stock_data.head(15)

In [None]:
# Work on a date-indexed copy so that stock_data itself stays untouched.
df = stock_data.copy().set_index('date')
df

In [None]:
# Draw every column in its own subplot.
df.plot(figsize=(15, 15), subplots=True)

In [None]:
# Plot the closing price and mark the rows at 79 %, 89 % and 99 % of the
# series with vertical red lines.
fig, ax = plt.subplots(1, 1, figsize=(15, 3))
ax.plot(df['close'])

# Vertical extent shared by all marker lines (computed once, not per call).
line_top = df['close'].max() + 2
line_bottom = df['close'].min() - 1

for fraction, line_style in ((0.79, '-'), (0.89, ':'), (0.99, '-')):
    ax.vlines(df.index[round(len(df) * fraction)],
              ymax=line_top,
              ymin=line_bottom,
              colors='red', linestyles=line_style)

ax.set_title(target + ' : ' + code)
ax.set_xlabel('timestemp')
ax.set_ylabel('values')
ax.grid()
# The original referenced `fig.tight_layout` without calling it, so the
# layout adjustment never ran.
fig.tight_layout()
plt.show()

In [None]:
# Split the date-ordered frame into train / validation / test parts at the
# 80 % and 90 % row positions.
df_train = df.iloc[:round(len(df) * 0.8)]
df_valid = df.iloc[round(len(df) * 0.8):round(len(df) * 0.9)]
df_test = df.iloc[round(len(df) * 0.9):]

# Report the cut position and the size of every part.
print(f" df.shape 0.8 : {round(df.shape[0]*0.8)}")
print(f" df_train.shape : {df_train.shape}")
print(f" df_valid.shape : {df_valid.shape}")
print(f" df_test.shape : {df_test.shape}")

In [None]:
# Fit the scaler on the training part only, then apply the same transform to
# all three parts. The scaled frames keep the column names but get a fresh
# default index.
scaler = StandardScaler().fit(df_train)
df_train_scaled = pd.DataFrame(scaler.transform(df_train), columns=df.columns)
df_valid_scaled = pd.DataFrame(scaler.transform(df_valid), columns=df.columns)
df_test_scaled = pd.DataFrame(scaler.transform(df_test), columns=df.columns)

In [None]:
class SequenceDataset(Dataset):
    """Sliding-window dataset over the rows of a DataFrame.

    Item ``i`` is the pair ``(seq_x, seq_y)`` where ``seq_x`` holds the
    ``features`` columns of rows ``i .. i + sequence_length - 1`` and
    ``seq_y`` holds the ``target`` column of the ``predict_length`` rows that
    immediately follow.

    Args:
        dataframe: source table; its index is kept in ``self.datetime``.
        target: name of the column to predict.
        features: column name(s) used as model input.
        sequence_length: number of input rows per window.
        predict_length: number of target rows per window.
    """

    def __init__(self, dataframe, target, features, sequence_length=5, predict_length=1):
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.predict_length = predict_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()
        self.datetime = dataframe.index

    def __len__(self):
        """Return the number of complete windows (never negative)."""
        window = self.sequence_length + self.predict_length
        # The "+ 1" keeps the last complete window, whose target slice ends
        # exactly at the final row; max() guards against frames shorter than
        # one window.
        return max(0, len(self.X) - window + 1)

    def __getitem__(self, index):
        """Return ``(seq_x, seq_y)`` for window ``index``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
                Raising here is what lets plain iteration over the dataset
                terminate.
        """
        length = len(self)
        if index < 0:
            index += length
        if not 0 <= index < length:
            raise IndexError(
                f"window index out of range for dataset of length {length}"
            )

        s_begin = index
        s_end = s_begin + self.sequence_length
        seq_x = self.X[s_begin:s_end]

        # The target window starts right after the input window.
        r_begin = s_end
        r_end = r_begin + self.predict_length
        seq_y = self.y[r_begin:r_end]
        return seq_x, seq_y

In [None]:
# Small demo configuration: 13 input rows, 1 predicted row.
target = 'close'
features = df.columns
sequence_length = 13
predict_length = 1

train_dataset = SequenceDataset(
    df_train_scaled,
    features=features,
    target=target,
    predict_length=predict_length,
    sequence_length=sequence_length,
)

# Inspect one window and its target.
i = 27
X, y = train_dataset[i]
print(X)
print(y)

In [None]:
# Inspect the following window for comparison with the previous one.
X, y = train_dataset[i + 1]
print(X, y, sep='\n')

In [None]:
# Configuration used for the actual run: 25 input rows, 7 predicted rows.
target = 'close'
features = df_train_scaled.columns
batch_size = 30
sequence_length = 25
predict_length = 7

# One windowed dataset per split, all built with the same configuration.
train_dataset = SequenceDataset(df_train_scaled, target=target, features=features,
                                sequence_length=sequence_length,
                                predict_length=predict_length)
valid_dataset = SequenceDataset(df_valid_scaled, target=target, features=features,
                                sequence_length=sequence_length,
                                predict_length=predict_length)
test_dataset = SequenceDataset(df_test_scaled, target=target, features=features,
                               sequence_length=sequence_length,
                               predict_length=predict_length)

# Only the training batches are shuffled; the test loader yields one window
# per batch. Incomplete trailing batches are dropped everywhere.
train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, drop_last=True, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, drop_last=True, shuffle=False)

# Pull one batch to check the tensor shapes.
X, y = next(iter(train_loader))

print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that turns an input window into hidden/cell states.

    Args:
        num_features: size of the last dimension of the input.
        hidden_units: LSTM hidden size.
        predict_length: accepted for signature parity with ``Decoder``; not
            used by the encoder.
        bias: whether the LSTM uses bias weights.
        num_layers: number of stacked LSTM layers.
        drop_ratio: dropout applied between LSTM layers.
        bi_dir: whether the LSTM is bidirectional.
        device: stored on the instance; the initial states are created on
            the device of the input tensor instead.
    """

    def __init__(self,
                 num_features,
                 hidden_units,
                 predict_length,
                 bias=True,
                 num_layers=3,
                 drop_ratio=0.0,
                 bi_dir=False,
                 device='cpu'):
        super().__init__()
        self.num_features = num_features
        self.hidden_units = hidden_units
        self.bias = bias
        self.num_layers = num_layers
        self.drop_ratio = drop_ratio
        self.bi_dir = bi_dir
        self.device = device

        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_units,
            bias=bias,
            batch_first=True,
            num_layers=self.num_layers,
            dropout=drop_ratio,
            bidirectional=bi_dir
        )

    def forward(self, x):
        """Encode ``x`` (batch-first) and return ``(out, (hidden, cell))``."""
        batch_size = x.shape[0]
        # A bidirectional LSTM expects one initial state per layer *and*
        # direction; sizing the states by num_layers alone fails for
        # bi_dir=True.
        num_directions = 2 if self.bi_dir else 1
        state_shape = (self.num_layers * num_directions, batch_size, self.hidden_units)
        # The zero states are constants, so they do not need gradients; they
        # are created directly on the input's device and dtype.
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, (hidden_state, cell_state) = self.lstm(x, (h0, c0))

        return out, (hidden_state, cell_state)

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that emits one scalar per step, with teacher forcing.

    Each step feeds a single-timestep input through the LSTM and projects the
    result to one value. The next step's input is the previous input with the
    target column replaced by either the ground-truth value (teacher forcing)
    or the value just predicted; all other feature columns keep their last
    known values.

    Args:
        num_features: size of the feature vector fed to the LSTM.
        hidden_units: LSTM hidden size.
        predict_length: stored by callers for bookkeeping; not used here (the
            number of steps is taken from ``targets``).
        bias: whether the LSTM and the linear layer use bias weights.
        num_layers: number of stacked LSTM layers.
        drop_ratio: dropout applied between LSTM layers.
        bi_dir: whether the LSTM is bidirectional.
        device: stored on the instance.
        target_index: position of the predicted column inside the feature
            vector. NOTE(review): defaults to 0, which assumes the target is
            the first feature column — confirm against the feature order.
    """

    def __init__(self,
                 num_features,
                 hidden_units,
                 predict_length,
                 bias=True,
                 num_layers=3,
                 drop_ratio=0.0,
                 bi_dir=False,
                 device='cpu',
                 target_index=0):
        super().__init__()
        self.num_features = num_features
        self.hidden_units = hidden_units
        self.bias = bias
        self.num_layers = num_layers
        self.drop_ratio = drop_ratio
        self.bi_dir = bi_dir
        self.device = device
        self.target_index = target_index

        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_units,
            bias=bias,
            batch_first=True,
            num_layers=self.num_layers,
            dropout=drop_ratio,
            bidirectional=bi_dir
        )
        # A bidirectional LSTM emits 2 * hidden_units values per step.
        num_directions = 2 if bi_dir else 1
        self.linear = nn.Linear(in_features=self.hidden_units * num_directions,
                                out_features=1, bias=bias)

    def forward(self, initial_input, encoder_outputs, hidden, targets,
                teacher_force_probability):
        """Decode ``targets.shape[1]`` steps and return the predictions.

        Args:
            initial_input: first decoder input, one feature vector per batch
                row (it is given a length-1 time axis here).
            encoder_outputs: accepted for interface compatibility; not used.
            hidden: ``(h, c)`` state tuple used to start the LSTM.
            targets: ground-truth values, one column per decoding step.
            teacher_force_probability: per-step probability of feeding the
                ground-truth value instead of the prediction to the next step.

        Returns:
            Tensor with the same shape as ``targets`` holding the predictions.
        """
        decoder_sequence_length = targets.shape[1]

        # Set the first input once, *before* the loop. Re-deriving it from
        # initial_input on every step discards the fed-back value, which
        # makes teacher forcing a no-op.
        input_at_t = initial_input.unsqueeze(1)

        # Collect per-step predictions in a list instead of writing into a
        # preallocated tensor: the predictions are fed back as inputs, and
        # in-place writes would interfere with autograd.
        step_outputs = []
        for t in range(decoder_sequence_length):
            output, hidden = self.lstm(input_at_t, hidden)

            # Explicit indexing (instead of squeeze()) keeps the batch axis
            # even when the batch holds a single row.
            prediction = self.linear(output)[:, 0, 0]
            step_outputs.append(prediction)

            # The next step sees either this step's target or its output.
            teacher_force = random.random() < teacher_force_probability
            next_value = targets[:, t] if teacher_force else prediction

            # Clone so the tensor already consumed by the LSTM is untouched.
            input_at_t = input_at_t.clone()
            input_at_t[:, 0, self.target_index] = next_value

        if not step_outputs:
            return torch.zeros_like(targets)
        return torch.stack(step_outputs, dim=1)

In [None]:
class Seq2Seq(nn.Module):
    """Chain an encoder and a decoder into one module."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, encoder_inputs, targets, teacher_force_probability):
        """Encode the inputs, then decode starting from the last input step.

        The decoder receives the final time step of ``encoder_inputs``, the
        encoder outputs and state, ``targets`` and
        ``teacher_force_probability``; its result is returned unchanged.
        """
        encoder_outputs, hidden = self.encoder(encoder_inputs)
        last_step = encoder_inputs[:, -1, :]
        return self.decoder(last_step, encoder_outputs, hidden, targets,
                            teacher_force_probability)

In [None]:
# Use the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f" 🤗 is_available device : \n\t{device}")

# Training hyper-parameters.
epochs = 30

learning_rate = 0.03
num_hidden_units = 124
hidden_size = 256
teacher_force_probability = 0.5

# Encoder and decoder are built with the same settings.
seq2seq = Seq2Seq(
    Encoder(num_features=len(features),
            hidden_units=num_hidden_units,
            predict_length=predict_length,
            num_layers=3,
            bias=True,
            bi_dir=False,
            drop_ratio=0.0,
            device=device),
    Decoder(num_features=len(features),
            hidden_units=num_hidden_units,
            predict_length=predict_length,
            num_layers=3,
            bias=True,
            bi_dir=False,
            drop_ratio=0.0,
            device=device),
)

# Mean squared error loss, optimised with Adam.
loss_function = nn.MSELoss()

optimizer = optim.Adam(seq2seq.parameters(), lr=learning_rate)

print(f"\n 🤩 loss_function : \n\t{loss_function}\n")
print(f" 🤓 optimizer : \n\t{optimizer}")

In [None]:
def train_model(data_loader, model, loss_function, optimizer, device, teacher_force_probability):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    The model is moved to ``device`` and put in training mode. Every batch is
    run through ``model(X, y, teacher_force_probability)``, followed by one
    optimizer step. The mean loss is also printed.
    """
    num_batches = len(data_loader)
    model.to(device)
    model.train()

    running_loss = 0
    for inputs, expected in data_loader:
        inputs, expected = inputs.to(device), expected.to(device)

        prediction = model(inputs, expected, teacher_force_probability)
        batch_loss = loss_function(prediction, expected)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    avg_loss = running_loss / num_batches
    print(f"Train loss: {avg_loss}")
    return avg_loss

def eval_model(data_loader, model, loss_function, device):
    """Return the mean batch loss of ``model`` over ``data_loader``.

    The model is moved to ``device`` and put in evaluation mode. Batches are
    processed without gradient tracking and with teacher forcing turned off
    (probability 0). The mean loss is also printed.
    """
    num_batches = len(data_loader)
    model.to(device)

    # Evaluation never feeds ground truth back into the decoder.
    teacher_force_probability = 0

    model.eval()
    batch_losses = []
    with torch.no_grad():
        for inputs, expected in data_loader:
            inputs, expected = inputs.to(device), expected.to(device)
            prediction = model(inputs, expected, teacher_force_probability)
            batch_losses.append(loss_function(prediction, expected).item())

    avg_loss = sum(batch_losses) / num_batches
    print(f"eval loss: {avg_loss}")
    return avg_loss


def infer_model(data_loader, model, device):
    """Run ``model`` over ``data_loader`` and collect predictions and labels.

    The model is moved to ``device`` and put in evaluation mode. Batches are
    processed without gradient tracking and with teacher forcing turned off.

    Returns:
        ``(predict, label)``: two lists with one entry per sample, each entry
        being the list of values of that sample's output / target row.
    """
    predict = []
    label = []
    model.to(device)

    teacher_force_probability = 0.0

    model.eval()
    with torch.no_grad():
        for X, y in data_loader:
            X = X.to(device)
            y = y.to(device)
            output = model(X, y, teacher_force_probability)

            # Keep every row of the batch. Taking only row 0 silently drops
            # samples as soon as the loader uses batch_size > 1; for
            # batch_size == 1 the result is unchanged.
            predict.extend(list(row) for row in output.to('cpu').numpy())
            label.extend(list(row) for row in y.to('cpu').numpy())

    return predict, label


# def forecasting (data, model, device, predict_length):
#     model.to(device)
#     model.eval()
#     with torch.no_grad():
#             X = torch.from_numpy(data).to(torch.float32).to(device)
#             X = X.unsqueeze(0)
#             output = model.forecast(X, predict_length)
#             forecast = output.to('cpu').numpy()
#             forecast = np.squeeze(forecast[0])
        
    # return forecast


In [None]:
# Baseline: validation loss before any training.
print("Untrained evaluate\n--------")
eval_model(valid_loader, seq2seq, loss_function, device)
print()

# Per-epoch history (epoch numbers are stored 1-based).
epoch, train_loss_list, eval_loss_list = [], [], []

for ix_epoch in range(epochs):
    print(f"Epoch {ix_epoch}\n---------")
    train_loss = train_model(train_loader, seq2seq, loss_function,
                             optimizer, device, teacher_force_probability)
    eval_loss = eval_model(valid_loader, seq2seq, loss_function, device)
    print()

    epoch.append(ix_epoch + 1)
    train_loss_list.append(train_loss)
    eval_loss_list.append(eval_loss)

# Collect the history into one table.
result = pd.DataFrame({
    'epoch': epoch,
    'train_loss': train_loss_list,
    'eval': eval_loss_list,
})
result

In [None]:
# Plot the training and validation loss curves against the epoch number.
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
epoch_axis = result['epoch']
ax.plot(epoch_axis, result['train_loss'], label='train_loss', color='blue')
ax.plot(epoch_axis, result['eval'], label='eval_loss', color='green')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Collect predictions and ground truth for the test split.
pred, gt = infer_model(data_loader=test_loader, model=seq2seq, device=device)

In [None]:
# Save the trained model's weights.
PATH = "/home/augustine77/mylab/sim/sim/02_DLnote/results/LSTM_samsumg.pth"
# The trained network in this notebook is `seq2seq`; no `model` variable
# exists at notebook scope, so the original call raised NameError.
torch.save(seq2seq.state_dict(), PATH)

In [None]:
# Load the saved weights back into the network.
# The network in this notebook is `seq2seq` (no `model` variable exists at
# notebook scope). map_location lets a checkpoint written on a GPU machine
# load on the current device.
seq2seq.load_state_dict(torch.load(PATH, map_location=device))

In [None]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error