In [1]:
##rom google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = os.getcwd()
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

- https://pandas-datareader.readthedocs.io/en/latest/remote_data.html#remote-data-naver

In [2]:
import pandas_datareader.data as web
df = web.DataReader('005930', 'naver', start='2022-01-01', end='2022-12-31')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-03,79400,79800,78200,78600,13502112
2022-01-04,78800,79200,78300,78700,12427416
2022-01-05,78800,79000,76400,77400,25470640
2022-01-06,76700,77600,76600,76900,12931954
2022-01-07,78100,78400,77400,78300,15163757
...,...,...,...,...,...
2022-12-23,58200,58400,57700,58100,9829407
2022-12-26,58000,58100,57700,57900,6756411
2022-12-27,58000,58400,57900,58100,10667027
2022-12-28,57600,57600,56400,56600,14665410


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 246 entries, 2022-01-03 to 2022-12-29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Open    246 non-null    object
 1   High    246 non-null    object
 2   Low     246 non-null    object
 3   Close   246 non-null    object
 4   Volume  246 non-null    object
dtypes: object(5)
memory usage: 11.5+ KB


In [4]:
df = df.astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 246 entries, 2022-01-03 to 2022-12-29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Open    246 non-null    int32
 1   High    246 non-null    int32
 2   Low     246 non-null    int32
 3   Close   246 non-null    int32
 4   Volume  246 non-null    int32
dtypes: int32(5)
memory usage: 6.7 KB


In [5]:
df.head(50)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-03,79400,79800,78200,78600,13502112
2022-01-04,78800,79200,78300,78700,12427416
2022-01-05,78800,79000,76400,77400,25470640
2022-01-06,76700,77600,76600,76900,12931954
2022-01-07,78100,78400,77400,78300,15163757
2022-01-10,78100,78100,77100,78000,9947422
2022-01-11,78400,79000,78000,78900,13221123
2022-01-12,79500,79600,78600,78900,11000502
2022-01-13,79300,79300,77900,77900,13889401
2022-01-14,77700,78100,77100,77300,10096725


- numpy 배열로 변경

In [6]:
data = df.to_numpy()
data.shape

(246, 5)

- minmax 스케일링을 하기위해 통계치 추출

In [7]:
mins = data.min(axis=0)
sizes = data.max(axis=0) - mins

In [8]:
mins

array([  52300,   53600,   51800,   52600, 5767902])

In [9]:
sizes

array([   27200,    26200,    26800,    26300, 32175455])

# 데이터셋 생성하는 함수

In [10]:
def transform_data(data, mins, sizes, seq_len = 10, pred_len = 5):
    # 스케일링
    data = (data - mins) / sizes

    x_list = []
    y_list = []
    # 반복문 돌려서 데이터셋 생성
    for i in range(seq_len, data.shape[0]+1-pred_len):
        x = data[i-seq_len:i] # 0:10, 1:11, 2:12 .....
        y = data[i-1:i+pred_len, 3] # 9:15, 10:16, ....

        x_list.append(x)
        y_list.append(y)

    return np.array(x_list), np.array(y_list)

In [11]:
seq_len = 10
pred_len = 5
x_arr, y_arr = transform_data(data, mins, sizes, seq_len = seq_len, pred_len = pred_len)
x_arr.shape, y_arr.shape

((232, 10, 5), (232, 6))

In [12]:
y_arr[:,1:].shape # 손실 계산할 때 sos에 해당되는 실제 이전일 값 제외하고 계산!!

(232, 5)

```
실제 예측시
[
    [실제이전일값, 0, 0, 0, 0, 0 ],
    [실제이전일값, 0, 0, 0 , 0, 0 ],
    .....
]
```

# 데이터셋 클래스

In [13]:
class FinanceDataset(torch.utils.data.Dataset):
    def __init__(self,x , y):
        self.x = x
        self.y = y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        item = {}
        item["x"] = torch.Tensor(self.x[idx])
        item["y"] = torch.Tensor(self.y[idx])
        return item

In [14]:
dt = FinanceDataset(x_arr, y_arr)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[[0.9963, 1.0000, 0.9851, 0.9886, 0.2404],
          [0.9743, 0.9771, 0.9888, 0.9924, 0.2070],
          [0.9743, 0.9695, 0.9179, 0.9430, 0.6124],
          [0.8971, 0.9160, 0.9254, 0.9240, 0.2227],
          [0.9485, 0.9466, 0.9552, 0.9772, 0.2920],
          [0.9485, 0.9351, 0.9440, 0.9658, 0.1299],
          [0.9596, 0.9695, 0.9776, 1.0000, 0.2316],
          [1.0000, 0.9924, 1.0000, 1.0000, 0.1626],
          [0.9926, 0.9809, 0.9739, 0.9620, 0.2524],
          [0.9338, 0.9351, 0.9440, 0.9392, 0.1345]],
 
         [[0.9743, 0.9771, 0.9888, 0.9924, 0.2070],
          [0.9743, 0.9695, 0.9179, 0.9430, 0.6124],
          [0.8971, 0.9160, 0.9254, 0.9240, 0.2227],
          [0.9485, 0.9466, 0.9552, 0.9772, 0.2920],
          [0.9485, 0.9351, 0.9440, 0.9658, 0.1299],
          [0.9596, 0.9695, 0.9776, 1.0000, 0.2316],
          [1.0000, 0.9924, 1.0000, 1.0000, 0.1626],
          [0.9926, 0.9809, 0.9739, 0.9620, 0.2524],
          [0.9338, 0.9351, 0.9440, 0.9392, 0.1345],
    

In [15]:
batch["y"][:,0].view(-1,1,1).shape

torch.Size([2, 1, 1])

# Seq2Seq 모델 클래스 만들기

In [16]:
class Net(torch.nn.Module):
    def __init__(self,
                 input_size_enc,
                 input_size_dec,
                 hidden_size,
                 device="cpu",
                 is_attention=False):
        super().__init__()

        self.encoder = torch.nn.LSTM(input_size_enc, hidden_size, batch_first=True)
        self.dec_rnn = torch.nn.LSTM(input_size_dec, hidden_size, batch_first=True)
        self.attn_key_layer = torch.nn.Linear(hidden_size, hidden_size) # 어텐션 키 생성 레이어

        if is_attention:
            hidden_size *= 2

        self.fc_out = torch.nn.Linear(hidden_size, 1)

        self.device = device
        self.is_attention = is_attention


    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: batch, seq, feature
        # trg: batch, seq
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        prediction = torch.zeros(batch_size, trg_len).to(self.device) # batch, seq

        enc_outputs, (hn, cn) = self.encoder(src)
        dec_input = trg[:, 0].view(-1,1,1) # batch -> batch, 1, 1
        
        for t in range(1, trg_len):
            pred, hn, cn = self.decoder(dec_input, enc_outputs, hn, cn)
            prediction[:, t] = pred.view(-1) # batch, 1 -> batch x 1 형태의 벡터로 변경해서 전달

            dec_input = pred.view(-1,1,1) # batch, 1 -> batch, 1, 1
            if random.random() < teacher_forcing_ratio: # 교사강요
                dec_input = trg[:,t].view(-1,1,1) # batch,1,1

        return prediction[:,1:] # 예측해야하는 길이를 슬라이싱하여 리턴
    def decoder(self,x, enc_outputs ,hn, cn):
        # x: batch, 1, 1
        # hn, cn: nlayer, batch, feature
        outputs, (hn,cn) = self.dec_rnn(x, (hn, cn) ) # outputs: batch, seq, features
        x = hn[-1] # batch, features

        if self.is_attention:
            attn_values = self.attention(enc_outputs, hn)
            x = torch.cat([x, attn_values], dim=1)

        pred = self.fc_out(x) # batch, 종가(1)

        return pred, hn, cn


    def attention(self, enc_outputs, hn):
        # 어텐션 키 생성
        # attn_key는 enc_outputs와 shape 같다!
        attn_key = self.attn_key_layer(enc_outputs) # batch, seq, feature

        # batch, seq, feature-> seq, batch , feature-> seq, batch x features
        attn_key = attn_key.permute(1,0,2).flatten(1)

        # nlayer,batch, features -> batch, features -> batch x features, 1
        attn_query = hn[-1].view(-1,1)

        attn_scores = torch.matmul(attn_key, attn_query) # seq, 1
        attn_scores = torch.nn.functional.softmax(attn_scores, dim=0)

        # seq, 1 -> 1, 1, seq -> batch, 1, seq
        attn_scores = attn_scores.view(1,1,-1).repeat(enc_outputs.shape[0],1,1)
        attn_values = torch.bmm(attn_scores, enc_outputs) # batch, 1 ,features

        return attn_values[:,-1] # batch, features

In [17]:
model = Net(x_arr.shape[-1], 1, 64, is_attention=True)
model(batch["x"], batch["y"], teacher_forcing_ratio=0.5)

tensor([[0.0175, 0.0156, 0.0170, 0.0152, 0.0145],
        [0.0172, 0.0155, 0.0170, 0.0153, 0.0146]], grad_fn=<SliceBackward0>)

In [18]:
batch["y"].shape

torch.Size([2, 6])

In [19]:
model

Net(
  (encoder): LSTM(5, 64, batch_first=True)
  (dec_rnn): LSTM(1, 64, batch_first=True)
  (attn_key_layer): Linear(in_features=64, out_features=64, bias=True)
  (fc_out): Linear(in_features=128, out_features=1, bias=True)
)

# 학습 loop 함수

In [20]:
def train_loop(dataloader, model, loss_fn, optimizer, device, teacher_forcing_ratio):
    epoch_loss = 0
    model.train() # 학습 모드
    
    for batch in dataloader:
        x = batch["x"].to(device)
        y = batch["y"].to(device) # batch, seq(6)
        
        pred = model( x, y, teacher_forcing_ratio) # pred : batch, seq(5)
        loss = loss_fn( pred, y[:,1:] )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

# 검증 or 테스트 loop 함수

In [21]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    epoch_loss = 0
    pred_list = []
    model.eval() # 평가 모드

    for batch in dataloader:
        x = batch["x"].to(device)
        y = batch["y"].to(device)
        
        pred = model( x,y,0 ) # 교사강요 안하기 위해 세번째 인수로 0 전달
        loss = loss_fn( pred, y[:,1:] )
        epoch_loss += loss.item()

        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

# 하이퍼 파라미터 정의

In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
n_splits = 5
batch_size = 32 # 배치 사이즈
loss_fn = torch.nn.MSELoss() # 손실 객체
epochs = 500 # 최대 가능한 에폭수
input_size_enc = x_arr.shape[-1] # 인코디 입력 피처개수
input_size_dec = 1 # 디코더 입력 피처개수
hidden_size = 32 # rnn layer의 출력 피처개수
is_attention = True
teacher_forcing_ratio = 0.5 # 교사강요 학습

cv = KFold(n_splits, shuffle=True, random_state=SEED)

# 학습

In [24]:
is_holdout = False
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []
for i, (tri, vai) in enumerate( cv.split(x_arr) ):
    # 학습용 데이터로더 객체
    train_dt = FinanceDataset(x_arr[tri], y_arr[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = FinanceDataset(x_arr[vai], y_arr[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(input_size_enc, input_size_dec, hidden_size, device, is_attention).to(device)
    optimizer = torch.optim.Adam( model.parameters() )

    best_score = np.inf # 현재 최고 점수
    patience = 0 # 조기 종료 조건을 주기 위한 변수
    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device, teacher_forcing_ratio)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        pred = pred * sizes[3] + mins[3] # batch, seq(5)
        true = y_arr[vai] * sizes[3] + mins[3] # batch, seq(6)
        score = mean_absolute_error(true[:,1:], pred)

        # print(train_loss, valid_loss, score)
        if score < best_score:
            best_score = score # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"model_{i}.pth") # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 100:
            break

    print(f"{i}번째 폴드 최고 MAE: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break

  0%|          | 0/500 [00:00<?, ?it/s]

0번째 폴드 최고 MAE: 963.5095079787236


  0%|          | 0/500 [00:00<?, ?it/s]

1번째 폴드 최고 MAE: 1143.696126994681


  0%|          | 0/500 [00:00<?, ?it/s]

2번째 폴드 최고 MAE: 833.3038043478261


  0%|          | 0/500 [00:00<?, ?it/s]

3번째 폴드 최고 MAE: 946.4512058423912


  0%|          | 0/500 [00:00<?, ?it/s]

4번째 폴드 최고 MAE: 882.5005264945652


In [25]:
np.mean(best_score_list)

953.8922343316375

# 테스트셋
- 3개월동안 매일매일 5일씩 예측

In [72]:
test_df = web.DataReader('005930', 'naver', start='2023-01-01', end='2023-03-31')
test_df = test_df.astype(int)
test_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-02,55500,56100,55200,55500,10031448
2023-01-03,55400,56000,54500,55400,13547030
2023-01-04,55700,58000,55600,57800,20188071
2023-01-05,58200,58800,57600,58200,15682826
2023-01-06,58300,59400,57900,59000,17334989
...,...,...,...,...,...
2023-03-27,62600,62800,62000,62100,11039331
2023-03-28,62400,62900,62100,62900,11614118
2023-03-29,62500,62700,62200,62700,11216008
2023-03-30,63700,63700,63100,63200,15684377


In [73]:
test_df = pd.concat([ df.iloc[-seq_len:], test_df ])
test_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-12-16,58300,59500,58300,59500,13033596
2022-12-19,59500,59900,59100,59500,7696187
2022-12-20,59000,59100,58500,58600,9284761
2022-12-21,58700,59100,58000,58000,10356971
2022-12-22,58100,59100,58100,59100,10720630
...,...,...,...,...,...
2023-03-27,62600,62800,62000,62100,11039331
2023-03-28,62400,62900,62100,62900,11614118
2023-03-29,62500,62700,62200,62700,11216008
2023-03-30,63700,63700,63100,63200,15684377


In [77]:
test_data = test_df.to_numpy()
x_test, y_test = transform_data(test_data, mins, sizes, seq_len, pred_len)
x_test.shape, y_test.shape

((58, 10, 5), (58, 6))

In [80]:
test_target = np.zeros_like(y_test)
test_target.shape

(58, 6)

In [82]:
test_target[:, 0] = y_test[:, 0]
test_target

array([[0.1026616 , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.11026616, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.10646388, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.19771863, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.21292776, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.24334601, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.30798479, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.29657795, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.30038023, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.30038023, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.31178707, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.32319392, 0

In [85]:
test_dt = FinanceDataset(x_test, test_target)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=batch_size, shuffle=False)

In [88]:
next(iter(test_dl))["y"]

tensor([[0.1027, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1103, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1065, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1977, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2129, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2433, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3080, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2966, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3004, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3004, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3118, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3232, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3194, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2966, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3384, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3498, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4106, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.429

In [96]:
pred_list = []
for i in range(n_splits):
    model = Net(input_size_enc, input_size_dec, hidden_size, device, is_attention)
    state_dict = torch.load(f"model_{i}.pth")
    model.load_state_dict(state_dict)
    _, pred = test_loop(test_dl, model, loss_fn, device)
    pred_list.append(pred)

pred = np.mean(pred_list, axis=0)
pred.shape

(58, 5)

In [101]:
true = y_test * sizes[3] + mins[3]
pred = pred * sizes[3] + mins[3]

In [104]:
true.shape

(58, 6)

In [106]:
mean_absolute_error(true[:,1:], pred)

1397.0718884698276

In [107]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(true[:,1:], pred)

0.0225843959016861