In [3]:
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [4]:
import pandas_datareader.data as web
df = web.DataReader('005930', 'naver', start='2023-01-01', end='2023-12-31')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-02,55500,56100,55200,55500,10031448
2023-01-03,55400,56000,54500,55400,13547030
2023-01-04,55700,58000,55600,57800,20188071
2023-01-05,58200,58800,57600,58200,15682826
2023-01-06,58300,59400,57900,59000,17334989


In [5]:
df = df.astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 245 entries, 2023-01-02 to 2023-12-28
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Open    245 non-null    int32
 1   High    245 non-null    int32
 2   Low     245 non-null    int32
 3   Close   245 non-null    int32
 4   Volume  245 non-null    int32
dtypes: int32(5)
memory usage: 6.7 KB


In [6]:
data = df.to_numpy()
data

array([[   55500,    56100,    55200,    55500, 10031448],
       [   55400,    56000,    54500,    55400, 13547030],
       [   55700,    58000,    55600,    57800, 20188071],
       ...,
       [   76100,    76700,    75700,    76600, 13164909],
       [   76700,    78000,    76500,    78000, 20651042],
       [   77700,    78500,    77500,    78500, 17797536]])

In [7]:
mins = data.min(axis=0)
sizes = data.max(axis=0) - mins

(data - mins) / sizes

array([[0.0044843 , 0.00444444, 0.03043478, 0.004329  , 0.17389595],
       [0.        , 0.        , 0.        , 0.        , 0.31921842],
       [0.01345291, 0.08888889, 0.04782609, 0.1038961 , 0.59373696],
       ...,
       [0.92825112, 0.92      , 0.92173913, 0.91774892, 0.3034228 ],
       [0.95515695, 0.97777778, 0.95652174, 0.97835498, 0.61287465],
       [1.        , 1.        , 1.        , 1.        , 0.4949202 ]])

In [8]:
def transform_data(data, mins, sizes, seq_len = 10, pred_len = 5):
    data = (data - mins) / sizes
    x_list = [] # src
    y_list = [] # trg

    for i in range(seq_len, data.shape[0]+1-pred_len):
        x = data[i-seq_len:i]
        y = data[i-1:i+pred_len, 3] # i-1: sos 토큰
        x_list.append(x)
        y_list.append(y)

    x_arr = np.array(x_list)
    y_arr = np.array(y_list)
    return x_arr, y_arr

In [9]:
seq_len = 10
pred_len = 5
x_arr , y_arr = transform_data(data, mins, sizes, seq_len, pred_len)
x_arr.shape, y_arr.shape

((231, 10, 5), (231, 6))

# 데이터셋 클래스

```
[이전일 실제값(sos), 0, 0, 0, 0, 0]
[이전일 실제값(sos), 0, 0, 0, 0, 0]
[이전일 실제값(sos), 0, 0, 0, 0, 0]
...
```

In [10]:
class FinanceDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, i):
        return {
            "x": torch.Tensor(self.x[i]),
            "y": torch.Tensor(self.y[i])
        }

In [11]:
dataset = FinanceDataset(x_arr, y_arr)
dataloader = torch.utils.data.DataLoader(dataset, 2)
batch = next(iter(dataloader))
batch # x shape: batch, seq, feature / y shape: batch, seq

{'x': tensor([[[0.0045, 0.0044, 0.0304, 0.0043, 0.1739],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.3192],
          [0.0135, 0.0889, 0.0478, 0.1039, 0.5937],
          [0.1256, 0.1244, 0.1348, 0.1212, 0.4075],
          [0.1300, 0.1511, 0.1478, 0.1558, 0.4758],
          [0.1928, 0.2089, 0.2217, 0.2294, 0.5297],
          [0.2152, 0.2267, 0.2348, 0.2165, 0.3735],
          [0.2511, 0.2311, 0.2522, 0.2208, 0.2681],
          [0.2556, 0.2311, 0.2348, 0.2208, 0.4249],
          [0.2287, 0.2311, 0.2565, 0.2338, 0.2764]],
 
         [[0.0000, 0.0000, 0.0000, 0.0000, 0.3192],
          [0.0135, 0.0889, 0.0478, 0.1039, 0.5937],
          [0.1256, 0.1244, 0.1348, 0.1212, 0.4075],
          [0.1300, 0.1511, 0.1478, 0.1558, 0.4758],
          [0.1928, 0.2089, 0.2217, 0.2294, 0.5297],
          [0.2152, 0.2267, 0.2348, 0.2165, 0.3735],
          [0.2511, 0.2311, 0.2522, 0.2208, 0.2681],
          [0.2556, 0.2311, 0.2348, 0.2208, 0.4249],
          [0.2287, 0.2311, 0.2565, 0.2338, 0.2764],
    

# Seq2Seq

In [12]:
class Net(torch.nn.Module):
    def __init__(self, enc_feature_size, dec_feature_size, hidden_size, device="cpu"):
        super().__init__()
        self.encoder = torch.nn.LSTM(enc_feature_size, hidden_size, batch_first=True)
        self.dec_rnn = torch.nn.LSTM(dec_feature_size, hidden_size, batch_first=True)
        self.fc_out = torch.nn.Linear(hidden_size * 2, 1)
        self.attn_key_layer = torch.nn.Linear(hidden_size, hidden_size)
        self.device = device

    def decoder(self, x, enc_outputs, hn, cn): # input으로 들어오는 y shape: batch, seq(1), feature(1)
        _, (hn, cn) = self.dec_rnn(x, (hn, cn)) # hn: nlayer(1), batch, feature

        # enc_outputs: batch, seq, feature
        attn_key = self.attn_key_layer(enc_outputs)
        attn_key = attn_key.permute(1, 0, 2).flatten(1) # seq, batch * feature
        attn_query = hn.view(-1, 1) # batch * feature, 1

        # 어텐션 스코어
        # seq, batch * feature @ batch * feature, 1 = seq, 1
        attn_scores = torch.matmul(attn_key, attn_query)
        attn_scores = torch.nn.functional.softmax(attn_scores, 0)

        # 어텐션 밸류(컨텍스트 벡터)를 얻기 위해 배치 단위 행렬 연산 수행
        # batch, 1, seq @ batch, seq, feature= batch, 1, feature
        # attn_scores: seq, 1 -> 1, 1, seq -> batch, 1 ,seq
        attn_scores = attn_scores.view(1, 1, -1).repeat(enc_outputs.shape[0], 1, 1)
        attn_values = torch.bmm(attn_scores, enc_outputs)
        x = torch.cat([hn[-1], attn_values[:, -1]], 1)

        pred = self.fc_out(x) # batch, feature 형태를 입력으로 전달
        return pred, hn, cn

    def forward(self, src, trg, teacher_forcing_ratio=0.5): # src shape: batch, seq, feature / trg shape: batch, seq
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        prediction = torch.zeros(batch_size, trg_len).to(self.device)

        enc_outputs, (hn, cn) = self.encoder(src)

        # batch, seq -> batch, seq(1), feature(1)
        dec_input = trg[:, 0].view(-1, 1, 1)
        for t in range(1, trg_len):
            pred, hn, cn = self.decoder(dec_input, enc_outputs, hn ,cn) # pred shape: batch, seq(1)
            prediction[:, t] = pred.view(-1)

            # batch, seq(1), feature(1)
            dec_input = trg[:, t].view(-1, 1, 1) if random.random() < teacher_forcing_ratio else pred.view(-1, 1, 1)

        return prediction[:, 1:]

In [13]:
model = Net(5, 1, 64)
model(batch["x"], batch["y"])

tensor([[0.0729, 0.0616, 0.0556, 0.0519, 0.0494],
        [0.0723, 0.0611, 0.0551, 0.0514, 0.0489]], grad_fn=<SliceBackward0>)

In [14]:
def train_loop(dataloader, model, loss_fn, optimizer, device, teacher_forcing_ratio):
    model.train() # 학습 모드로 전환
    epoch_loss = 0

    for batch in dataloader: # 배치단위로 데이터 반환
        src, trg = batch["x"].to(device), batch["y"].to(device)
        pred = model(src, trg, teacher_forcing_ratio) # 예측
        loss = loss_fn(pred, trg[:, 1:]) # 손실 계산

        optimizer.zero_grad() # 이전 경사 누적을 방지하기 위해 0으로 초기화
        loss.backward() # 역전파
        optimizer.step() # 가중치 업데이트

        epoch_loss += loss.item() # 에폭 loss 를 계산하기 위해 배치 loss 합

    epoch_loss /= len(dataloader) # 에폭 loss 구함

    return epoch_loss

In [15]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    model.eval() # 평가 모드
    epoch_loss = 0
    pred_list = []

    for batch in dataloader:
        src, trg = batch["x"].to(device), batch["y"].to(device)
        pred = model(src, trg, 0)

        loss = loss_fn(pred, trg[:, 1:])
        epoch_loss += loss.item()

        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

In [16]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

n_splits = 5
hidden_size = 32
batch_size = 32
epochs = 1000
cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
device = "cuda" if torch.cuda.is_available() else "cpu"
loss_function = torch.nn.MSELoss()
enc_feature_size = x_arr.shape[-1]
dec_feature_size = 1
teacher_forcing_ratio = 0.5

In [17]:
is_holdout = False
reset_seeds(42)
score_list = []

for i, (tri, vai) in enumerate(cv.split(x_arr)):
    # 학습데이터
    train_dataset = FinanceDataset(x_arr[tri], y_arr[tri])
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # 검증데이터
    valid_dataset = FinanceDataset(x_arr[vai], y_arr[vai])
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    model = Net(enc_feature_size, dec_feature_size, hidden_size, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    best_score = np.inf
    patience = 0
    for _ in tqdm(range(epochs)):
        train_loss = train_loop(train_dataloader, model, loss_function, optimizer, device, teacher_forcing_ratio)
        valid_loss, pred = test_loop(valid_dataloader, model, loss_function, device)

        pred = pred * sizes[3] + mins[3]
        true = y_arr[vai, 1:] * sizes[3] + mins[3]
        score = mean_absolute_error(true, pred)
        patience += 1

        if score < best_score:
            patience = 0
            best_score = score
            torch.save(model.state_dict(), f"../output/model_{i}.pt")

        if patience == 5:
            break

    score_list.append(best_score)
    print(f"Fold {i} MAE: {best_score}")

    if is_holdout:
        break

  0%|          | 0/1000 [00:00<?, ?it/s]

Fold 0 MAE: 1071.299767287234


  0%|          | 0/1000 [00:00<?, ?it/s]

Fold 1 MAE: 1503.9020720108697


  0%|          | 0/1000 [00:00<?, ?it/s]

Fold 2 MAE: 1082.8987092391305


  0%|          | 0/1000 [00:00<?, ?it/s]

Fold 3 MAE: 1215.8969259510868


  0%|          | 0/1000 [00:00<?, ?it/s]

Fold 4 MAE: 1067.4927139945653


In [18]:
np.mean(score_list)

1188.2980376965775

# 테스트 평가

In [19]:
test_df = web.DataReader('005930', 'naver', start='2024-01-01', end='2024-3-31')
test_df = test_df.astype(int)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 61 entries, 2024-01-02 to 2024-03-29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Open    61 non-null     int32
 1   High    61 non-null     int32
 2   Low     61 non-null     int32
 3   Close   61 non-null     int32
 4   Volume  61 non-null     int32
dtypes: int32(5)
memory usage: 1.7 KB


In [20]:
seq_len

10

In [21]:
df_list = [df.iloc[-seq_len:], test_df]
test_data = pd.concat(df_list).to_numpy()
test_data.shape

(71, 5)

In [22]:
x_test, y_test = transform_data(test_data, mins, sizes)
x_test.shape, y_test.shape

((57, 10, 5), (57, 6))

In [23]:
pred_len, seq_len

(5, 10)

- 실제 예측 시 다음과 같이 trg 데이터를 만들어 줘야 함

In [24]:
test_sos = np.zeros_like(y_test)
test_sos[:, 0] = x_test[:, -1, 3]
test_sos.shape

(57, 6)

In [25]:
test_dataset = FinanceDataset(x_test, test_sos)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=False)

In [28]:
pred_list = []

for i in range(n_splits):
    model = Net(enc_feature_size, dec_feature_size, hidden_size, device)
    state_dict = torch.load(f"../output/model_{i}.pt", weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)

    _, pred = test_loop(test_dataloader, model, loss_function, device)
    pred_list.append(pred)

pred = np.mean(pred_list, axis=0)
pred.shape

(57, 5)

In [29]:
pred = pred * sizes[3] + mins[3]
true = y_test[:, 1:] * sizes[3] + mins[3]
mean_absolute_error(true, pred)

1476.8640625000003