## simple RNN
- dataset : https://www.kaggle.com/datasets/iveeaten3223times/massive-yahoo-finance-dataset
- 연속형(=시계열, 문장 등) 데이터 처리에 유리

## 데이터 관련

In [1]:
# 데이터 로드
import pandas as pd
# df_stock = pd.read_csv('/kaggle/input/massive-yahoo-finance-dataset/stock_details_5_years.csv')
# df_stock.head(3)

df_stock_all = pd.read_csv('/kaggle/input/massive-yahoo-finance-dataset/stock_details_5_years.csv')
df_stock_all.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.0,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.0075,54.099998,54.729,31004000,0.0,0.0,GOOGL


In [2]:
# 특정 회사(컴퍼니)만 시계열 가져오기 (나오게)
selected_company = 'AAPL'
df_stock = df_stock_all[df_stock_all['Company'] == selected_company]
df_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
458,2018-11-30 00:00:00-05:00,43.261071,43.270671,42.478826,42.850754,158126000,0.0,0.0,AAPL
916,2018-12-03 00:00:00-05:00,44.261681,44.376858,43.481835,44.348064,163210000,0.0,0.0,AAPL
1374,2018-12-04 00:00:00-05:00,43.419445,43.764977,42.296468,42.397247,165377200,0.0,0.0,AAPL
1832,2018-12-06 00:00:00-05:00,41.21428,41.938938,40.892744,41.924541,172393600,0.0,0.0,AAPL


### 데이터 전처리

In [3]:
# 날짜를 datetime 변환
df_stock['Date'] = pd.to_datetime(df_stock['Date'], utc=True)
df_stock['Date'].dtype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stock['Date'] = pd.to_datetime(df_stock['Date'], utc=True)


datetime64[ns, UTC]

In [4]:
# df_stock.info()

In [5]:
# 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_stock[['Open','High','Low','Close','Volume']] = scaler.fit_transform(df_stock[['Open','High','Low','Close','Volume']])
df_stock.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stock[['Open','High','Low','Close','Volume']] = scaler.fit_transform(df_stock[['Open','High','Low','Close','Volume']])


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 05:00:00+00:00,-1.598199,-1.611253,-1.611061,-1.615618,1.176835,0.0,0.0,AAPL
458,2018-11-30 05:00:00+00:00,-1.61031,-1.623753,-1.614516,-1.620572,1.010309,0.0,0.0,AAPL
916,2018-12-03 05:00:00+00:00,-1.589,-1.600423,-1.592958,-1.588704,1.104861,0.0,0.0,AAPL


In [6]:
df_stock.shape

(1258, 9)

In [7]:
# 시계열 데이터 생성
import numpy as np
import torch

sequence_length = 5 # 5일 단위

from tqdm import tqdm


def create_sequences(data, seq_length):
    xs = []
    ys = []
    # for i in range(len(data) - seq_length):
    for i in tqdm(range(len(data) - seq_length), desc = 'Generating Sequences'):
        x = data[i:i+seq_length]
        y = data[i+seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys) # features, label

features, label = create_sequences(df_stock['Close'].values, sequence_length)
features.shape, label.shape

Generating Sequences: 100%|██████████| 1253/1253 [00:00<00:00, 929183.68it/s]


((1253, 5), (1253,))

In [8]:
features[:3]

array([[-1.61561817, -1.62057201, -1.58870387, -1.63022425, -1.64028513],
       [-1.62057201, -1.58870387, -1.63022425, -1.64028513, -1.67210203],
       [-1.58870387, -1.63022425, -1.64028513, -1.67210203, -1.66643324]])

In [9]:
# 텐서 변환
features_tensor = torch.tensor(features, dtype = torch.float32).unsqueeze(-1)
label_tensor = torch.tensor(label, dtype = torch.float32).unsqueeze(-1)
features_tensor.shape, label_tensor.shape

(torch.Size([1253, 5, 1]), torch.Size([1253, 1]))

## 모델관련

In [10]:
# RNN 모델 정의
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), hidden_size)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

input_size = 1 # row 단위 갯수
hidden_size = 5 # 은닉층 갯수
output_size = 1 

model = SimpleRNN(input_size, hidden_size, output_size)
model

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [11]:
# 모델 학습
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr = 0.01)

epochs = 100

for epoch in tqdm(range(epochs), desc = 'learning Model'):
    model.train()
    outputs = model(features_tensor)
    optimizer.zero_grad()
    loss = criterion(outputs, label_tensor) # error율 확인
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0: # 10회에 1번 출력
        print(f'Epoch [{epoch+1} / {epochs}]. Loss : {loss.item():.5f}')

learning Model:  63%|██████▎   | 63/100 [00:00<00:00, 192.87it/s]

Epoch [10 / 100]. Loss : 0.37644
Epoch [20 / 100]. Loss : 0.07579
Epoch [30 / 100]. Loss : 0.05108
Epoch [40 / 100]. Loss : 0.01282
Epoch [50 / 100]. Loss : 0.01244
Epoch [60 / 100]. Loss : 0.00715
Epoch [70 / 100]. Loss : 0.00449


learning Model: 100%|██████████| 100/100 [00:00<00:00, 183.76it/s]

Epoch [80 / 100]. Loss : 0.00372
Epoch [90 / 100]. Loss : 0.00333
Epoch [100 / 100]. Loss : 0.00305





In [12]:
model.state_dict()

OrderedDict([('rnn.weight_ih_l0',
              tensor([[-0.6295],
                      [ 0.6235],
                      [-0.5685],
                      [ 0.3158],
                      [ 0.2052]])),
             ('rnn.weight_hh_l0',
              tensor([[-0.4046, -0.0246, -0.1486, -0.4862,  0.1588],
                      [ 0.3081,  0.3946, -0.4671, -0.4152, -0.3169],
                      [ 0.1670, -0.4396, -0.0220,  0.3032,  0.0500],
                      [-0.6185,  0.2768,  0.0188, -0.3537,  0.0373],
                      [ 0.0356,  0.0028, -0.5810,  0.2785,  0.2676]])),
             ('rnn.bias_ih_l0',
              tensor([-0.0375,  0.1491,  0.3545, -0.0723, -0.5514])),
             ('rnn.bias_hh_l0',
              tensor([ 0.6611,  0.1004,  0.1232, -0.2875,  0.1258])),
             ('fc.weight',
              tensor([[-0.5663,  0.6525, -0.4088,  0.2000,  0.5054]])),
             ('fc.bias', tensor([0.1897]))])

## 평가 관련

In [13]:
model.eval()

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [14]:
with torch.no_grad():
    predicted = model(features_tensor[:5])
predicted, label_tensor[:5]

(tensor([[-1.5842],
         [-1.6024],
         [-1.6043],
         [-1.6072],
         [-1.6067]]),
 tensor([[-1.6721],
         [-1.6664],
         [-1.6714],
         [-1.6690],
         [-1.6595]]))