## simple RNN
- dataset : https://www.kaggle.com/datasets/iveeaten3223times/massive-yahoo-finance-dataset
- 순차(sequence) 데이터(시계열, 문장 등) 처리에 유리

## 데이터 관련

In [1]:
# Load the stock price CSV from the Kaggle input directory
import pandas as pd
df_stock_all = pd.read_csv('/kaggle/input/massive-yahoo-finance-dataset/stock_details_5_years.csv')
# Preview the first three rows
df_stock_all.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.0,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.0075,54.099998,54.729,31004000,0.0,0.0,GOOGL


In [2]:
# Extract the time series of a single company.
# .copy() gives an independent frame so later column assignments do not
# touch (or warn about) the full dataset.
selected_company = 'AAPL'
is_selected_company = df_stock_all['Company'] == selected_company
df_stock = df_stock_all.loc[is_selected_company].copy()
df_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
458,2018-11-30 00:00:00-05:00,43.261071,43.270671,42.478826,42.850754,158126000,0.0,0.0,AAPL
916,2018-12-03 00:00:00-05:00,44.261681,44.376858,43.481835,44.348064,163210000,0.0,0.0,AAPL
1374,2018-12-04 00:00:00-05:00,43.419445,43.764977,42.296468,42.397247,165377200,0.0,0.0,AAPL
1832,2018-12-06 00:00:00-05:00,41.21428,41.938938,40.892744,41.924541,172393600,0.0,0.0,AAPL


#### 데이터 전처리

In [3]:
# Convert the Date column to timezone-aware datetimes.
# utc=True normalises every timestamp to UTC, giving a datetime64[ns, UTC] column.
df_stock['Date'] = pd.to_datetime(df_stock['Date'], utc=True)
df_stock['Date'].dtype

datetime64[ns, UTC]

In [4]:
# Summarise the column dtypes and non-null counts of the selected company's frame
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1258 entries, 0 to 602471
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   Date          1258 non-null   datetime64[ns, UTC]
 1   Open          1258 non-null   float64            
 2   High          1258 non-null   float64            
 3   Low           1258 non-null   float64            
 4   Close         1258 non-null   float64            
 5   Volume        1258 non-null   int64              
 6   Dividends     1258 non-null   float64            
 7   Stock Splits  1258 non-null   float64            
 8   Company       1258 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(6), int64(1), object(1)
memory usage: 98.3+ KB


In [5]:
# Standardise the price and volume columns (zero mean, unit variance per column)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
scaled_values = scaler.fit_transform(df_stock[scaled_columns])
df_stock[scaled_columns] = scaled_values
df_stock

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 05:00:00+00:00,-1.598199,-1.611253,-1.611061,-1.615618,1.176835,0.0,0.0,AAPL
458,2018-11-30 05:00:00+00:00,-1.610310,-1.623753,-1.614516,-1.620572,1.010309,0.0,0.0,AAPL
916,2018-12-03 05:00:00+00:00,-1.589000,-1.600423,-1.592958,-1.588704,1.104861,0.0,0.0,AAPL
1374,2018-12-04 05:00:00+00:00,-1.606937,-1.613328,-1.618436,-1.630224,1.145166,0.0,0.0,AAPL
1832,2018-12-06 05:00:00+00:00,-1.653900,-1.651840,-1.648607,-1.640285,1.275657,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
600507,2023-11-22 05:00:00+00:00,1.546464,1.532611,1.574085,1.539173,-1.193708,0.0,0.0,AAPL
600998,2023-11-24 05:00:00+00:00,1.533260,1.489798,1.540125,1.510653,-1.483268,0.0,0.0,AAPL
601489,2023-11-27 05:00:00+00:00,1.513029,1.484947,1.532602,1.506822,-1.176321,0.0,0.0,AAPL
601980,2023-11-28 05:00:00+00:00,1.510047,1.493594,1.543349,1.519805,-1.216944,0.0,0.0,AAPL


## 모델 관련

In [6]:
# Build sliding-window samples from the time series
import numpy as np
import torch
sequence_length = 5   # window length: 5 consecutive rows (days) per sample

from tqdm import tqdm
def create_sequences(data, seq_length):
    """Slice a series into fixed-length windows and their next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is the value
    located right after it, ``data[i + seq_length]``.

    Args:
        data: Sliceable sequence of values that supports ``len()``.
        seq_length: Number of consecutive values in every window.

    Returns:
        Tuple ``(features, label)`` of NumPy arrays: the stacked windows and
        the matching targets, with ``len(data) - seq_length`` entries each.
    """
    n_windows = len(data) - seq_length
    windows, targets = [], []
    for start in tqdm(range(n_windows), desc='Generating Sequences'):
        stop = start + seq_length
        windows.append(data[start:stop])
        targets.append(data[stop])  # the value immediately following the window
    return np.array(windows), np.array(targets)

# Use only the Close column: each sample holds `sequence_length` consecutive
# values and its label is the value that follows them.
features, label = create_sequences(df_stock['Close'].values, sequence_length)
features.shape, label.shape

Generating Sequences: 100%|██████████| 1253/1253 [00:00<00:00, 1030685.02it/s]


((1253, 5), (1253,))

In [7]:
# Inspect the first three windows; consecutive windows are shifted by one step
features[:3]

array([[-1.61561817, -1.62057201, -1.58870387, -1.63022425, -1.64028513],
       [-1.62057201, -1.58870387, -1.63022425, -1.64028513, -1.67210203],
       [-1.58870387, -1.63022425, -1.64028513, -1.67210203, -1.66643324]])

PyTorch 모델에 입력하려면 NumPy 배열을 텐서(tensor)로 변환해야 함

In [8]:
# Convert the NumPy arrays to float32 tensors.
# unsqueeze(-1) appends a trailing axis of size 1 to each tensor.
features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(-1)
label_tensor = torch.tensor(label, dtype=torch.float32).unsqueeze(-1)
features_tensor.shape, label_tensor.shape


(torch.Size([1253, 5, 1]), torch.Size([1253, 1]))

In [9]:
# RNN 모델 정의
import torch.nn as nn
class SimpleRNN(nn.Module):
    """Single-layer RNN followed by a linear head applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Dimensionality of the RNN hidden state.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the final time step.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``
                (the RNN is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Size the initial hidden state from the module itself rather than a
        # notebook-level global, and create it with the dtype and device of
        # the input so the model also works after .to(device) / .double().
        h0 = torch.zeros(
            self.rnn.num_layers,
            x.size(0),
            self.rnn.hidden_size,
            dtype=x.dtype,
            device=x.device,
        )
        out, _ = self.rnn(x, h0)
        # Only the output of the last time step feeds the linear head.
        return self.fc(out[:, -1, :])

input_size = 1   # features per time step (only the Close value is fed in)
hidden_size = 5  # dimensionality of the RNN hidden state (not a layer count)
output_size = 1  # one predicted value per input window

# Seed the generator before the layers are created so the random weight
# initialisation is the same on every Restart & Run All.
torch.manual_seed(42)
model = SimpleRNN(input_size, hidden_size, output_size)
model

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [10]:
# Train the model (full-batch: every epoch uses all samples at once)
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)

epochs = 100

# Nothing inside the loop leaves training mode, so setting it once is enough.
model.train()
for epoch in tqdm(range(epochs), desc='learning Model'):
    optimizer.zero_grad()                    # clear gradients of the previous step
    outputs = model(features_tensor)
    loss = criterion(outputs, label_tensor)  # mean squared error vs. the labels
    loss.backward()
    optimizer.step()

    # Report the loss only on every 10th epoch.
    if (epoch + 1) % 10 != 0:
        continue
    print(f'Epoch [{epoch+1} / {epochs}], Loss : {loss.item():.5f}')

learning Model:  71%|███████   | 71/100 [00:00<00:00, 223.42it/s]

Epoch [10 / 100], Loss : 0.47816
Epoch [20 / 100], Loss : 0.07910
Epoch [30 / 100], Loss : 0.08827
Epoch [40 / 100], Loss : 0.03762
Epoch [50 / 100], Loss : 0.03288
Epoch [60 / 100], Loss : 0.02205
Epoch [70 / 100], Loss : 0.01463


learning Model: 100%|██████████| 100/100 [00:00<00:00, 207.41it/s]

Epoch [80 / 100], Loss : 0.01033
Epoch [90 / 100], Loss : 0.00763
Epoch [100 / 100], Loss : 0.00589





In [11]:
# Inspect the learned parameters (RNN weights/biases and the linear head)
model.state_dict()

OrderedDict([('rnn.weight_ih_l0',
              tensor([[-0.1848],
                      [ 0.2535],
                      [ 0.1984],
                      [ 0.3694],
                      [ 0.5889]])),
             ('rnn.weight_hh_l0',
              tensor([[ 0.3832, -0.2278, -0.2445,  0.5269, -0.5165],
                      [-0.3383, -0.2704,  0.1495, -0.0471,  0.1977],
                      [-0.2578,  0.5904, -0.4158,  0.2235,  0.2613],
                      [-0.0369,  0.5400,  0.4035, -0.0715,  0.5645],
                      [-0.3134,  0.1617, -0.0820, -0.2784, -0.0159]])),
             ('rnn.bias_ih_l0',
              tensor([-0.2838,  0.2514, -0.2364, -0.0190,  0.0262])),
             ('rnn.bias_hh_l0',
              tensor([ 0.0992, -0.0221, -0.2050,  0.1033, -0.1578])),
             ('fc.weight',
              tensor([[-0.4215,  0.2285,  0.3170,  0.4829,  0.6062]])),
             ('fc.bias', tensor([-0.0897]))])

## 평가 관련

In [12]:
# Switch the model to evaluation mode before running inference
model.eval()

SimpleRNN(
  (rnn): RNN(1, 5, batch_first=True)
  (fc): Linear(in_features=5, out_features=1, bias=True)
)

In [13]:
# Predict the first five samples with gradient tracking disabled and show the
# predictions next to their labels.
# NOTE(review): these samples are part of the data the training loop fitted
# on, so this comparison is not a held-out evaluation.
with torch.no_grad():
    predicted = model(features_tensor[:5])
predicted, label_tensor[:5]

(tensor([[-1.5423],
         [-1.5550],
         [-1.5589],
         [-1.5617],
         [-1.5621]]),
 tensor([[-1.6721],
         [-1.6664],
         [-1.6714],
         [-1.6690],
         [-1.6595]]))