In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

from scipy.stats import norm

import math

from sklearn.preprocessing import MinMaxScaler

import plotly.graph_objects as go

In [17]:
# 리스트로 들어간 feature 삭제 및 5개 기업 선택
df = pd.read_csv('dataset-evalPRPR-SPX-nonan.csv')

In [18]:
drop_features = ['P','P_future','R','R_future','company', 'calculated_price']
df_drop = df.drop(columns=drop_features)
# df_5 = df_drop[df['ticker'].isin(['MMM', 'VRTX', 'MRK', 'MDT', 'MCK'])] # 5개 기업의 데이터만
df_500 = df_drop.copy()

In [19]:
pd.set_option('display.max_rows', 60)
df_500['ticker'].value_counts()

ticker_counts = df_500['ticker'].value_counts()
ticker_list = ticker_counts[ticker_counts == 509].index.tolist()
df_500 = df_500[df_500['ticker'].isin(ticker_list)]
df_500 = df_500.sort_values(['ticker', 'timestamp'])
df_500.drop(df_500.columns[2:12], axis=1, inplace=True)
df_500 # 총 주 수가 509개인 446개 기업만 선택

Unnamed: 0,ticker,timestamp,open,high,low,close,volume,return,SPX
6108,A,2013-05-03,41.64,42.5500,40.950,41.55,16896400.0,0.006053,1614.42
6109,A,2013-05-10,41.44,43.8500,41.390,43.63,16284100.0,0.050060,1633.70
6110,A,2013-05-17,43.43,46.4900,42.720,45.56,28768000.0,0.044236,1667.47
6111,A,2013-05-24,45.48,47.4730,45.195,45.59,21143400.0,0.000659,1649.60
6112,A,2013-05-31,46.09,47.0500,45.400,45.45,16114400.0,-0.003071,1630.74
...,...,...,...,...,...,...,...,...,...
241293,ZTS,2022-12-30,145.91,148.5100,143.570,146.55,4950165.0,0.005420,3839.50
241294,ZTS,2023-01-06,148.66,150.8144,141.520,147.64,6958752.0,0.007438,3895.08
241295,ZTS,2023-01-13,149.00,161.4000,147.000,160.92,12618342.0,0.089949,3999.09
241296,ZTS,2023-01-20,160.69,164.9400,158.710,163.81,13851912.0,0.017959,3972.61


In [20]:
# 정규화 
## open, high, low, close은 주가관련인데, 이는 기업별로 다르므로 기업별로 정규화 필요
## SPX은 기업별로 다르지 않음 따라서 통합 정규화
## volume은 기업별로 다르지만 통합 정규화가 올바름
## return은 따로 정규화할 필요는 없을거 같음
## 정규화는 편의를 위해 MinMaxScaler만 사용

scaler = MinMaxScaler()
unique_tickers = df_500['ticker'].unique()

columns_list = df_500.columns.tolist()
print(columns_list)
total_list = [columns_list[-1]] + [columns_list[-3]]
print(total_list)
seperate_list = columns_list[2:6]
print(seperate_list)

# 통합 정규화 : spx, volume
for i in total_list:
    df_500[i] = scaler.fit_transform(df_500[i].values.reshape(-1, 1))

# 개별기업별 정규화 : open, high, low, close	
for i in unique_tickers:
    for j in seperate_list:
        mask = (df_500['ticker'] == i)
        df_500.loc[mask, j] = scaler.fit_transform(df_500.loc[mask, j].values.reshape(-1, 1))

df_500

['ticker', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'return', 'SPX']
['SPX', 'volume']
['open', 'high', 'low', 'close']


Unnamed: 0,ticker,timestamp,open,high,low,close,volume,return,SPX
6108,A,2013-05-03,0.053435,0.054317,0.054616,0.049023,0.012817,0.006053,0.006929
6109,A,2013-05-10,0.052060,0.063289,0.057684,0.063385,0.012352,0.050060,0.013004
6110,A,2013-05-17,0.065745,0.081510,0.066955,0.076711,0.021839,0.044236,0.023644
6111,A,2013-05-24,0.079843,0.088295,0.084208,0.076918,0.016045,0.000659,0.018013
6112,A,2013-05-31,0.084038,0.085375,0.085637,0.075951,0.012223,-0.003071,0.012071
...,...,...,...,...,...,...,...,...,...
241293,ZTS,2022-12-30,0.543884,0.543018,0.537044,0.547929,0.003738,0.005420,0.708017
241294,ZTS,2023-01-06,0.556621,0.553469,0.527507,0.552984,0.005265,0.007438,0.725530
241295,ZTS,2023-01-13,0.558196,0.601479,0.553002,0.614571,0.009566,0.089949,0.758302
241296,ZTS,2023-01-20,0.612338,0.617534,0.607481,0.627974,0.010503,0.017959,0.749958


In [None]:
# 라벨 인코더 생성
label_encoder = LabelEncoder()

# 라벨 인코딩 딕셔너리 저장
encoded_labels = label_encoder.fit_transform(df_500['ticker'])
encoded_dict = dict(zip(df_500['ticker'], encoded_labels))
inverse_encoded_dict = {value: key for key, value in encoded_dict.items()}

# ticker 열 라벨 인코딩
df_500['ticker'] = label_encoder.fit_transform(df_500['ticker'])

print(df_500[['ticker']])
df_500.info()
df_500['ticker'].value_counts()

In [22]:
# 전체 기업 데이터를 이용할 것이기 때문에 날짜 자체가 입력데이터로 들어가야 구분됨
# 'date' 열을 datetime 타입으로 변환
# train / test 시계열 분리

df_500['timestamp'] = pd.to_datetime(df_500['timestamp'])

# 'year', 'month', 'day' 열 추출
df_500['year'] = df_500['timestamp'].dt.year
df_500['month'] = df_500['timestamp'].dt.month
df_500['day'] = df_500['timestamp'].dt.day

df_500_train = df_500.loc[df_500['timestamp'] < '2021-09-01']
# df_500_val = df_500.loc[(df_500['timestamp'] > '2020-09-01') & (df_500['timestamp'] < '2021-09-01')]
df_500_test = df_500.loc[df_500['timestamp'] > '2021-09-01']

print(df_500_train[['year','month','day']])
# print(df_500_val[['year','month','day']])
print(df_500_test[['year','month','day']])

df_500_train.drop(columns='timestamp', inplace=True)
# df_500_val.drop(columns='timestamp', inplace=True)
df_500_test.drop(columns='timestamp', inplace=True)

df_500_train

        year  month  day
6108    2013      5    3
6109    2013      5   10
6110    2013      5   17
6111    2013      5   24
6112    2013      5   31
...      ...    ...  ...
241219  2021      7   30
241220  2021      8    6
241221  2021      8   13
241222  2021      8   20
241223  2021      8   27

[194010 rows x 3 columns]
        year  month  day
6543    2021      9    3
6544    2021      9   10
6545    2021      9   17
6546    2021      9   24
6547    2021     10    1
...      ...    ...  ...
241293  2022     12   30
241294  2023      1    6
241295  2023      1   13
241296  2023      1   20
241297  2023      1   27

[33004 rows x 3 columns]


Unnamed: 0,ticker,open,high,low,close,volume,return,SPX,year,month,day
6108,0,0.053435,0.054317,0.054616,0.049023,0.012817,0.006053,0.006929,2013,5,3
6109,0,0.052060,0.063289,0.057684,0.063385,0.012352,0.050060,0.013004,2013,5,10
6110,0,0.065745,0.081510,0.066955,0.076711,0.021839,0.044236,0.023644,2013,5,17
6111,0,0.079843,0.088295,0.084208,0.076918,0.016045,0.000659,0.018013,2013,5,24
6112,0,0.084038,0.085375,0.085637,0.075951,0.012223,-0.003071,0.012071,2013,5,31
...,...,...,...,...,...,...,...,...,...,...,...
241219,445,0.806493,0.802259,0.800368,0.808329,0.005785,-0.004372,0.883129,2021,7,30
241220,445,0.810523,0.808291,0.779153,0.804526,0.007388,-0.004045,0.896129,2021,8,6
241221,445,0.805382,0.789968,0.785433,0.802671,0.004203,-0.001981,0.906048,2021,8,13
241222,445,0.801862,0.814005,0.804369,0.828503,0.004179,0.027645,0.897752,2021,8,20


In [23]:
def inputfor3D(df, train=True, test=False):
    df_out = df.copy()  # 입력된 df를 복사해서 사용
    
    if train: # input train은 output train의 누락에 맞춰 기업 당 마지막 12주 삭제
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples-기업수, seq_len-시계열수(총 주 수), num_features-피처수)
        X_df_train = np.stack(data, axis=0)
        return X_df_train

    elif test: # input test는 output test의 누락에 맞춰 
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples-기업수, seq_len-시계열수(총 주 수), num_features-피처수)
        X_df_test = np.stack(data, axis=0)
        return X_df_test


def outputfor3D(df, train=True, test=False):
    df_out = df.copy()  # 입력된 df를 복사해서 사용
    
    if train:
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            for j in range(0, 13):
                locals()[f'df_{i}'][f'future_{j}'] = locals()[f'df_{i}']['return'].shift(-j)
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            locals()[f'df_{i}'] = locals()[f'df_{i}'][[f'future_{k}' for k in range(0, 13)]]
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples, seq_len, num_features)
        y_df_train = np.stack(data, axis=0)
        return y_df_train

    elif test:
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            for j in range(0, 13):
                locals()[f'df_{i}'][f'future_{j}'] = locals()[f'df_{i}']['return'].shift(-j)
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            locals()[f'df_{i}'] = locals()[f'df_{i}'][[f'future_{k}' for k in range(0, 13)]]
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples, seq_len, num_features)
        y_df_test = np.stack(data, axis=0)
        return y_df_test

In [24]:
# 2분 걸림

set_train_X = inputfor3D(df_500_train, train=True, test=False)
print(set_train_X.shape)

# set_val_X = inputfor3D(df_500_val, train=False, test=True)
# print(set_val_X.shape)

set_test_X = inputfor3D(df_500_test, train=False, test=True)
print(set_test_X.shape)

(446, 423, 11)
(446, 62, 11)


In [25]:
# 12분 걸림

set_train_y = outputfor3D(df_500_train, train=True, test=False)
print(set_train_y.shape)

# set_val_y = outputfor3D(df_500_val, train=False, test=True) 
# print(set_val_y.shape)

set_test_y = outputfor3D(df_500_test, train=False, test=True)
print(set_test_y.shape)

(446, 423, 13)
(446, 62, 13)


In [26]:
# torch에 train 데이터 올리기

# train X
train_X_t = torch.tensor(set_train_X, dtype=torch.float32)
print(train_X_t.size())

# train y
train_y_t = torch.tensor(set_train_y, dtype=torch.float32)
print(train_y_t.size())


torch.Size([446, 423, 11])
torch.Size([446, 423, 13])


In [27]:
# torch에 val 데이터 올리기

# # train X
# val_X_t = torch.tensor(set_val_X, dtype=torch.float32)
# print(val_X_t.size())

# # train y
# val_y_t = torch.tensor(set_val_y, dtype=torch.float32)
# print(val_y_t.size())

In [28]:
# torch에 test 데이터 올리기

# train X
test_X_t = torch.tensor(set_test_X, dtype=torch.float32)
print(test_X_t.size())

# train y
test_y_t = torch.tensor(set_test_y, dtype=torch.float32)
print(test_y_t.size())

torch.Size([446, 62, 11])
torch.Size([446, 62, 13])


In [29]:
# Build model
#####################
input_dim = 11
hidden_dim = 32
num_layers = 2
output_dim = 13


class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(BiLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_dim)
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_dim)

        out, _ = self.bilstm(x, (h0, c0))
        out = self.dropout(out)
        out = self.fc(out)

        return out

    
model = BiLSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)

loss_fn = torch.nn.MSELoss()

optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
print(model)
print(len(list(model.parameters())))
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())


BiLSTM(
  (bilstm): LSTM(11, 32, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=64, out_features=13, bias=True)
)
18
torch.Size([128, 11])
torch.Size([128, 32])
torch.Size([128])
torch.Size([128])
torch.Size([128, 11])
torch.Size([128, 32])
torch.Size([128])
torch.Size([128])
torch.Size([128, 64])
torch.Size([128, 32])
torch.Size([128])
torch.Size([128])
torch.Size([128, 64])
torch.Size([128, 32])
torch.Size([128])
torch.Size([128])
torch.Size([13, 64])
torch.Size([13])


In [30]:
#train model
num_epochs = 80
hist = np.zeros(num_epochs)

# Number of steps to unroll
look_back = 13
seq_dim =look_back-1  

n_features = train_X_t.shape[-1]

# # Reshape the data into appropriate shape for BLSTM
# X_train = train_X_t.reshape(len(train_X_t), look_back-1, n_features)
# y_train = train_y_t.reshape(-1, 1, train_y_t.shape[-1])
# X_test = test_X_t.reshape(-1, look_back-1, n_features)
# y_test = test_y_t.reshape(-1, 1, test_y_t.shape[-1])

for t in range(num_epochs):
    # Forward pass
    y_train_pred = model(train_X_t)

    loss = loss_fn(y_train_pred, train_y_t)
    if t % 10 == 0 and t !=0:
        print("Epoch ", t, "MSE: ", loss.item())
    hist[t] = loss.item()

    # Zero out gradient, else they will accumulate between epochs
    optimiser.zero_grad()

    # Backward pass
    loss.backward()

    # Update parameters
    optimiser.step()
    
    # Print the loss
    if t % 10 == 0:
        print(f"Epoch {t}, Train Loss: {loss.item():.4f}")

Epoch 0, Train Loss: 0.0311
Epoch  10 MSE:  0.0029053781181573868
Epoch 10, Train Loss: 0.0029
Epoch  20 MSE:  0.002409914508461952
Epoch 20, Train Loss: 0.0024
Epoch  30 MSE:  0.002209883648902178
Epoch 30, Train Loss: 0.0022
Epoch  40 MSE:  0.002123946323990822
Epoch 40, Train Loss: 0.0021
Epoch  50 MSE:  0.0020974422805011272
Epoch 50, Train Loss: 0.0021
Epoch  60 MSE:  0.0020819383207708597
Epoch 60, Train Loss: 0.0021
Epoch  70 MSE:  0.0020730781834572554
Epoch 70, Train Loss: 0.0021


In [31]:
# Save model
torch.save(model.state_dict(), './trainedmodel/model_biLSTM_withoutGPT.pt')

# Load model
model_biLSTM = BiLSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
model_biLSTM.load_state_dict(torch.load('./trainedmodel/model_biLSTM_withoutGPT.pt'))

<All keys matched successfully>

In [32]:
def calculate_rmse(actual_values, predicted_values):
    mse = torch.mean((actual_values - predicted_values) ** 2)
    rmse = torch.sqrt(mse)
    return rmse

In [33]:
# Evaluate loaded model
with torch.no_grad():
    prediction = model_biLSTM(test_X_t)
    test_rmse = calculate_rmse(test_y_t, prediction)
    print(f"Test RMSE: {test_rmse.item():.4f}")

Test RMSE: 0.0516


In [34]:
def torch2df(input_torch):
    df_all = pd.DataFrame(columns=["ticker"] + [f"future_{i}" for i in range(13)])

    for company_idx in range(input_torch.shape[0]):
        company_data = input_torch[company_idx]
        company_df = pd.DataFrame(company_data.numpy(), columns=[f"future_{i}" for i in range(13)])
        company_df.insert(0, 'ticker', inverse_encoded_dict[company_idx])
        # company_df['ticker'] = label_encoder.inverse_transform(df_500['ticker'])
        df_all = pd.concat([df_all, company_df], axis=0, ignore_index=False)

    return df_all

In [35]:
BiLSTM_withoutGPT_result_notime = torch2df(prediction)

In [None]:
# timestamp 파일 불러오기
timestamp_train = pd.read_csv('./result/timestamp_train.csv')
timestamp_test = pd.read_csv('./result/timestamp_test.csv')

# 열 삽입할 위치 지정
insert_index = 1

# 삽입할 열 선택
column_to_insert = timestamp_test['timestamp']

# 열 삽입
BiLSTM_withoutGPT_result = BiLSTM_withoutGPT_result_notime.copy()
BiLSTM_withoutGPT_result.insert(insert_index, 'timestamp', column_to_insert)

# 결과 출력
print(BiLSTM_withoutGPT_result)

In [41]:
BiLSTM_withoutGPT_result.to_csv('./result/BiLSTM_withoutGPT.csv', index=False)

In [42]:
# graph
BiLSTM_withoutGPT = pd.read_csv('./result/BiLSTM_withoutGPT.csv')
Test_y = pd.read_csv('./result/Test_y.csv')

In [43]:
def plot_rpast_rfuture(predict, test, ticker, timestamp): # timestamp의 경우 형식은 YYYY.MM.DD, ticker, timestamp는 스트링
    # 
    predict_row = predict.loc[(predict['ticker'] == ticker) & (predict['timestamp'] == timestamp)]
    predict_values = predict_row[['future_0', 'future_1', 'future_2', 'future_3', 'future_4', 'future_5', 'future_6', 'future_7', 'future_8', 'future_9', 'future_10', 'future_11', 'future_12']].values[0]

    test_future_row = test.loc[(test['ticker'] == ticker) & (test['timestamp'] == timestamp)]
    test_future_values = test_future_row[['future_0', 'future_1', 'future_2', 'future_3', 'future_4', 'future_5', 'future_6', 'future_7', 'future_8', 'future_9', 'future_10', 'future_11', 'future_12']].values[0]
    
    # row 인덱스
    index = test.loc[(test['ticker'] == ticker) & (test['timestamp'] == timestamp)].index[0]
    # make sure we don't go below 0
    if index >= 12:
        test_past_rows = test.iloc[index - 12]
    else:
        print("Not enough previous data.")
    test_past_values = test_past_rows[['future_0', 'future_1', 'future_2', 'future_3', 'future_4', 'future_5', 'future_6', 'future_7', 'future_8', 'future_9', 'future_10', 'future_11', 'future_12']].values # 수정된 부분

    # 리스트로 만들기
    predict_values_list = predict_values.tolist()
    test_future_values_list = test_future_values.tolist()
    test_past_values_list = test_past_values.tolist()
    
    # 객체 생성
    fig = go.Figure()

    # 트레이스 생성
    trace_past = go.Scatter(
        x = list(range(-12, 1)),
        y = test_past_values_list[-13:],  # 수정된 부분
        mode = 'lines',
        name = 'Past'
    )

    trace_predict = go.Scatter(
        x = list(range(0, 13)),
        y = predict_values_list,
        mode = 'lines',
        name = 'Predict'
    )

    trace_future = go.Scatter(
        x = list(range(0, 13)),
        y = test_future_values_list,
        mode = 'lines',
        name = 'Actual Future'
    )

    # 객체에 트레이스 삽입
    fig.add_trace(trace_past)
    fig.add_trace(trace_predict)
    fig.add_trace(trace_future)


    # 레이아웃
    fig.update_layout(
        title = "Past, Predicted, and Actual Future Return",
        xaxis_title = "Time",
        yaxis_title = "Return",
        legend_title = "Legend",
        font = dict(
            family = "Courier New, monospace",
            size = 18,
            color = "RebeccaPurple"
        )
    )

    # 그리기
    fig.show()

In [54]:
plot_rpast_rfuture(BiLSTM_withoutGPT, Test_y, 'AAPL', '2021-11-26')

In [52]:
plot_rpast_rfuture(BiLSTM_withoutGPT, Test_y, 'AMZN', '2021-11-26')