In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

from scipy.stats import norm

import math

from sklearn.preprocessing import MinMaxScaler

import plotly.graph_objects as go

In [2]:
# 리스트로 들어간 feature 삭제 및 5개 기업 선택
df = pd.read_csv('dataset-evalPRPR-SPX-nonan.csv')

In [3]:
drop_features = ['P','P_future','R','R_future','company', 'calculated_price']
df_drop = df.drop(columns=drop_features)
# df_5 = df_drop[df['ticker'].isin(['MMM', 'VRTX', 'MRK', 'MDT', 'MCK'])] # 5개 기업의 데이터만
df_500 = df_drop.copy()


In [4]:
pd.set_option('display.max_rows', 60)
df_500['ticker'].value_counts()

ticker_counts = df_500['ticker'].value_counts()
ticker_list = ticker_counts[ticker_counts == 509].index.tolist()
df_500 = df_500[df_500['ticker'].isin(ticker_list)]
df_500 = df_500.sort_values(['ticker', 'timestamp'])
df_500 # 총 주 수가 509개인 446개 기업만 선택

Unnamed: 0,ticker,timestamp,financial stability and liquidity,strong management team,competitive advantage,market potential,growth prospects,diversification within the company,sustainable business model,innovation and R&D,corporate governance,strong brand recognition,open,high,low,close,volume,return,SPX
6108,A,2013-05-03,80.0,90.0,85.0,85.0,85.0,70.0,90.0,95.0,90.0,85.0,41.64,42.5500,40.950,41.55,16896400.0,0.006053,1614.42
6109,A,2013-05-10,80.0,90.0,85.0,85.0,85.0,70.0,90.0,95.0,90.0,85.0,41.44,43.8500,41.390,43.63,16284100.0,0.050060,1633.70
6110,A,2013-05-17,80.0,90.0,85.0,85.0,85.0,70.0,90.0,95.0,90.0,85.0,43.43,46.4900,42.720,45.56,28768000.0,0.044236,1667.47
6111,A,2013-05-24,80.0,90.0,85.0,85.0,85.0,70.0,90.0,95.0,90.0,85.0,45.48,47.4730,45.195,45.59,21143400.0,0.000659,1649.60
6112,A,2013-05-31,80.0,90.0,85.0,85.0,85.0,70.0,90.0,95.0,90.0,85.0,46.09,47.0500,45.400,45.45,16114400.0,-0.003071,1630.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241293,ZTS,2022-12-30,90.0,95.0,90.0,85.0,85.0,80.0,90.0,80.0,90.0,85.0,145.91,148.5100,143.570,146.55,4950165.0,0.005420,3839.50
241294,ZTS,2023-01-06,90.0,95.0,90.0,85.0,85.0,80.0,90.0,80.0,90.0,85.0,148.66,150.8144,141.520,147.64,6958752.0,0.007438,3895.08
241295,ZTS,2023-01-13,90.0,95.0,90.0,85.0,85.0,80.0,90.0,80.0,90.0,85.0,149.00,161.4000,147.000,160.92,12618342.0,0.089949,3999.09
241296,ZTS,2023-01-20,90.0,95.0,90.0,85.0,85.0,80.0,90.0,80.0,90.0,85.0,160.69,164.9400,158.710,163.81,13851912.0,0.017959,3972.61


In [5]:
# 정규화 
## GPT점수는 100점만점 척도이므로 기업별로 다르지 않음 따라서 통합 정규화
## open, high, low, close은 주가관련인데, 이는 기업별로 다르므로 기업별로 정규화 필요
## SPX은 기업별로 다르지 않음 따라서 통합 정규화
## volume은 기업별로 다르지만 통합 정규화가 올바름
## return은 따로 정규화할 필요는 없을거 같음
## 정규화는 편의를 위해 MinMaxScaler만 사용

scaler = MinMaxScaler()
unique_tickers = df_500['ticker'].unique()

columns_list = df_500.columns.tolist()
print(columns_list)
total_list = columns_list[2:12] + [columns_list[-1]] + [columns_list[-3]]
print(total_list)
seperate_list = columns_list[12:16]
print(seperate_list)

# 통합 정규화 : spx, gpt, volume
for i in total_list:
    df_500[i] = scaler.fit_transform(df_500[i].values.reshape(-1, 1))

# 개별기업별 정규화 : open, high, low, close	
for i in unique_tickers:
    for j in seperate_list:
        mask = (df_500['ticker'] == i)
        df_500.loc[mask, j] = scaler.fit_transform(df_500.loc[mask, j].values.reshape(-1, 1))

df_500

['ticker', 'timestamp', 'financial stability and liquidity', 'strong management team', 'competitive advantage', 'market potential', 'growth prospects', 'diversification within the company', 'sustainable business model', 'innovation and R&D', 'corporate governance', 'strong brand recognition', 'open', 'high', 'low', 'close', 'volume', 'return', 'SPX']
['financial stability and liquidity', 'strong management team', 'competitive advantage', 'market potential', 'growth prospects', 'diversification within the company', 'sustainable business model', 'innovation and R&D', 'corporate governance', 'strong brand recognition', 'SPX', 'volume']
['open', 'high', 'low', 'close']


Unnamed: 0,ticker,timestamp,financial stability and liquidity,strong management team,competitive advantage,market potential,growth prospects,diversification within the company,sustainable business model,innovation and R&D,corporate governance,strong brand recognition,open,high,low,close,volume,return,SPX
6108,A,2013-05-03,0.571429,0.714286,0.7,0.714286,0.647059,0.500000,0.909091,0.928571,0.869565,0.625,0.053435,0.054317,0.054616,0.049023,0.012817,0.006053,0.006929
6109,A,2013-05-10,0.571429,0.714286,0.7,0.714286,0.647059,0.500000,0.909091,0.928571,0.869565,0.625,0.052060,0.063289,0.057684,0.063385,0.012352,0.050060,0.013004
6110,A,2013-05-17,0.571429,0.714286,0.7,0.714286,0.647059,0.500000,0.909091,0.928571,0.869565,0.625,0.065745,0.081510,0.066955,0.076711,0.021839,0.044236,0.023644
6111,A,2013-05-24,0.571429,0.714286,0.7,0.714286,0.647059,0.500000,0.909091,0.928571,0.869565,0.625,0.079843,0.088295,0.084208,0.076918,0.016045,0.000659,0.018013
6112,A,2013-05-31,0.571429,0.714286,0.7,0.714286,0.647059,0.500000,0.909091,0.928571,0.869565,0.625,0.084038,0.085375,0.085637,0.075951,0.012223,-0.003071,0.012071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241293,ZTS,2022-12-30,0.857143,0.892857,0.8,0.714286,0.647059,0.666667,0.909091,0.714286,0.869565,0.625,0.543884,0.543018,0.537044,0.547929,0.003738,0.005420,0.708017
241294,ZTS,2023-01-06,0.857143,0.892857,0.8,0.714286,0.647059,0.666667,0.909091,0.714286,0.869565,0.625,0.556621,0.553469,0.527507,0.552984,0.005265,0.007438,0.725530
241295,ZTS,2023-01-13,0.857143,0.892857,0.8,0.714286,0.647059,0.666667,0.909091,0.714286,0.869565,0.625,0.558196,0.601479,0.553002,0.614571,0.009566,0.089949,0.758302
241296,ZTS,2023-01-20,0.857143,0.892857,0.8,0.714286,0.647059,0.666667,0.909091,0.714286,0.869565,0.625,0.612338,0.617534,0.607481,0.627974,0.010503,0.017959,0.749958


In [None]:
# 라벨 인코더 생성
label_encoder = LabelEncoder()

# 라벨 인코딩 딕셔너리 저장
encoded_labels = label_encoder.fit_transform(df_500['ticker'])
encoded_dict = dict(zip(df_500['ticker'], encoded_labels))
inverse_encoded_dict = {value: key for key, value in encoded_dict.items()}

# ticker 열 라벨 인코딩
df_500['ticker'] = label_encoder.fit_transform(df_500['ticker'])

print(df_500[['ticker']])
df_500.info()
df_500['ticker'].value_counts()

In [None]:
inverse_encoded_dict

In [None]:
# 전체 기업 데이터를 이용할 것이기 때문에 날짜 자체가 입력데이터로 들어가야 구분됨
# 'date' 열을 datetime 타입으로 변환
# train / test 시계열 분리

df_500['timestamp'] = pd.to_datetime(df_500['timestamp'])

# 'year', 'month', 'day' 열 추출
df_500['year'] = df_500['timestamp'].dt.year
df_500['month'] = df_500['timestamp'].dt.month
df_500['day'] = df_500['timestamp'].dt.day

df_500_train = df_500.loc[df_500['timestamp'] < '2021-09-01']
# df_500_val = df_500.loc[(df_500['timestamp'] > '2020-09-01') & (df_500['timestamp'] < '2021-09-01')]
df_500_test = df_500.loc[df_500['timestamp'] > '2021-09-01']

print(df_500_train[['year','month','day']])
# print(df_500_val[['year','month','day']])
print(df_500_test[['year','month','day']])

df_500_train.drop(columns='timestamp', inplace=True)
# df_500_val.drop(columns='timestamp', inplace=True)
df_500_test.drop(columns='timestamp', inplace=True)

df_500_train

In [9]:
def inputfor3D(df, train=True, test=False):
    df_out = df.copy()  # 입력된 df를 복사해서 사용
    
    if train: # input train은 output train의 누락에 맞춰 기업 당 마지막 12주 삭제
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples-기업수, seq_len-시계열수(총 주 수), num_features-피처수)
        X_df_train = np.stack(data, axis=0)
        return X_df_train

    elif test: # input test는 output test의 누락에 맞춰 
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples-기업수, seq_len-시계열수(총 주 수), num_features-피처수)
        X_df_test = np.stack(data, axis=0)
        return X_df_test


def outputfor3D(df, train=True, test=False):
    df_out = df.copy()  # 입력된 df를 복사해서 사용
    
    if train:
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            for j in range(0, 13):
                locals()[f'df_{i}'][f'future_{j}'] = locals()[f'df_{i}']['return'].shift(-j)
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            locals()[f'df_{i}'] = locals()[f'df_{i}'][[f'future_{k}' for k in range(0, 13)]]
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples, seq_len, num_features)
        y_df_train = np.stack(data, axis=0)
        return y_df_train

    elif test:
        dfs=[]
        for i in range(df_out['ticker'].nunique()):
            locals()[f'df_{i}'] = df_out[df_out['ticker'] == i]
            for j in range(0, 13):
                locals()[f'df_{i}'][f'future_{j}'] = locals()[f'df_{i}']['return'].shift(-j)
            locals()[f'df_{i}'].drop(locals()[f'df_{i}'].tail(12).index, inplace=True)
            locals()[f'df_{i}'] = locals()[f'df_{i}'][[f'future_{k}' for k in range(0, 13)]]
            dfs.append(locals()[f'df_{i}'])

        # 각 기업의 feature 값을 가지고 있는 2차원 행렬을 만듦
        data = []
        for df_i in dfs:
            data.append(df_i.values) # 일단 티커도 입력데이터에 포함함

        # 시계열 축을 기준으로 3차원으로 구성된 입력 데이터 생성
        # shape: (num_samples, seq_len, num_features)
        y_df_test = np.stack(data, axis=0)
        return y_df_test

In [10]:
# 2분 걸림

set_train_X = inputfor3D(df_500_train, train=True, test=False)
print(set_train_X.shape)

# set_val_X = inputfor3D(df_500_val, train=False, test=True)
# print(set_val_X.shape)

set_test_X = inputfor3D(df_500_test, train=False, test=True)
print(set_test_X.shape)

(446, 423, 21)
(446, 62, 21)


In [11]:
# 12분 걸림

set_train_y = outputfor3D(df_500_train, train=True, test=False)
print(set_train_y.shape)

# set_val_y = outputfor3D(df_500_val, train=False, test=True) 
# print(set_val_y.shape)

set_test_y = outputfor3D(df_500_test, train=False, test=True)
print(set_test_y.shape)

(446, 423, 13)
(446, 62, 13)


In [12]:
# torch에 train 데이터 올리기

# train X
train_X_t = torch.tensor(set_train_X, dtype=torch.float32)
print(train_X_t.size())

# train y
train_y_t = torch.tensor(set_train_y, dtype=torch.float32)
print(train_y_t.size())


torch.Size([446, 423, 21])
torch.Size([446, 423, 13])


In [13]:
# # torch에 val 데이터 올리기

# # train X
# val_X_t = torch.tensor(set_val_X, dtype=torch.float32)
# print(val_X_t.size())

# # train y
# val_y_t = torch.tensor(set_val_y, dtype=torch.float32)
# print(val_y_t.size())

In [14]:
# torch에 test 데이터 올리기

# train X
test_X_t = torch.tensor(set_test_X, dtype=torch.float32)
print(test_X_t.size())

# train y
test_y_t = torch.tensor(set_test_y, dtype=torch.float32)
print(test_y_t.size())

torch.Size([446, 62, 21])
torch.Size([446, 62, 13])


In [15]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        div_term = div_term.repeat(max_len, 1) # div_term을 max_len x (d_model // 2) 크기로 확장
        pos = torch.zeros(max_len, d_model) # pos 텐서 초기화
        pos[:, 0::2] = position * div_term
        pos[:, 1::2] = position * div_term[:, :d_model//2]
        pos = pos.unsqueeze(1) # pos 텐서를 max_len x 1 x d_model 크기로 변경
        self.register_buffer('pos', pos)

    def forward(self, x):
        x = x + self.pos[:x.size(0), :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward, output_size):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers)
        self.decoder = nn.Linear(d_model, output_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

# 하이퍼파라미터 설정
output_size = 13
d_model = 21 # 입력 텐서의 마지막 차원과 일치하도록 설정
nhead = 3 # d_model의 약수 중 하나
num_layers = 3
dim_feedforward = 256

# 모델 생성
model = TransformerModel(d_model, nhead, num_layers, dim_feedforward, output_size)


# 손실 함수 및 옵티마이저 설정
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    optimizer.zero_grad()
    output = model(train_X_t)
    loss = nn.MSELoss()(train_y_t, output)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/50], Loss: 0.0074
Epoch [20/50], Loss: 0.0042
Epoch [30/50], Loss: 0.0036
Epoch [40/50], Loss: 0.0033
Epoch [50/50], Loss: 0.0031


In [16]:
# 모델 저장하기
torch.save(model.state_dict(), "./trainedmodel/model_transformer.pth")

In [17]:
# 모델 불러오기
loaded_model = TransformerModel(d_model, nhead, num_layers, dim_feedforward, output_size)
loaded_model.load_state_dict(torch.load("./trainedmodel/model_transformer.pth"))

# 모델을 평가 모드로 설정하기
loaded_model.eval()

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=21, out_features=21, bias=True)
    )
    (linear1): Linear(in_features=21, out_features=256, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=256, out_features=21, bias=True)
    (norm1): LayerNorm((21,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((21,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=21, out_features=21, bias=True)
        )
        (linear1): Linear(in_featu

In [18]:
# 예측
with torch.no_grad():
    prediction = loaded_model(test_X_t)

In [19]:
def calculate_rmse(actual_values, predicted_values):
    mse = torch.mean((actual_values - predicted_values) ** 2)
    rmse = torch.sqrt(mse)
    return rmse

In [20]:
# Evaluate loaded model
with torch.no_grad():
    test_rmse = calculate_rmse(test_y_t, prediction)
    print(f"Test RMSE: {test_rmse.item():.4f}")

Test RMSE: 0.0604


In [21]:
def torch2df(input_torch):
    df_all = pd.DataFrame(columns=["ticker"] + [f"future_{i}" for i in range(13)])

    for company_idx in range(input_torch.shape[0]):
        company_data = input_torch[company_idx]
        company_df = pd.DataFrame(company_data.numpy(), columns=[f"future_{i}" for i in range(13)])
        company_df.insert(0, 'ticker', inverse_encoded_dict[company_idx])
        # company_df['ticker'] = label_encoder.inverse_transform(df_500['ticker'])
        df_all = pd.concat([df_all, company_df], axis=0, ignore_index=False)

    return df_all


In [22]:
Transformer_result_notime = torch2df(prediction)

In [None]:
# timestamp 파일 불러오기
timestamp_train = pd.read_csv('./result/timestamp_train.csv')
timestamp_test = pd.read_csv('./result/timestamp_test.csv')

# 열 삽입할 위치 지정
insert_index = 1

# 삽입할 열 선택
column_to_insert = timestamp_test['timestamp']

# 열 삽입
Transformer_result = Transformer_result_notime.copy()
Transformer_result.insert(insert_index, 'timestamp', column_to_insert)

# 결과 출력
print(Transformer_result)

In [28]:
Transformer_result.to_csv('./result/Transformer.csv', index=False)

In [29]:
# graph
Transformer = pd.read_csv('./result/Transformer.csv')
Test_y = pd.read_csv('./result/Test_y.csv')

In [30]:
def plot_rpast_rfuture(predict, test, ticker, timestamp): # timestamp의 경우 형식은 YYYY.MM.DD, ticker, timestamp는 스트링
    # 
    predict_row = predict.loc[(predict['ticker'] == ticker) & (predict['timestamp'] == timestamp)]
    predict_values = predict_row[['future_0', 'future_1', 'future_2', 'future_3', 'future_4', 'future_5', 'future_6', 'future_7', 'future_8', 'future_9', 'future_10', 'future_11', 'future_12']].values[0]

    test_future_row = test.loc[(test['ticker'] == ticker) & (test['timestamp'] == timestamp)]
    test_future_values = test_future_row[['future_0', 'future_1', 'future_2', 'future_3', 'future_4', 'future_5', 'future_6', 'future_7', 'future_8', 'future_9', 'future_10', 'future_11', 'future_12']].values[0]
    
    # row 인덱스
    index = test.loc[(test['ticker'] == ticker) & (test['timestamp'] == timestamp)].index[0]
    # make sure we don't go below 0
    if index >= 12:
        test_past_rows = test.iloc[index - 12]
    else:
        print("Not enough previous data.")
    test_past_values = test_past_rows[['future_0', 'future_1', 'future_2', 'future_3', 'future_4', 'future_5', 'future_6', 'future_7', 'future_8', 'future_9', 'future_10', 'future_11', 'future_12']].values # 수정된 부분

    # 리스트로 만들기
    predict_values_list = predict_values.tolist()
    test_future_values_list = test_future_values.tolist()
    test_past_values_list = test_past_values.tolist()
    
    # 객체 생성
    fig = go.Figure()

    # 트레이스 생성
    trace_past = go.Scatter(
        x = list(range(-12, 1)),
        y = test_past_values_list[-13:],  # 수정된 부분
        mode = 'lines',
        name = 'Past'
    )

    trace_predict = go.Scatter(
        x = list(range(0, 13)),
        y = predict_values_list,
        mode = 'lines',
        name = 'Predict'
    )

    trace_future = go.Scatter(
        x = list(range(0, 13)),
        y = test_future_values_list,
        mode = 'lines',
        name = 'Actual Future'
    )

    # 객체에 트레이스 삽입
    fig.add_trace(trace_past)
    fig.add_trace(trace_predict)
    fig.add_trace(trace_future)


    # 레이아웃
    fig.update_layout(
        title = "Past, Predicted, and Actual Future Return",
        xaxis_title = "Time",
        yaxis_title = "Return",
        legend_title = "Legend",
        font = dict(
            family = "Courier New, monospace",
            size = 18,
            color = "RebeccaPurple"
        )
    )

    # 그리기
    fig.show()

In [31]:
plot_rpast_rfuture(Transformer, Test_y, 'AAPL', '2021-11-26')

In [32]:
plot_rpast_rfuture(Transformer, Test_y, 'AMZN', '2021-11-26')