# Preparing Data

우리의 최종 목표는 주식에 대한 정보와 과거 주가를 넣으면 미래 주가가 나오는 모델을 만드는 것이다. <br>
따라서 시기가 끊기는 것은 중요하지 않으므로, NaN 값이 없는 데이터셋을 사용한다.

## Data 불러오기 (colab)

In [None]:
# Mount Google Drive at /content/drive (Colab runtime only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the raw dataset inside the mounted Drive folder.
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/dataset-evalPRPR-SPX-nonan (2).csv'
df = pd.read_csv(DATASET_CSV)

## Data 불러오기 (vscode)

# Data Preprocessing

In [None]:
# Keep an untouched copy of the freshly loaded frame; it is passed to the
# plotting functions later on as `df_`.
df_ = df.copy()

## 모든 timestamp 정보를 가진 기업만 추리기

In [None]:
# Only companies that have a row for every one of the 509 timestamps are kept.
REQUIRED_TIMESTAMPS = 509

# Count the rows of each ticker once, instead of re-scanning the whole frame
# for every ticker.
rows_per_ticker = df['ticker'].value_counts(sort=False)

# Tickers that do not have exactly REQUIRED_TIMESTAMPS rows have to be removed.
delete_ticker_list = rows_per_ticker[rows_per_ticker != REQUIRED_TIMESTAMPS].index.tolist()

# One boolean mask replaces one full-frame filter per removed ticker.
df = df[~df['ticker'].isin(delete_ticker_list)]

df.columns

Index(['ticker', 'company', 'timestamp', 'financial stability and liquidity',
       'strong management team', 'competitive advantage', 'market potential',
       'growth prospects', 'diversification within the company',
       'sustainable business model', 'innovation and R&D',
       'corporate governance', 'strong brand recognition', 'open', 'high',
       'low', 'close', 'volume', 'return', 'SPX', 'calculated_price', 'P',
       'P_future', 'R', 'R_future'],
      dtype='object')

## 정규화

In [None]:
# Normalization plan (as written in the original notes):
#  - GPT scores share one 0-100 scale across companies -> scale over all rows together
#  - open / high / low / close are price levels that differ per company -> scale per ticker
#  - SPX is identical for every company -> scale over all rows together
#  - volume differs per company, but scaling over all rows is considered appropriate
#  - return is considered not to need normalization
#  - MinMaxScaler is the only scaler used, for convenience
# NOTE(review): the positional selection below actually picks the ten score
# columns plus 'calculated_price' and 'return' (see the printed list), not
# 'SPX' / 'volume' as the plan says -- confirm which one is intended.
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
unique_tickers = df['ticker'].unique()

columns_lists = df.columns.to_list()
excluded_columns = ['company', 'P', 'P_future', 'R', 'R_future']
columns_list = [name for name in columns_lists if name not in excluded_columns]
print(columns_list)
total_list = columns_list[2:12] + [columns_list[-1], columns_list[-3]]
print(total_list)
seperate_list = columns_list[12:16]
print(seperate_list)

# Global scaling: every listed column is min-max scaled over all rows.
for column in total_list:
    df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1))

# Per-company scaling: each price column is min-max scaled within one ticker.
for ticker in unique_tickers:
    mask = (df['ticker'] == ticker)
    for column in seperate_list:
        df.loc[mask, column] = scaler.fit_transform(df.loc[mask, column].values.reshape(-1, 1))

['ticker', 'timestamp', 'financial stability and liquidity', 'strong management team', 'competitive advantage', 'market potential', 'growth prospects', 'diversification within the company', 'sustainable business model', 'innovation and R&D', 'corporate governance', 'strong brand recognition', 'open', 'high', 'low', 'close', 'volume', 'return', 'SPX', 'calculated_price']
['financial stability and liquidity', 'strong management team', 'competitive advantage', 'market potential', 'growth prospects', 'diversification within the company', 'sustainable business model', 'innovation and R&D', 'corporate governance', 'strong brand recognition', 'calculated_price', 'return']
['open', 'high', 'low', 'close']


## sequential 정보 처리

### 'timestamp' => 'year', 'month', 'day'

In [None]:
# Parse the 'timestamp' column into datetime values.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Expose the calendar components as their own 'year', 'month', 'day' columns.
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['timestamp'].dt, part)

df.columns

Index(['ticker', 'company', 'timestamp', 'financial stability and liquidity',
       'strong management team', 'competitive advantage', 'market potential',
       'growth prospects', 'diversification within the company',
       'sustainable business model', 'innovation and R&D',
       'corporate governance', 'strong brand recognition', 'open', 'high',
       'low', 'close', 'volume', 'return', 'SPX', 'calculated_price', 'P',
       'P_future', 'R', 'R_future', 'year', 'month', 'day'],
      dtype='object')

### P, R to each columns

In [None]:
# P, P_future, R and R_future store their values as strings; parse every cell
# back into a Python object (a list, per the original note).

import ast

for list_column in ('P', 'P_future', 'R', 'R_future'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

In [None]:
# Build the column names used for the sliding window: offsets t-12 ... t+13
# for both the (log) price series 'logP' and the return series 'R'.
l = 13

lst = list(range(-l + 1, l + 1))
# "_t" for the current step, otherwise "_t-k" / "_t+k" (the sign is always written).
suffixes = ["_t" if offset == 0 else f"_t{offset:+d}" for offset in lst]
Pcol = ["logP" + suffix for suffix in suffixes]
Rcol = ["R" + suffix for suffix in suffixes]

print(Pcol)
print(Rcol)

['logP_t-12', 'logP_t-11', 'logP_t-10', 'logP_t-9', 'logP_t-8', 'logP_t-7', 'logP_t-6', 'logP_t-5', 'logP_t-4', 'logP_t-3', 'logP_t-2', 'logP_t-1', 'logP_t', 'logP_t+1', 'logP_t+2', 'logP_t+3', 'logP_t+4', 'logP_t+5', 'logP_t+6', 'logP_t+7', 'logP_t+8', 'logP_t+9', 'logP_t+10', 'logP_t+11', 'logP_t+12', 'logP_t+13']
['R_t-12', 'R_t-11', 'R_t-10', 'R_t-9', 'R_t-8', 'R_t-7', 'R_t-6', 'R_t-5', 'R_t-4', 'R_t-3', 'R_t-2', 'R_t-1', 'R_t', 'R_t+1', 'R_t+2', 'R_t+3', 'R_t+4', 'R_t+5', 'R_t+6', 'R_t+7', 'R_t+8', 'R_t+9', 'R_t+10', 'R_t+11', 'R_t+12', 'R_t+13']


In [None]:
# Spread the parsed P / P_future / R / R_future lists over one column per time step.
expansions = [
    (Pcol[:l], 'P'),
    (Pcol[l:], 'P_future'),
    (Rcol[:l], 'R'),
    (Rcol[l:], 'R_future'),
]
for target_columns, source_column in expansions:
    df[target_columns] = pd.DataFrame(df[source_column].tolist(), index=df.index)

# Remove the t+13 columns together with the original list columns.
df = df.drop(columns=[f"logP_t+{l}", f"R_t+{l}", 'P', 'P_future', 'R', 'R_future'])

df.columns

Index(['ticker', 'company', 'timestamp', 'financial stability and liquidity',
       'strong management team', 'competitive advantage', 'market potential',
       'growth prospects', 'diversification within the company',
       'sustainable business model', 'innovation and R&D',
       'corporate governance', 'strong brand recognition', 'open', 'high',
       'low', 'close', 'volume', 'return', 'SPX', 'calculated_price', 'year',
       'month', 'day', 'logP_t-12', 'logP_t-11', 'logP_t-10', 'logP_t-9',
       'logP_t-8', 'logP_t-7', 'logP_t-6', 'logP_t-5', 'logP_t-4', 'logP_t-3',
       'logP_t-2', 'logP_t-1', 'logP_t', 'logP_t+1', 'logP_t+2', 'logP_t+3',
       'logP_t+4', 'logP_t+5', 'logP_t+6', 'logP_t+7', 'logP_t+8', 'logP_t+9',
       'logP_t+10', 'logP_t+11', 'logP_t+12', 'R_t-12', 'R_t-11', 'R_t-10',
       'R_t-9', 'R_t-8', 'R_t-7', 'R_t-6', 'R_t-5', 'R_t-4', 'R_t-3', 'R_t-2',
       'R_t-1', 'R_t', 'R_t+1', 'R_t+2', 'R_t+3', 'R_t+4', 'R_t+5', 'R_t+6',
       'R_t+7', 'R_t+8', 'R_t+9', 'R_t+10', 'R_t+11', 'R_t+12'],
      dtype='object')

## log

In [None]:
import math

# Natural log of every price column (the ones whose name starts with 'log').
log_columns = [name for name in df.columns if name.startswith('log')]
for name in log_columns:
    df[name] = df[name].apply(math.log)

# Natural log of the trading volume as well.
df['volume'] = df['volume'].apply(math.log)

df['volume']

0         16.592091
1         16.492710
2         16.467646
3         16.422679
4         16.180366
            ...    
241293    15.414931
241294    15.755511
241295    16.350662
241296    16.443934
241297    16.143133
Name: volume, Length: 227014, dtype: float64

## categorical 변수 처리

In [None]:
# categorical 변수를 int형으로 변경하는 class define

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

class MultiColLabelEncoder:
    """Label-encode several DataFrame columns, keeping one encoder per column.

    ``encoder_dict`` maps a column name to the ``LabelEncoder`` used for it,
    which is what allows ``inverse_transform`` to decode that column later.
    """

    def __init__(self):
        # A fresh LabelEncoder is created the first time a column name is looked up.
        self.encoder_dict = defaultdict(LabelEncoder)

    def fit_transform(self, X: pd.DataFrame, columns: list):
        """Return a copy of ``X`` with ``columns`` replaced by their encoded labels.

        ``columns`` may be a list of column names or a single column name.
        """
        selected = columns if isinstance(columns, list) else [columns]

        def _encode(series):
            return self.encoder_dict[series.name].fit_transform(series)

        encoded = X.copy()
        encoded[selected] = X[selected].apply(_encode)
        return encoded

    def inverse_transform(self, X: pd.DataFrame, columns: list):
        """Return a copy of ``X`` with ``columns`` decoded back to the original labels.

        Raises ``KeyError`` when one of the columns has no stored encoder; a
        ``ValueError`` from the underlying encoder is reported and re-raised.
        """
        selected = columns if isinstance(columns, list) else [columns]

        if any(name not in self.encoder_dict for name in selected):
            raise KeyError(f"at least one of {selected} is not encoded before")

        def _decode(series):
            return self.encoder_dict[series.name].inverse_transform(series)

        decoded = X.copy()
        try:
            decoded[selected] = X[selected].apply(_decode)
        except ValueError:
            print(f"Need assignment for 'fit_transform' function")
            raise

        return decoded

In [None]:
# Find out which columns are categorical (object / category dtype).
features = df.columns
cat_cols = df.select_dtypes(include=['category', 'object']).columns.tolist()
cat_cols

['ticker', 'company']

In [None]:
# Label-encode the categorical columns with the MultiColLabelEncoder defined above.
mcle = MultiColLabelEncoder()

# 'ticker' and 'company' carry the same information, so 'company' is dropped
# right after encoding.
df_mcle = mcle.fit_transform(df, cat_cols).drop(columns='company')

df_mcle['ticker'].unique()

## Train-Validation-Test set split

# timestamp로 sort
# Order the rows chronologically, and by ticker within each timestamp.
df_mcle = df_mcle.sort_values(['timestamp', 'ticker'])
df_mcle

In [None]:
# Chronological split at 2020-09-01 and 2021-09-01.
# Each boundary date belongs to the later split (>=), so a row stamped exactly
# on a boundary is no longer silently left out of all three sets.
VAL_START = '2020-09-01'
TEST_START = '2021-09-01'

train_set = df_mcle[df_mcle['timestamp'] < VAL_START]
val_set = df_mcle[(df_mcle['timestamp'] >= VAL_START) & (df_mcle['timestamp'] < TEST_START)]
test_set = df_mcle[df_mcle['timestamp'] >= TEST_START]

In [None]:
# Row counts, printed in the order train, test, validation.
for split in (train_set, test_set, val_set):
    print(len(split))

170818
33004
23192


In [None]:
# The targets of a row reach up to 12 steps ahead, so the last 12 time steps of
# a split would look into the following split. Remove those time steps for
# every ticker. (Slicing off the last 12 *rows* only removed 12 ticker rows of
# the final timestamp, because the frame is sorted by timestamp, then ticker.)
FUTURE_STEPS = 12


def _drop_last_timestamps(frame, steps=FUTURE_STEPS):
    """Return `frame` without the rows of its `steps` most recent timestamps."""
    newest = frame['timestamp'].drop_duplicates().sort_values().tail(steps)
    return frame[~frame['timestamp'].isin(newest)]


train_set = _drop_last_timestamps(train_set)
val_set = _drop_last_timestamps(val_set)
test_set = _drop_last_timestamps(test_set)

In [None]:
# Per the original note TabNet cannot use the 'timestamp' column, so it is
# removed from every split.
train_set, val_set, test_set = (
    split.drop(columns='timestamp') for split in (train_set, val_set, test_set)
)

## X, y split

In [None]:
# Pick the model inputs / targets by column position (after 'timestamp' was dropped):
#   0-10   ticker and the ten GPT score columns
#   11-21  open ... calculated_price, then year / month / day
#   22-46  logP_t-12 ... logP_t+12
#   47-59  R_t-12 ... R_t      (inputs)
#   59-71  R_t ... R_t+12      (targets; R_t is part of both ranges)

# pX_train = train_set.iloc[:, list(range(36))]
# py_train = train_set.iloc[:, list(range(35, 48))]
# pX_test = test_set.iloc[:, list(range(36))]
# py_test = test_set.iloc[:, list(range(35, 48))]

feature_positions = list(range(22)) + list(range(47, 60))
target_positions = list(range(59, 72))

rX_train, ry_train = train_set.iloc[:, feature_positions], train_set.iloc[:, target_positions]
rX_val, ry_val = val_set.iloc[:, feature_positions], val_set.iloc[:, target_positions]
rX_test, ry_test = test_set.iloc[:, feature_positions], test_set.iloc[:, target_positions]

ry_test.columns

Index(['R_t', 'R_t+1', 'R_t+2', 'R_t+3', 'R_t+4', 'R_t+5', 'R_t+6', 'R_t+7',
       'R_t+8', 'R_t+9', 'R_t+10', 'R_t+11', 'R_t+12'],
      dtype='object')

In [None]:
# Keep the first 12 of the 13 target columns (R_t ... R_t+11, see the output below).
ry = ry_test.iloc[:, -13:-1]
ry.columns

Index(['R_t', 'R_t+1', 'R_t+2', 'R_t+3', 'R_t+4', 'R_t+5', 'R_t+6', 'R_t+7',
       'R_t+8', 'R_t+9', 'R_t+10', 'R_t+11'],
      dtype='object')

In [None]:
# Final run, used once all hyper-parameters are fixed: everything before
# 2021-09-01 is training data, everything after it is test data.
# NOTE(review): a row stamped exactly 2021-09-01 falls in neither set, and
# iloc[:-12] removes 12 rows rather than 12 time steps -- both are kept as they
# were because RtoP filters with the same strict '>' and aligns rows by position.
final_cutoff = '2021-09-01'
final_train = df_mcle[df_mcle['timestamp'] < final_cutoff]
final_test = df_mcle[df_mcle['timestamp'] > final_cutoff]

final_train = final_train.iloc[:-12, :].drop(labels='timestamp', axis=1)
final_test = final_test.iloc[:-12, :].drop(labels='timestamp', axis=1)

# Same column positions as the train / validation / test split:
# features + R_t-12 ... R_t as inputs, R_t ... R_t+12 as targets.
input_positions = list(range(22)) + list(range(47, 60))
output_positions = list(range(59, 72))

final_train_X = final_train.iloc[:, input_positions]
final_train_y = final_train.iloc[:, output_positions]
final_test_X = final_test.iloc[:, input_positions]
final_test_y = final_test.iloc[:, output_positions]

In [None]:
# Display the last 13 input columns (R_t-12 ... R_t in the output below).
final_test_X.iloc[:, -13:]

Unnamed: 0,R_t-12,R_t-11,R_t-10,R_t-9,R_t-8,R_t-7,R_t-6,R_t-5,R_t-4,R_t-3,R_t-2,R_t-1,R_t
435,-0.015724,-0.055273,0.016441,0.026393,0.005553,-0.008110,0.005618,-0.012719,0.000960,0.012366,-0.032007,0.004584,-0.003384
436,-0.055273,0.016441,0.026393,0.005553,-0.008110,0.005618,-0.012719,0.000960,0.012366,-0.032007,0.004584,-0.003384,-0.050620
437,0.016441,0.026393,0.005553,-0.008110,0.005618,-0.012719,0.000960,0.012366,-0.032007,0.004584,-0.003384,-0.050620,-0.016581
438,0.026393,0.005553,-0.008110,0.005618,-0.012719,0.000960,0.012366,-0.032007,0.004584,-0.003384,-0.050620,-0.016581,-0.002479
439,0.005553,-0.008110,0.005618,-0.012719,0.000960,0.012366,-0.032007,0.004584,-0.003384,-0.050620,-0.016581,-0.002479,-0.024083
...,...,...,...,...,...,...,...,...,...,...,...,...,...
241281,-0.019328,0.020855,0.024526,-0.043495,0.006071,-0.035009,-0.056807,-0.018200,0.046694,-0.042663,-0.046090,-0.011729,-0.006204
241282,0.020855,0.024526,-0.043495,0.006071,-0.035009,-0.056807,-0.018200,0.046694,-0.042663,-0.046090,-0.011729,-0.006204,-0.013368
241283,0.024526,-0.043495,0.006071,-0.035009,-0.056807,-0.018200,0.046694,-0.042663,-0.046090,-0.011729,-0.006204,-0.013368,0.012448
241284,-0.043495,0.006071,-0.035009,-0.056807,-0.018200,0.046694,-0.042663,-0.046090,-0.011729,-0.006204,-0.013368,0.012448,0.041234


In [None]:
# Display the 13 target columns (R_t ... R_t+12 in the output below).
final_test_y.iloc[:, -13:]

Unnamed: 0,R_t,R_t+1,R_t+2,R_t+3,R_t+4,R_t+5,R_t+6,R_t+7,R_t+8,R_t+9,R_t+10,R_t+11,R_t+12
435,-0.003384,-0.050620,-0.016581,-0.002479,-0.024083,0.001528,0.028200,-0.006266,-0.011726,0.017461,0.010231,-0.024121,-0.020700
436,-0.050620,-0.016581,-0.002479,-0.024083,0.001528,0.028200,-0.006266,-0.011726,0.017461,0.010231,-0.024121,-0.020700,-0.016693
437,-0.016581,-0.002479,-0.024083,0.001528,0.028200,-0.006266,-0.011726,0.017461,0.010231,-0.024121,-0.020700,-0.016693,0.026131
438,-0.002479,-0.024083,0.001528,0.028200,-0.006266,-0.011726,0.017461,0.010231,-0.024121,-0.020700,-0.016693,0.026131,-0.013269
439,-0.024083,0.001528,0.028200,-0.006266,-0.011726,0.017461,0.010231,-0.024121,-0.020700,-0.016693,0.026131,-0.013269,0.001259
...,...,...,...,...,...,...,...,...,...,...,...,...,...
241281,-0.006204,-0.013368,0.012448,0.041234,-0.127936,0.111319,-0.019589,0.030624,0.048767,-0.025600,-0.058218,0.008999,0.005420
241282,-0.013368,0.012448,0.041234,-0.127936,0.111319,-0.019589,0.030624,0.048767,-0.025600,-0.058218,0.008999,0.005420,0.007438
241283,0.012448,0.041234,-0.127936,0.111319,-0.019589,0.030624,0.048767,-0.025600,-0.058218,0.008999,0.005420,0.007438,0.089949
241284,0.041234,-0.127936,0.111319,-0.019589,0.030624,0.048767,-0.025600,-0.058218,0.008999,0.005420,0.007438,0.089949,0.017959


# Model Running

## Import, Install

In [None]:
import torch
!pip install pytorch-tabnet
!pip install torch --upgrade

## Get Model

In [None]:
import pickle

# Load the saved model.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load 'TabNetmodel.pkl' when it comes from a trusted source.
with open('TabNetmodel.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Predict on the final test inputs, handed over as a plain array.
y_pred = model.predict(final_test_X.to_numpy())

# Graph

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import os
# !pip install -q orca==1.8
!pip install -U kaleido
import plotly.io as pio
pio.renderers.default = 'colab'

In [None]:
### 세번째함수 ver2
### 기업의 ticker를 넣은 리스트를 만들어서 for문 돌리기 용
import numpy as np

def drawRGraph(X_test:pd.DataFrame, y_test_df:pd.DataFrame, y_pred:np.ndarray, df:pd.DataFrame,
               out_dir:str='/content/drive/MyDrive/graphR2'):
  """Plot, per company, the averaged past, true future and predicted returns.

  Every row holds a window of `l` consecutive returns (`l` is the module-level
  window length). The overlapping windows of one ticker are averaged position
  by position into one curve per series, and one PNG per ticker is written to
  `out_dir`.

  Args:
    X_test: model inputs; must contain 'ticker', 'year', 'month', 'day' and
      have the past-return window as its last `l` columns.
    y_test_df: true targets; its last `l` columns are plotted.
    y_pred: 2-D predictions, one row per row of X_test, in the same row order.
    df: unused; kept so that existing calls keep working.
    out_dir: directory for the PNG files; created when missing.

  Prints every ticker whose averaged prediction curve has a standard deviation
  of at least 0.002 and, at the end, the share of tickers below that value.
  Relies on the module-level names `mcle`, `l`, `go`, `make_subplots`, `os`.
  """
  window = l  # window length shared with the preprocessing cells

  def _overlap_mean(frame, n_points):
    """Average overlapping windows: row i, column j contributes to position i + j."""
    values = frame.to_numpy(dtype=float)
    totals = np.zeros(max(n_points, 0))
    # As before, the last `window` rows of a ticker are not accumulated.
    for i in range(len(values) - window):
      totals[i:i + window] += values[i, :window]
    positions = np.arange(len(totals))
    counts = np.full(len(totals), window, dtype=float)
    tail = positions > len(totals) - window
    counts[tail] = len(totals) - positions[tail]
    head = positions < window
    counts[head] = positions[head] + 1  # the head rule is checked first, so it wins
    return (totals / counts).tolist()

  # Decoded ticker name -> encoded ticker value. A dict lookup replaces one
  # list.index() scan per ticker and gives a stable first-seen plotting order.
  names = mcle.inverse_transform(X_test, 'ticker')['ticker'].tolist()
  code_by_name = dict(zip(names, X_test['ticker'].tolist()))

  # Rebuild the 'timestamp' column from the year / month / day feature columns.
  date_parts = X_test[['year', 'month', 'day']].astype(str)
  timestamp = pd.to_datetime(
      date_parts['year'] + '-' + date_parts['month'] + '-' + date_parts['day']
  ).rename('timestamp')

  # Attach timestamp (and ticker) so every frame can be filtered and sorted.
  X_test_graph = pd.concat([X_test, timestamp], axis=1)
  y_test_graph = pd.concat([y_test_df, timestamp, X_test['ticker']], axis=1)

  pred_values = np.asarray(y_pred)
  y_pred_graph = pd.DataFrame(pred_values, columns=[f"R{s}" for s in range(pred_values.shape[1])])
  # Predictions carry no index of their own, so they are matched by position.
  y_pred_graph['timestamp'] = timestamp.to_numpy()
  y_pred_graph['ticker'] = X_test['ticker'].to_numpy()

  # Number of distinct time steps (x axis of the graph).
  n_x_times = len(X_test_graph['timestamp'].unique())
  n_y_times = len(y_test_graph['timestamp'].unique())

  if not code_by_name:
    # Nothing to draw; avoids dividing by zero in the summary below.
    print("No tickers to plot.")
    return

  os.makedirs(out_dir, exist_ok=True)

  # Count the tickers whose prediction curve is (almost) flat.
  bad_prediction = 0
  for t, code in code_by_name.items():
    # Rows of this ticker in chronological order.
    X_temp = X_test_graph[X_test_graph['ticker'] == code].sort_values(by='timestamp')
    y_temp = y_test_graph[y_test_graph['ticker'] == code].sort_values(by='timestamp')
    pred_temp = y_pred_graph[y_pred_graph['ticker'] == code].sort_values(by='timestamp')

    # Only the return window is plotted: the columns right before the appended
    # 'timestamp' (X) or 'timestamp', 'ticker' (y, pred) columns.
    Xlist_real = _overlap_mean(X_temp.iloc[:, -(window + 1):-1], n_x_times - 1)
    ylist_real = _overlap_mean(y_temp.iloc[:, -(window + 2):-2], n_y_times - 1)
    predlist_real = _overlap_mean(pred_temp.iloc[:, -(window + 2):-2], n_y_times - 1)

    if np.std(predlist_real) < 0.002:
      bad_prediction += 1
    else:
      print("good prediction company", t)

    # Draw the three curves on one axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.update_layout(title_text=t, title_font_size=20)

    Xrange = list(range(n_x_times))
    # The target curve starts where the input window ends.
    yrange = [i + window - 1 for i in range(n_y_times)]

    traces = (
        (Xrange, Xlist_real, "X_test"),
        (yrange, ylist_real, "y_test"),
        (yrange, predlist_real, "y_pred"),
    )
    for xs, ys, label in traces:
      fig.add_trace(go.Scatter(x=xs, y=ys, name=label), secondary_y=False)

    fig.write_image(os.path.join(out_dir, f"{t}.png"), engine='kaleido')

  print("std 0.002 이하 비율")
  print(bad_prediction/len(code_by_name))

In [None]:
# Draw the return graphs for the final test set; `df_` is the frame as loaded.
drawRGraph(final_test_X, final_test_y, y_pred, df_)

# Graph about P

In [None]:
### 함수 완성 1번
def RfromDF(df:pd.DataFrame, l:int=13) -> pd.DataFrame:
    """Expand the string-encoded 'R' / 'R_future' lists into one column per step.

    This gives the un-normalized returns in the same column layout that the
    preprocessing builds (R_t-12 ... R_t+12 for the default window).

    Args:
        df: frame with string columns 'R' and 'R_future', each cell holding a
            Python-literal list of `l` values.
        l: window length; 'R' maps to R_t-(l-1) ... R_t and 'R_future' to
            R_t+1 ... R_t+l.

    Returns:
        A new frame indexed like `df` with columns R_t-(l-1) ... R_t+(l-1);
        the last future step R_t+l is dropped.

    Raises:
        ValueError: if a list does not hold exactly `l` values.
    """
    import ast

    Rcol = ["R_t" if k == 0 else f"R_t{k:+d}" for k in range(-l + 1, l + 1)]

    if df.empty:
        # No rows to parse: return the expected layout without any data.
        return pd.DataFrame(columns=Rcol[:-1], index=df.index)

    # Build two fresh frames instead of assigning into a column slice of `df`.
    past = pd.DataFrame(df['R'].apply(ast.literal_eval).tolist(), index=df.index)
    future = pd.DataFrame(df['R_future'].apply(ast.literal_eval).tolist(), index=df.index)
    past.columns = Rcol[:l]
    future.columns = Rcol[l:]

    R = pd.concat([past, future], axis=1)
    return R.drop(columns=f"R_t+{l}")

In [None]:
### 두번째함수 완성
def RtoP(X_test:pd.DataFrame, y_test_df:pd.DataFrame, y_pred_df:pd.DataFrame, df:pd.DataFrame,
         test_start:str='2021-09-01'):
    """Append reconstructed price paths to the test inputs, targets and predictions.

    Pass the dataframe exactly as it was right after loading (before
    normalization) as `df`: the raw 'close' price and the raw 'R' / 'R_future'
    return lists are read from it.

    Prices are rebuilt from the close price P_t with P_k = P_{k-1} * (1 + R_k):
      forward  : P_{t+j} = P_{t+j-1} * (1 + R_{t+j})    for j = 1 .. 12
      backward : P_{t-j} = P_{t-j+1} / (1 + R_{t-j+1})  for j = 1 .. 12
    The earlier version inserted the backward prices before the last list
    element and always divided P_t itself, so the P_t-k columns were neither in
    chronological order nor compounded.

    Args:
        X_test: test inputs; only its row count and columns are used.
        y_test_df: true targets.
        y_pred_df: predictions; columns at positions 1 .. 12 are read as the
            predicted returns R_t+1 .. R_t+12.
        df: raw frame with 'timestamp', 'close', 'R' and 'R_future'.
        test_start: rows with timestamp strictly after this date form the test
            range; change it whenever the test split changes.

    Returns:
        (X_test, y_test_df, y_pred_df), each with a reset index and extra
        columns: X_test gets P_t-12 .. P_t, y_test_df gets the true
        P_t .. P_t+12 and y_pred_df gets P_t .. P_t+12 compounded from the
        predicted returns.

    Raises:
        IndexError: if `df` (after the date filter) or `y_pred_df` has fewer
            rows than X_test.
    """
    l = 13
    n_rows = len(X_test)

    # Same date range as the test split.
    # NOTE(review): rows are matched purely by position, so `data` must be in
    # the same row order as X_test -- confirm against the caller.
    data = df[df['timestamp'] > test_start]
    if len(data) < n_rows or len(y_pred_df) < n_rows:
        raise IndexError(
            f"need {n_rows} rows, got {len(data)} raw rows and {len(y_pred_df)} prediction rows"
        )

    # Raw returns, one column per step, restricted to the rows that are used.
    R = RfromDF(data).iloc[:n_rows]
    close = data['close'].to_numpy(dtype=float)[:n_rows]

    future_returns = R[[f'R_t+{j}' for j in range(1, l)]].to_numpy(dtype=float)
    # Ordered from the most recent step backwards: R_t, R_t-1, ..., R_t-11.
    past_returns = R[['R_t'] + [f'R_t-{j}' for j in range(1, l - 1)]].to_numpy(dtype=float)
    predicted_returns = y_pred_df.iloc[:n_rows, 1:l].to_numpy(dtype=float)

    # Forward: a running product along each row, starting at P_t.
    true_path = np.cumprod(np.column_stack([close, 1.0 + future_returns]), axis=1)
    pred_path = np.cumprod(np.column_stack([close, 1.0 + predicted_returns]), axis=1)

    # Backward: divide step by step and store the oldest price first.
    past_path = np.empty((n_rows, l - 1))
    price = close
    for k in range(l - 1):
        price = price / (1.0 + past_returns[:, k])
        past_path[:, l - 2 - k] = price

    # Column names P_t-12 ... P_t ... P_t+12.
    Pcol = ["P_t" if k == 0 else f"P_t{k:+d}" for k in range(-l + 1, l)]

    P = pd.DataFrame(np.hstack([past_path, true_path]), columns=Pcol)
    P_pred = pd.DataFrame(pred_path, columns=Pcol[l-1:])

    X_test = pd.concat([X_test.reset_index(drop=True), P.iloc[:, :l]], axis=1)
    y_test_df = pd.concat([y_test_df.reset_index(drop=True), P.iloc[:, l-1:]], axis=1)

    y_pred_df = pd.concat([y_pred_df.reset_index(drop=True), P_pred], axis=1)
    return X_test, y_test_df, y_pred_df

In [None]:
### 세번째함수
from plotly.subplots import make_subplots
import os

def drawGraphCompanyy(X_test:pd.DataFrame, y_test_df:pd.DataFrame, y_pred:np.ndarray, df: pd.DataFrame,
                      out_dir:str='/content/drive/MyDrive/graphP3'):
  """Plot, per company, the averaged past, true future and predicted prices.

  Price paths are obtained from `RtoP`. Every row then holds a window of `l`
  consecutive prices (`l` is the module-level window length); the overlapping
  windows of one ticker are averaged position by position into one curve per
  series, and one PNG per ticker is written to `out_dir`.

  Args:
    X_test: model inputs; must contain 'ticker', 'year', 'month', 'day'.
    y_test_df: true targets.
    y_pred: 2-D predictions, one row per row of X_test, in the same row order.
    df: the frame as loaded, handed on to `RtoP`.
    out_dir: directory for the PNG files; created when missing.

  Relies on the module-level names `mcle`, `l`, `RtoP`, `go`, `make_subplots`, `os`.
  """
  window = l  # window length shared with the preprocessing cells

  def _overlap_mean(frame, n_points):
    """Average overlapping windows: row i, column j contributes to position i + j."""
    values = frame.to_numpy(dtype=float)
    totals = np.zeros(max(n_points, 0))
    # As before, the last `window` rows of a ticker are not accumulated.
    for i in range(len(values) - window):
      totals[i:i + window] += values[i, :window]
    positions = np.arange(len(totals))
    counts = np.full(len(totals), window, dtype=float)
    tail = positions > len(totals) - window
    counts[tail] = len(totals) - positions[tail]
    head = positions < window
    counts[head] = positions[head] + 1  # the head rule is checked first, so it wins
    return (totals / counts).tolist()

  # Decoded ticker name -> encoded ticker value. A dict lookup replaces one
  # list.index() scan per ticker and gives a stable first-seen plotting order.
  names = mcle.inverse_transform(X_test, 'ticker')['ticker'].tolist()
  code_by_name = dict(zip(names, X_test['ticker'].tolist()))

  # Rebuild the 'timestamp' column from the year / month / day feature columns.
  date_parts = X_test[['year', 'month', 'day']].astype(str)
  timestamp = pd.to_datetime(
      date_parts['year'] + '-' + date_parts['month'] + '-' + date_parts['day']
  ).rename('timestamp')

  # Attach timestamp (and ticker) so every frame can be filtered and sorted.
  X_test_graph = pd.concat([X_test, timestamp], axis=1)
  y_test_graph = pd.concat([y_test_df, timestamp, X_test['ticker']], axis=1)

  pred_values = np.asarray(y_pred)
  y_pred_graph = pd.DataFrame(pred_values, columns=[f"P{s}" for s in range(pred_values.shape[1])])
  # Predictions carry no index of their own, so they are matched by position.
  y_pred_graph['timestamp'] = timestamp.to_numpy()
  y_pred_graph['ticker'] = X_test['ticker'].to_numpy()

  # Number of distinct time steps (x axis of the graph).
  n_x_times = len(X_test_graph['timestamp'].unique())
  n_y_times = len(y_test_graph['timestamp'].unique())

  # Append the price columns to the inputs, targets and predictions.
  X_test_graph, y_test_graph, y_pred_graph = RtoP(X_test_graph, y_test_graph, y_pred_graph, df)

  if not code_by_name:
    # Nothing to draw.
    print("No tickers to plot.")
    return

  os.makedirs(out_dir, exist_ok=True)

  for t, code in code_by_name.items():
    # Rows of this ticker in chronological order.
    X_temp = X_test_graph[X_test_graph['ticker'] == code].sort_values(by='timestamp')
    y_temp = y_test_graph[y_test_graph['ticker'] == code].sort_values(by='timestamp')
    pred_temp = y_pred_graph[y_pred_graph['ticker'] == code].sort_values(by='timestamp')

    # The price window is the last `window` columns of each frame.
    Xlist_real = _overlap_mean(X_temp.iloc[:, -window:], n_x_times - 1)
    ylist_real = _overlap_mean(y_temp.iloc[:, -window:], n_y_times - 1)
    predlist_real = _overlap_mean(pred_temp.iloc[:, -window:], n_y_times - 1)

    # Draw the three curves on one axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.update_layout(title_text=t, title_font_size=20)

    Xrange = list(range(n_x_times))
    # The target curve starts where the input window ends.
    yrange = [i + window - 1 for i in range(n_y_times)]

    traces = (
        (Xrange, Xlist_real, "X_test"),
        (yrange, ylist_real, "y_test"),
        (yrange, predlist_real, "y_pred"),
    )
    for xs, ys, label in traces:
      fig.add_trace(go.Scatter(x=xs, y=ys, name=label), secondary_y=False)

    fig.write_image(os.path.join(out_dir, f"{t}.png"), engine='kaleido')

In [None]:
# Draw the price graphs for the final test set; `df_` is the frame as loaded.
drawGraphCompanyy(final_test_X, final_test_y, y_pred, df_)

 real i  0
 real i  10000
 real i  20000
 real i  30000
