# 베이스라인 모델

In [1]:
import xml.etree.ElementTree as elemTree
import os
import sys
# Parse the project configuration XML and look up its PATHS element.
tree = elemTree.parse(r'../config/.config.xml')
root = tree.getroot()
xx = root.find('./PATHS')
# Append the configured "work" directory to sys.path.
# NOTE(review): presumably this is what makes the project-local packages
# imported in the next cell (miscs, realtime_kiwoom) resolvable - confirm.
work_path = xx.find('work').text
sys.path.append(work_path)

In [80]:
import numpy
import talib
from miscs.config_manager import ConfigManager
import pandas as pd
from sqlalchemy import create_engine
from tqdm.auto import tqdm
from realtime_kiwoom.data_provider import *
import re
import numpy as np

In [3]:
# Build the project's ConfigManager from the same XML file that was parsed for the work path.
cm = ConfigManager('../config/.config.xml')

# 데이터 로딩

In [4]:
# Create the minute-chart data provider through its Factory, passing the config manager and tag='history'.
history_provider = MinuteChartDataProvider.Factory(cm, tag='history')

In [331]:
%%time
# Fetch the history for the last 365 days. Later cells use the result as a dict
# (iterated with .items(), indexed by code strings such as '069500') whose values are DataFrames.
history_minute_dic = history_provider.get_history_from_ndays_ago(n_days=365)

CPU times: total: 1min 13s
Wall time: 1min 14s


# 기술적 지표

MACD

RSI

AD Line

In [269]:
def make_basic_features(df: pd.DataFrame):
  """
  Add the basic technical-indicator columns to ``df`` in place.

  Columns written:
    - ``macd``, ``macdsignal``, ``macdhist``: talib.MACD on ``close``
      (fast 12, slow 26, signal 9)
    - ``rsi``: talib.RSI on ``close`` (period 14)
    - ``ad``: talib.AD on ``high``, ``low``, ``close``, ``volume``
    - ``offset_intra_day``: time elapsed since 09:00 of the row's own day,
      divided by 6.5 hours (presumably the length of the trading session)

  ``df`` needs a datetime index, because the index is floored to the day.
  Nothing is returned; ``df`` itself is modified.
  """
  close = df['close']

  # Compute every indicator first, then attach them in a fixed column order.
  indicators = dict(zip(
    ('macd', 'macdsignal', 'macdhist'),
    talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9),
  ))
  indicators['rsi'] = talib.RSI(close, timeperiod=14)
  indicators['ad'] = talib.AD(df['high'], df['low'], close, df['volume'])
  for name, values in indicators.items():
    df[name] = values

  # Seconds since 09:00 of the same calendar day, scaled by a 6.5 hour span.
  midnight = df.index.floor('D')
  since_nine = df.index - midnight - pd.Timedelta('9h')
  span_seconds = 60*60*6.5
  df['offset_intra_day'] = (since_nine.total_seconds()/span_seconds).values


In [270]:
def make_window_features(df: pd.DataFrame, cols=('macd', 'macdsignal', 'macdhist', 'rsi', 'ad', 'close'), window_size=10):
  """
  Add rolling-mean ratio features to ``df`` in place.

  For every name in ``cols`` a new column ``<name>_w`` is written holding
  (mean of the column over the trailing ``window_size`` rows, current row
  included) divided by the column's current value.

  Args:
    df: frame containing every column named in ``cols``; it is modified.
    cols: iterable of column names to process. The default is an immutable
      tuple so that the default value can never be shared and mutated
      between calls.
    window_size: number of rows in the rolling window.

  The first ``window_size - 1`` rows of each new column are NaN because the
  window is not yet full. Nothing is returned.
  """
  for col in cols:
    df[f'{col}_w'] = df[col].rolling(window=window_size).mean() / df[col]

In [271]:
def make_binary_dt_features(df: pd.DataFrame):
  """
  Add two boolean timestamp flags to ``df`` in place.

  Columns written:
    - ``ts_start``: True where the row's index timestamp has hour 9 and
      minute 0.
    - ``ts_end``: True where the *next* row's timestamp has hour 9 and
      minute 0, i.e. the row directly precedes a 09:00 row. The last row has
      no successor and is always False.

  The timestamps are read straight from ``df.index`` (a DatetimeIndex is
  required), so the index does not have to carry any particular name.
  Nothing is returned.
  """
  # Positional Series of the index timestamps; .dt gives vectorized access
  # to hour/minute instead of calling a Python lambda once per row.
  timestamps = pd.Series(df.index)
  at_nine = (timestamps.dt.hour == 9) & (timestamps.dt.minute == 0)

  # .values assigns by position, so the frame's own index is left untouched.
  df['ts_end'] = at_nine.shift(-1, fill_value=False).values
  df['ts_start'] = at_nine.values

In [272]:
def make_binary_close_indicators(df: pd.DataFrame):
  """
  Add ``is_higher`` to ``df`` in place: is the row's close above the final
  close of the previous day present in the data?

  The last ``close`` of every calendar day (index formatted as
  ``%Y-%m-%d``) is collected, shifted by one *day*, and broadcast back onto
  the rows of the following day. Rows of the first day have no previous day
  and are False.

  The shift has to happen on the per-day series: shifting the per-row series
  by one row would compare almost every row against the final close of its
  own day, which is information that is not yet known at that row.

  ``df`` needs a datetime index in chronological order and a ``close``
  column. Nothing is returned.
  """
  day_keys = df.index.strftime('%Y-%m-%d')

  # Final close per day; ISO-formatted day strings sort chronologically.
  last_close_by_day = pd.Series(
    {day: group['close'].iloc[-1] for day, group in df.groupby(day_keys)},
    dtype='float64',
  ).sort_index()

  # Day-level shift: each day now maps to the previous day's final close.
  prev_day_close = last_close_by_day.shift(1)

  # Comparisons against NaN (first day) evaluate to False.
  df['is_higher'] = pd.Series(day_keys.map(prev_day_close).values, index=df.index) < df.close
  

In [273]:
def make_binary_indicators(df: pd.DataFrame):
  """
  Add every boolean indicator column to ``df`` in place.

  Runs make_binary_dt_features and then make_binary_close_indicators on the
  same frame. Nothing is returned.
  """
  for add_flags in (make_binary_dt_features, make_binary_close_indicators):
    add_flags(df)

In [274]:
def make_target(df: pd.DataFrame, window_size=10):
  """
  df가 변형됨
  close의 내일 ~ window_size 까지의 가격 변화율을 target으로 함
  """
  df['target'] = df.close.rolling(window=window_size).mean().shift(-window_size) /df.close

In [290]:
# Run the whole feature pipeline on every code's DataFrame. Each step mutates
# its frame in place and only touches that frame; make_window_features reads
# columns created by make_basic_features, so the order of the steps matters.
for code, df in history_minute_dic.items():
  make_basic_features(df)
  make_window_features(df)
  make_binary_indicators(df)
  make_target(df, window_size=60)

In [285]:
# Columns kept for modelling, in the order they appear in the compact frames.
new_cols = [
  # rolling-mean ratios written by make_window_features
  'macd_w', 'macdsignal_w', 'macdhist_w', 'rsi_w', 'ad_w', 'close_w',
  # boolean flags written by make_binary_indicators
  'ts_end', 'ts_start', 'is_higher',
  # intraday offset from make_basic_features and the target from make_target
  'offset_intra_day', 'target',
]

In [291]:
# For every code keep only the engineered columns listed in new_cols.
compact_minute_dic = {code:df[new_cols] for code, df in history_minute_dic.items()}

In [293]:
# Join the two codes' feature frames on their index (pd.merge defaults to an inner join).
# Both frames have the same column names, so '069500' columns get '_x' and '114800' columns get '_y'.
merged_df = pd.merge(compact_minute_dic['069500'], compact_minute_dic['114800'], left_index=True, right_index=True, suffixes=('_x', '_y'))

In [302]:
# Three-way label per row, derived from the two targets:
#   'X'   - target_x above 1.001 while target_y is below 0.999
#   'Y'   - target_y above 1.001 while target_x is below 0.999
#   'NOP' - everything else (rows with a NaN target included, since
#           comparisons against NaN are False)
x_leads = merged_df.target_x > merged_df.target_y
y_leads = merged_df.target_x < merged_df.target_y
pick_x = x_leads & (merged_df.target_x > 1.001) & (merged_df.target_y < 0.999)
pick_y = y_leads & (merged_df.target_x < 0.999) & (merged_df.target_y > 1.001)

merged_df['label'] = 'NOP'
merged_df.loc[pick_x, 'label'] = 'X'
merged_df.loc[pick_y, 'label'] = 'Y'
# Store as a categorical so the labels can also be read as integer codes.
merged_df['label'] = merged_df['label'].astype('category')

편의를 위해 pkl 저장

In [333]:
# Save the labelled frame as a pickle file in the current working directory for convenience.
merged_df.to_pickle('.merged_for_baseline_df.pkl')

`yy.label.cat.codes`
- category 타입 컬럼(`label`)의 각 값을 정수 코드 값으로 조회할 수 있음

SKTIME

In [29]:
from sktime.datatypes import check_raise
from sktime.utils.plotting import plot_series
from sktime.transformations.panel.reduce import Tabularizer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sktime.forecasting.compose import ForecastingPipeline
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.transformations.series.impute import Imputer

In [17]:
# Instantiate sktime's Tabularizer with its default settings.
tb = Tabularizer()

In [19]:
# Take the last 50 rows of the '069500' frame as a small sample for the sktime experiments
# (this rebinds the notebook-level name df).
df = history_minute_dic['069500'][-50:]

In [31]:
df[:40]

Unnamed: 0_level_0,st_code,open,high,low,close,volume,macd,macdsignal,macdhist,rsi,ad
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-09-30 14:31:00+09:00,69500,28475,28480,28455,28455,5756,0.832432,11.307156,-10.474724,40.557855,-631995.607565
2022-09-30 14:32:00+09:00,69500,28460,28460,28445,28445,3444,-2.840058,8.477713,-11.317771,38.412153,-635439.607565
2022-09-30 14:33:00+09:00,69500,28445,28450,28410,28410,10261,-8.477023,5.086766,-13.563789,32.02587,-645700.607565
2022-09-30 14:34:00+09:00,69500,28410,28430,28400,28420,27748,-11.999123,1.669588,-13.668711,35.333925,-636451.274231
2022-09-30 14:35:00+09:00,69500,28425,28430,28395,28410,7679,-15.419582,-1.748246,-13.671336,33.574298,-637548.274231
2022-09-30 14:36:00+09:00,69500,28415,28425,28400,28400,6131,-18.721428,-5.142882,-13.578545,31.865338,-643679.274231
2022-09-30 14:37:00+09:00,69500,28400,28420,28395,28420,3761,-19.499553,-8.014216,-11.485336,38.597097,-639918.274231
2022-09-30 14:38:00+09:00,69500,28420,28430,28415,28425,2063,-19.488117,-10.308997,-9.179121,40.188106,-639230.607565
2022-09-30 14:39:00+09:00,69500,28425,28425,28405,28420,2909,-19.655931,-12.178383,-7.477548,39.097134,-637776.107565
2022-09-30 14:40:00+09:00,69500,28415,28425,28410,28415,3521,-19.962271,-13.735161,-6.22711,37.986603,-638949.774231


In [27]:
# Validate the macd/rsi slice against sktime's 'pd.DataFrame' mtype; the recorded result is True.
check_raise(df[['macd', 'rsi']], 'pd.DataFrame')

True

In [28]:
# Tabularize the 50-row macd/rsi sample; the recorded output is a single row with columns 0..99.
tb.fit_transform(df[['macd', 'rsi']])

  for _label, _series in multi_ind_dataframe.iteritems():  # noqa


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.832432,-2.840058,-8.477023,-11.999123,-15.419582,-18.721428,-19.499553,-19.488117,-19.655931,-19.962271,...,42.30603,36.223119,31.366249,35.987803,33.554555,32.001131,32.001131,28.213984,34.855945,45.680851


In [30]:
# sktime Imputer configured with method="mean".
im = Imputer(method="mean")

In [None]:
history_minute_dic['069500'][:60]

In [34]:
# Impute the macd/rsi columns of the first 60 rows; in the recorded output the leading rows
# all carry one identical value per column.
im.fit_transform(history_minute_dic['069500'][:60][['macd', 'rsi']])

Unnamed: 0_level_0,macd,rsi
dt,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-08-04 09:00:00+09:00,-0.458191,46.226163
2022-08-04 09:01:00+09:00,-0.458191,46.226163
2022-08-04 09:02:00+09:00,-0.458191,46.226163
2022-08-04 09:03:00+09:00,-0.458191,46.226163
2022-08-04 09:04:00+09:00,-0.458191,46.226163
2022-08-04 09:05:00+09:00,-0.458191,46.226163
2022-08-04 09:06:00+09:00,-0.458191,46.226163
2022-08-04 09:07:00+09:00,-0.458191,46.226163
2022-08-04 09:08:00+09:00,-0.458191,46.226163
2022-08-04 09:09:00+09:00,-0.458191,46.226163


In [35]:
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sklearn.preprocessing import MinMaxScaler
from sktime.datasets import load_airline
# Load sktime's airline demo series and rescale it with scikit-learn's MinMaxScaler wrapped in
# TabularToSeriesAdaptor; the recorded y_hat values lie between 0 and 1.
y = load_airline()
transformer = TabularToSeriesAdaptor(MinMaxScaler())
y_hat = transformer.fit_transform(y)

In [36]:
y

Period
1949-01    112.0
1949-02    118.0
1949-03    132.0
1949-04    129.0
1949-05    121.0
           ...  
1960-08    606.0
1960-09    508.0
1960-10    461.0
1960-11    390.0
1960-12    432.0
Freq: M, Name: Number of airline passengers, Length: 144, dtype: float64

In [37]:
y_hat

Period
1949-01    0.015444
1949-02    0.027027
1949-03    0.054054
1949-04    0.048263
1949-05    0.032819
             ...   
1960-08    0.969112
1960-09    0.779923
1960-10    0.689189
1960-11    0.552124
1960-12    0.633205
Freq: M, Length: 144, dtype: float64

In [None]:
from sktime.forecasting.base import ForecastingHorizon

In [None]:
# Absolute (is_relative=False) forecasting horizon: 36 monthly periods starting at 1961-01.
fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range("1961-01", periods=36, freq="M")), is_relative=False
)
fh

In [None]:
# Monthly period 1960-12, which is the last period shown in the airline series output.
cutoff = pd.Period("1960-12", freq="M")

In [None]:
# Express the horizon relative to the cutoff.
fh.to_relative(cutoff)

In [None]:
# Express the horizon in absolute terms, given the cutoff.
fh.to_absolute(cutoff)

In [None]:
from sktime.forecasting.naive import NaiveForecaster

In [None]:
# Naive baseline forecaster configured with strategy="last".
forecaster = NaiveForecaster(strategy="last")

In [None]:
# Fit the forecaster on the airline series y.
forecaster.fit(y)

In [None]:
# Predict over the forecasting horizon fh.
y_pred = forecaster.predict(fh)

In [None]:
# Plot the observed series and the forecast together, labelled "y" and "y_pred".
plot_series(y, y_pred, labels=["y", "y_pred"])