In [None]:
# # RSI 
# 가격이 전일 가격보다 상승한 날의 상승분은 U(up) 값이라고 하고,
# 가격이 전일 가격보다 하락한 날의 하락분은 D(down) 값이라고 한다.
# U값과 D값의 평균값을 구하여 그것을 각각 AU(average ups)와 AD(average downs)라 한다.
# AU를 AD값으로 나눈 것을 RS(relative strength) 값이라고 한다. RS 값이 크다는 것은 일정 기간 하락한 폭보다 상승한 폭이 크다는 것을 의미한다.
# 다음 계산에 의하여 RSI 값을 구한다.

# RSI 계산 공식 :

# RSI = RS / (1 + RS)

# 또는, 다음과 같이 구해도 결과는 동일하다.

# RSI = AU / (AU + AD)

# 대체로 이 값은 백분율로 나타낸다.

# 이 지표의 파라메터로는 기간을 며칠 동안으로 할 것인가가 있다. Welles Wilder는 14일을 사용할 것을 권유했다. 대체로 사용되는 값은 9일, 14~15일, 25~28일 등이다.

# RSI 그래프는 이동평균선을 함께 나타내는 것이 보통이며, 이동평균선을 며칠선으로 할 것인가 역시 파라메터로 주어진다. RSI를 15일에 대하여 구하고 5일 이동평균선을 함께 표시하는 경우 그래프에 (15, 5)라고 표시해주는 것이 일반적이다.

# 유사한 지표로는 스토캐스틱이 있다. RSI 그래프의 형태는 fast stochastic과 비슷하게 나온다.

# 스토캐스틱 %K = (현재가격 - N일중 최저가)/(N일중 최고가 - N일중 최저가) * 100
# 스토캐스틱 %D = m일 동안 %K 평균 = Slow %K
# 이는 패스트 스토캐스틱 : 슬로우 캐스틱도 고려해볼것

> 기본적 전처리

In [9]:
pip install -U finance-datareader


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting finance-datareader
  Downloading finance_datareader-0.9.33-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 2.8 MB/s 
Collecting requests-file
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, finance-datareader
Successfully installed finance-datareader-0.9.33 requests-file-1.5.1


In [10]:
pip install tensorflow==2.8.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 32.4 MB/s 
Installing collected packages: tf-estimator-nightly
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109


In [6]:
import FinanceDataReader as fdr
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn import model_selection, linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn import ensemble
from sklearn.svm import SVC, SVR
from sklearn import ensemble
from sklearn import cluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import classification_report


from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier 
from collections import Counter

In [5]:
ETF_dict = {'KODEX_200' : '069500', '코스닥150' : '232080', 'S&P500' : '143850', 'Euro_stoxx' : '195930', 'Nikkei225' : '238720', 'CSI300' : '192090', 
           'Gold' : '132030', 'WTI' : '130680', 'KODEX_Inverse' : '114800', 'Dollar' : '138230', 'Dollar_Inverse' : '139660'}

In [12]:
def get_RSI_14(data):
    RSI_list = []
    for i in range(15, len(data)):  # 15행 종가부터 시작
        close = list(data.iloc[i - 14 : i + 1]['Close']) # [23665, 23572, 23676, ...]
        positive = []
        negative = []
        for j in range(14):
            diff = close[j + 1] - close[j]
            if diff >= 0:
                positive.append(diff)
            else:
                negative.append(diff)
        
        AU = np.sum(positive) / 13
        AD = abs(np.sum(negative) / 13)
        RSI = AU / (AU + AD)
        RSI_list.append(RSI)
        
    while len(RSI_list) != len(data):
        RSI_list.insert(0, 0)

    return RSI_list

def stocastic_k(data):   # 14 days
    null_list = []
    for i in range(len(data)):
        calculate_low = np.array(data['Low'][i - 13 : i + 1])
        calculate_high = np.array(data['High'][i - 13 : i + 1])
        if str(calculate_low.mean()) == 'nan':
            continue
        else:
            today = data.iloc[i]['Close']
            mini = calculate_low.min()
            high = calculate_high.max()
            null_list.append((today - mini) / (high - mini))
        
    while len(null_list) != len(data):
        null_list.insert(0, 0)
    
    return null_list


def Bollinger(data):    # 20 , 2
    null_list = []
    for i in range(len(data)):
        cal_list = data['Close'][i - 19 : i + 1]
        high = cal_list.mean() + np.std(cal_list) * 2
        low = cal_list.mean() - np.std(cal_list) * 2
        position = (data['Close'][i] - low) /  (high - low)
        null_list.append(position)
    return null_list

def last_day(code, year):
    day = str(fdr.DataReader(str(code), str(year), str(year + 1)).index[-1])[:10]
    print(day)
    
    return day

def start_day(code, year):
    day = str(fdr.DataReader(str(code), str(year), str(year + 1)).index[0])[:10]
    print(day)
    
    return day

def last_day_month(code, year, month):
    day = str(fdr.DataReader(str(code), str(year) + '.' + str(month), str(year) + '.' + str(month + 1)).index[-1])[:10]
    
    return day
def start_day_month(code, year, month):
    day = str(fdr.DataReader(str(code), str(year) + '.' + str(month), str(year) + '.' + str(month + 1)).index[0])[:10]    
    
    return day

def MACD_cat(data):
    null_list = []
    for i in range(1, len(data)):
        if (data['MACD'][i] > 0) & (data['MACD'][i - 1] < 0):
            null_list.append(1)
        else:
            null_list.append(0)
    null_list.insert(0, 0)
    return null_list

# def MACD_ocs(data):
#     null_list = []
#     null_list2 = []
#     cont_list = data['MACD'] - data['MACD_SIGNAL']
#     for i in range(1, cont_list):
        

In [13]:
def all(data):
    data['RSI'] = get_RSI_14(data)
    data['STOCASTIC_K'] = stocastic_k(data)
    data['STOCASTIC_D'] = data['STOCASTIC_K'].ewm(span = 5).mean()  # 5일
    data['Bollinger'] = Bollinger(data)
    data['MACD'] = data['Close'].ewm(span = 12).mean() - data['Close'].ewm(span = 26).mean()
    data['MACD_SIGNAL'] = data['MACD'].ewm(span = 9).mean()
    data['MACD_cat'] = MACD_cat(data)
    data['Change+'] = list((data['Change'] > 0)[1 : len(data)].astype(int)) + [0]
    
    data['RSI_delta'] = data.RSI.diff().fillna(0)
#     data['K_delta'] = data.STOCASTIC_K.diff().fillna(0)
    data['D_delta'] = data.STOCASTIC_D.diff().fillna(0)
    data['sto_diff'] = data['STOCASTIC_K'] - data['STOCASTIC_D']
    data['B_delta'] = data.Bollinger.diff().fillna(0)
    data['MACD_delta'] = data.MACD.diff().fillna(0)
#     ma20 = new_gs['Adj Close'].rolling(window=20).mean()
    data['MA5'] = data['Close'].rolling(window=5).mean()
    data['MA20'] = data['Close'].rolling(window=20).mean()
    data['MA5_adj'] = (data['MA5'] - data['Close']) / data['Close']
    data['MA20_adj'] = (data['MA20'] - data['Close']) / data['Close']
    data['MA_diff'] = (data['MA5'] - data['MA20']) / data['Close']

In [14]:
df_kospi = fdr.DataReader('069500', '2000', '2022')
all(df_kospi)
df_kospi.dropna(inplace = True)
df_kospi = df_kospi[29 : ]            # 일종의 문법

In [15]:
df_kospi3 = fdr.DataReader('069500', '2021.12.05', '2023')
all(df_kospi3)
df_kospi3 = df_kospi3.iloc[26 : ]

In [None]:
# result2 = pd.concat([df1,df2], ignore_index=True)
from pykrx import stock
stock_code = stock.get_market_ticker_list(date="20201020", market="KOSPI")

np.random.shuffle(stock_code)

company_list = stock_code[:300]

j = 1
for i in company_list:
    print(j)
    data = fdr.DataReader(i, '2000', '2021')
    if len(data) < 30:
        continue
    else:
        all(data)
        data = data.dropna()
        df_kospi = pd.concat([df_kospi, data[29:]])
    j += 1

df_kospi.dropna(inplace = True)

df_kospi.replace([np.inf, -np.inf], np.nan, inplace = True)
# df_kospi.replace([np.inf, -np.inf], np.nan).dropna(axis=0, inplace = True)
df_kospi.dropna(inplace = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [None]:
# df_kospi.to_csv('shuffle_300.csv')

In [16]:
df_kospi.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Change,RSI,STOCASTIC_K,STOCASTIC_D,Bollinger,...,RSI_delta,D_delta,sto_diff,B_delta,MACD_delta,MA5,MA20,MA5_adj,MA20_adj,MA_diff
count,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,...,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0,4707.0
mean,20251.780327,20375.659018,20115.175271,20259.09624,5147068.0,0.000478,0.5429,0.599492,0.59934,0.56367,...,1e-05,7.6e-05,0.000152,2.9e-05,0.031486,20245.016444,20192.721202,-0.000605,-0.002892,0.002287
std,8110.358395,8143.927497,8070.77135,8100.922487,4463771.0,0.013361,0.173004,0.313091,0.264419,0.329971,...,0.065819,0.068467,0.136934,0.159453,31.883596,8093.415127,8067.574659,0.0147,0.032667,0.02538
min,4868.0,4886.0,4692.0,4750.0,178565.0,-0.118635,0.023613,0.000309,0.043712,-0.446418,...,-0.311843,-0.268086,-0.536173,-0.869866,-262.534722,4849.0,4991.25,-0.114969,-0.116164,-0.226882
25%,14318.0,14401.0,14189.0,14319.5,1694748.0,-0.005578,0.419207,0.322359,0.367655,0.301382,...,-0.037939,-0.039815,-0.079631,-0.084097,-15.872927,14315.4,14249.925,-0.008911,-0.021939,-0.009186
50%,21204.0,21301.0,21091.0,21220.0,4321219.0,0.000765,0.545557,0.657627,0.650349,0.620887,...,-4.3e-05,0.004439,0.008878,-0.002032,1.894383,21209.8,21256.85,-0.001782,-0.006445,0.005099
75%,24118.5,24243.0,24019.0,24131.5,7231939.0,0.007075,0.665735,0.88814,0.840977,0.835883,...,0.038595,0.04293,0.085861,0.094285,16.507119,24148.5,24067.425,0.006568,0.012383,0.017412
max,43167.0,43758.0,43050.0,43145.0,62226540.0,0.149422,0.99751,1.060606,0.998166,1.266308,...,0.259859,0.259278,0.518557,0.678688,211.835161,43004.4,42731.7,0.187374,0.37673,0.086153


 > 모델링 : 실험1 (기본적 전처리만, 2010~2021 -> 2022) : 57%

In [None]:
# train_x = df_x.loc[: last_day('069500', 2021)]
# train_y = df_y.loc[: last_day('069500', 2021)]
# test_x = df_x.loc[start_day('069500', 2022) :]
# test_y = df_y.loc[start_day('069500', 2022) :]

In [None]:
# # regularization candiate 정의
# reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

# # space 정의, Hyperparameter의 이름을 key 값으로 입력
# space={'max_depth': hp.quniform("max_depth", 4, 20, 2),
#        'learning_rate': hp.quniform ('learning_rate', 0.001, 0.01, 0.001),
#        'reg_alpha' : hp.choice('reg_alpha', reg_candidate),
#        'reg_lambda' : hp.choice('reg_lambda', reg_candidate),
#        'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
#        'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.05),
#        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
#        'n_estimators': hp.quniform('n_estimators', 300, 3000, 300)}

# # 목적 함수 정의
# # n_estimators, max_depth와 같은 반드시 int 타입을 가져야 하는 hyperparamter는 int로 타입 캐스팅 합니다.
# def hyperparameter_tuning(space):
#     model=XGBClassifier(n_estimators =int(space['n_estimators']), 
#                        max_depth = int(space['max_depth']), 
#                        learning_rate = space['learning_rate'],
#                        reg_alpha = space['reg_alpha'],
#                        reg_lambda = space['reg_lambda'],
#                        subsample = space['subsample'],
#                        colsample_bytree = space['colsample_bytree'], 
#                        min_child_weight = int(space['min_child_weight']),
#                        random_state=42,)
    
#     evaluation = [(train_x, train_y), (test_x, test_y)]
    
#     model.fit(train_x, train_y,
#               eval_set=evaluation, 
#               eval_metric='auc',
#               verbose=0)

#     accuracy = accuracy_score(model.predict(test_x), test_y)
#     # 평가 방식 선정
#     return {'loss': 1 - accuracy, 'status': STATUS_OK, 'model': model}

# # Trials 객체 선언합니다.
# trials = Trials()
# # best에 최적의 하이퍼 파라미터를 return 받습니다.
# best = fmin(fn=hyperparameter_tuning,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=50, # 최대 반복 횟수를 지정합니다.
#             trials=trials)

# # 최적화된 결과를 int로 변환해야하는 파라미터는 타입 변환을 수행합니다.
# best['max_depth'] = int(best['max_depth'])
# best['min_child_weight'] = int(best['min_child_weight'])
# best['n_estimators'] = int(best['n_estimators'])
# best['reg_alpha'] = reg_candidate[int(best['reg_alpha'])]
# best['reg_lambda'] = reg_candidate[int(best['reg_lambda'])]
# print (best)

> 실험 2 : 2022 -> 2022

In [None]:
# start_day_month('069500', 2022, 1)
# last_day_month('069500', 2022, 1)

In [None]:
# train_x = df_x.loc[start_day_month('069500', 2022, 1) : last_day_month('069500', 2022, 4)]
# train_y = df_y.loc[start_day_month('069500', 2022, 1) : last_day_month('069500', 2022, 4)]
# test_x = df_x.loc[start_day_month('069500', 2022, 5) : last_day_month('069500', 2022, 5)]
# test_y = df_y.loc[start_day_month('069500', 2022, 5) : last_day_month('069500', 2022, 5)]

In [None]:
# model = XGBClassifier()
# model.fit(train_x, train_y)
# accuracy_score(model.predict(train_x), train_y) 

In [None]:
# model_cat = CatBoostClassifier()
# model_cat.fit(train_x, train_y)

In [None]:
# accuracy_score(model.predict(test_x), test_y) 

> 실험 3 : Standard Scaling

<!-- numeric_features = list(train_x.columns)
numeric_transformer = StandardScaler()

categorical_features = []
categorical_transformer = OneHotEncoder(categories='auto')

preprocessor = ColumnTransformer(
    transformers = [ ('num', numeric_transformer, numeric_features),
        ('passthrough', 'passthrough', categorical_features)])

preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(train_x)

x_train_transformed = preprocessor_pipe.transform(train_x)
x_test_transformed = preprocessor_pipe.transform(test_x) -->

In [None]:
# list(train_x.columns)

NameError: name 'train_x' is not defined

In [None]:
# numeric_features = list(train_x.columns)
# numeric_transformer = StandardScaler() # cf) RobustScaler

# categorical_features = []
# categorical_transformer = OneHotEncoder(categories='auto') # categories='auto' : just for ignoring warning messages

# preprocessor = ColumnTransformer(
#     transformers = [ ('num', numeric_transformer, numeric_features),
#         ('passthrough', 'passthrough', categorical_features)])

# preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# preprocessor_pipe.fit(train_x)

# x_train_transformed = preprocessor_pipe.transform(train_x)
# x_test_transformed = preprocessor_pipe.transform(test_x)


In [None]:
# model = XGBClassifier(learning_rate=0.001,max_depth=5,n_estimators=100)
# model.fit(x_train_transformed, train_y)
# accuracy_score(model.predict(x_test_transformed), test_y) 

> 실험 4 :

In [None]:
# train_x = df_x.loc[: last_day('069500', 2020)]
# train_y = df_y.loc[: last_day('069500', 2020)]
# test_x = df_x.loc[start_day('069500', 2021) : last_day('069500', 2021)]
# test_y = df_y.loc[start_day('069500', 2021) : last_day('069500', 2021)]

In [17]:
train_x = df_kospi[['RSI', 'STOCASTIC_K', 'STOCASTIC_D', 'Bollinger', 'MACD', 'MACD_cat', 'MACD_SIGNAL', 'RSI_delta', 'D_delta', 'sto_diff', 'B_delta', 'MACD_delta',
                   'MA5', 'MA20', 'MA5_adj', 'MA20_adj']]
train_y = df_kospi['Change+']
test_x = df_kospi3[['RSI', 'STOCASTIC_K', 'STOCASTIC_D', 'Bollinger', 'MACD', 'MACD_cat', 'MACD_SIGNAL', 'RSI_delta', 'D_delta', 'sto_diff', 'B_delta', 'MACD_delta',
                   'MA5', 'MA20', 'MA5_adj', 'MA20_adj']]
test_y = df_kospi3['Change+']

In [18]:
numeric_features = ['RSI', 'STOCASTIC_K', 'STOCASTIC_D', 'Bollinger', 'MACD', 'MACD_SIGNAL', 'RSI_delta', 'D_delta', 'sto_diff', 'B_delta', 'MACD_delta', 'MA5', 'MA20',
                   'MA5_adj', 'MA20_adj']
numeric_transformer = StandardScaler() # cf) RobustScaler

categorical_features = ['MACD_cat']
categorical_transformer = OneHotEncoder(categories='auto') # categories='auto' : just for ignoring warning messages

preprocessor = ColumnTransformer(
    transformers = [ ('num', numeric_transformer, numeric_features),
        ('passthrough', categorical_transformer, categorical_features)])

preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(train_x)

train_x = preprocessor_pipe.transform(train_x)
test_x = preprocessor_pipe.transform(test_x)

In [None]:
cat = CatBoostClassifier()                                    # 학습 데이터 더 많아야 됨
cat.fit(train_x, train_y)
accuracy_score(cat.predict(test_x), test_y)

In [None]:
accuracy_score(cat.predict(train_x), train_y)

0.5810779595633218

In [None]:
pred_train = xgb.predict_proba(test_x)

test_x

fpr, tpr, _ = roc_curve(y_true=test_y, y_score=pred_train[:,1])
roc_auc = auc(fpr, tpr)
print('Roc_AUC : ', roc_auc)  

plt.figure(figsize=(10, 10))

plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.title("ROC curve")

plt.show()

ft_importance_values = xgb.feature_importances_

ft_importance_values

In [None]:
train_x.shape

(1458923, 16)

> LSTM을 쓰면;?

In [33]:
df_kospi = fdr.DataReader('069500', '2000', '2022')
# all(df_kospi)
df_kospi.dropna(inplace = True)
df_kospi['Change+'] = list((df_kospi['Change'] > 0)[1 : len(df_kospi)].astype(int)) + [0]
df_kospi = df_kospi[:-2]   

train_x = df_kospi[['Close', 'Volume']]
train_y = df_kospi['Change+']


df_kospi = fdr.DataReader('069500', '2021.12.01', '2023')
# all(df_kospi)
df_kospi.dropna(inplace = True)
df_kospi['Change+'] = list((df_kospi['Change'] > 0)[1 : len(df_kospi)].astype(int)) + [0]
df_kospi = df_kospi[:-2]            # 일종의 문법


test_x = df_kospi[['Close', 'Volume']]
test_y = df_kospi['Change+']



In [34]:
x_train = []
y_train = []

for i in range(20, len(train_x)):
    x_train.append(np.array(train_x[i - 20 : i]))
    y_train.append(list(train_y)[i - 1])

In [35]:
x_train, y_train = np.array(x_train), np.array(y_train)

In [36]:
x_train.shape

(4732, 20, 2)

In [37]:
print(x_train.shape)
print(y_train.shape)

(4732, 20, 2)
(4732,)


In [38]:
x_test = []
y_test = []

for i in range(20, len(test_x)):
    x_test.append(test_x[i-20:i])
    y_test.append(test_y[i - 1])
x_test, y_test = np.array(x_test), np.array(y_test)

print(x_test.shape)
print(y_test.shape)

(99, 20, 2)
(99,)


In [62]:
num = np.unique(y_train, axis=0)
num = num.shape[0]
y_train = np.eye(num)[y_train]

num = np.unique(y_test, axis=0)
num = num.shape[0]
y_test = np.eye(num)[y_test]

IndexError: arrays used as indices must be of integer (or boolean) type

In [50]:
# numeric_features = ['RSI', 'STOCASTIC_K', 'STOCASTIC_D', 'Bollinger', 'MACD', 'MACD_SIGNAL', 'RSI_delta', 'D_delta', 'sto_diff', 'B_delta', 'MACD_delta', 'MA5', 'MA20',
#                    'MA5_adj', 'MA20_adj']
# numeric_transformer = StandardScaler() # cf) RobustScaler

# categorical_features = ['MACD_cat']
# categorical_transformer = OneHotEncoder(categories='auto') # categories='auto' : just for ignoring warning messages

# preprocessor = ColumnTransformer(
#     transformers = [ ('num', numeric_transformer, numeric_features),
#         ('passthrough', categorical_transformer, categorical_features)])

# preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# preprocessor_pipe.fit(train_x)

# train_x = preprocessor_pipe.transform(train_x)
# test_x = preprocessor_pipe.transform(test_x)

In [7]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM
import tensorflow as tf

In [39]:
model = Sequential()
model.add(LSTM(50,return_sequences=True, input_shape=(x_train.shape[1],x_train.shape[2])))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [40]:
model.fit(x_train, y_train, batch_size=40, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x23e6e304d90>

> 튜닝?

In [28]:
pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 8.4 MB/s 
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.1.2 kt-legacy-1.0.4


In [41]:
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from tensorflow import keras 
from tensorflow.keras import layers

import keras_tuner as kt
import numpy as np
import IPython
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten

In [64]:
def model_builder(hp):
    

    model = keras.Sequential()
    model.add(LSTM(hp.Int('input_unit',min_value=32,max_value=512,step=32),return_sequences=True, input_shape=(x_train.shape[1],x_train.shape[2])))
    for i in range(hp.Int('n_layers', 1, 4)):
        model.add(LSTM(hp.Int(f'lstm_{i}_units',min_value=32,max_value=512,step=32),return_sequences=True))
    model.add(Dropout(hp.Float('Dropout_rate',min_value=0,max_value=0.5,step=0.1)))

    # model.add(Dense(25, activation=hp.Choice('dense_activation',values=['relu', 'sigmoid'],default='relu')))

    model.add(Flatten())
    model.add(Dense(2, activation=hp.Choice('dense_activation',values=['softmax'],default='softmax')))

    


  # Tune the learning rate for the optimizer 
  # Choose an optimal value from 0.01, 0.001, or 0.0001

    model.compile(optimizer = keras.optimizers.Adam(),
                loss = 'binary_crossentropy', 
                metrics = ['accuracy'])

    return model

In [65]:
tuner = kt.BayesianOptimization(model_builder,
                                objective = 'val_loss', # Hyper-params tuning을 위한 목적함수 설정 (metric to minimize or maximize)
                                max_trials = 50, # 서로 다른 Hyper-params 조합으로 시도할 총 Trial 횟수 설정
                                directory = 'models', # Path to the working directory
                                project_name = 'mymodel2'
                                ) # Name to use as directory name for files saved by this Tuner
tuner.search_space_summary()

Search space summary
Default search space size: 5
input_unit (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': None}
n_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 4, 'step': 1, 'sampling': None}
lstm_0_units (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': None}
Dropout_rate (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': None}
dense_activation (Choice)
{'default': 'softmax', 'conditions': [], 'values': ['softmax'], 'ordered': False}


In [67]:
tuner.search(
        x=x_train,
        y=y_train,
        epochs=20,
        batch_size=32,
        validation_data=(x_test,y_test),
)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
96                |?                 |input_unit
3                 |?                 |n_layers
416               |?                 |lstm_0_units
0.1               |?                 |Dropout_rate
softmax           |?                 |dense_activation

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

KeyboardInterrupt: 

In [66]:
models = tuner.get_best_models() # Keras Sequential models
top_model = models[0]
top_model.summary()
print()

results = top_model.evaluate(x_test, y_test)
print('Cross-entropy :', results[0])
print('Accuracy :', results[1])

IndexError: list index out of range