In [1]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Lambda, Input, GlobalAveragePooling1D, Bidirectional
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
def get_model(input_shape):
    input = Input(shape=input_shape)
    x = LSTM(128, return_sequences=True, activation='tanh', dropout=0.2)(input)
    x = LSTM(64, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = LSTM(32, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = GlobalAveragePooling1D()(x)
    output = Dense(1)(x)
    model = Model(input, output)
    return model

WINDOW_SIZE = 6
feature_n = 6

Input_shape = (WINDOW_SIZE, feature_n)
model = get_model(Input_shape)
model.load_weights('../checkpoints/ckeckpointer.ckpt')

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6, 6)]            0         
_________________________________________________________________
lstm (LSTM)                  (None, 6, 128)            69120     
_________________________________________________________________
lstm_1 (LSTM)                (None, 6, 64)             49408     
_________________________________________________________________
lstm_2 (LSTM)                (None, 6, 32)             12416     
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 130,977
Trainable params: 130,977
Non-trainable params: 0
_______________________________________________________

In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pyupbit
from sklearn.preprocessing import MinMaxScaler

import os
from glob import glob
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



class Data_preprocess():
    def __init__(self, args):
        if not hasattr(args, 'data') or args.data is None:
            print(args)
            self.data, self.label, self.dataset = self.preprocess(
                pyupbit.get_ohlcv(ticker=args.ticker, interval=args.interval, to=args.to, count=args.count))
        else :
            self.data, self.label, self.dataset = self.csv_parsing(args.data)

    def MinMax(self, dataset_df):
        norm = MinMaxScaler()
        norm_dataset = norm.fit_transform(dataset_df)
        return pd.DataFrame(norm_dataset, columns=list(dataset_df.columns))

    def add_after10(self, dataset_df):
        after10 = np.zeros_like(self.norm_dataset['close'])
        for i in range(len(dataset_df['close']) - 1):
            after10[i] = dataset_df['close'][i + 1]
        return after10

    def drop_feature(self, dataset_df):
        # index(시간) 제거
        dataset_df = dataset_df.reset_index(drop=True)
        # value 제거
        dataset_df = dataset_df.drop(columns=['value'])
        return dataset_df

    def add_avgPrice(self, dataset_df):
        return (dataset_df['high'] + dataset_df['low'] +
                dataset_df['open'] + dataset_df['close']) // 4

    def preprocess(self, dataset, latest=False):

        # drop feature
        #dataset_df = self.drop_feature(dataset)
        dataset_df = dataset

        # avg_price 추가
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)

        if latest == True:
            # 가장 예전 데이터 삭제 - norm이랑 original 둘 다 적용
            self.dataset = self.dataset.drop([self.dataset.index[0]]).drop(columns=['after10'])
            self.norm_dataset = self.norm_dataset.drop([self.norm_dataset.index[0]])

            # ori dataset에 추가
            self.dataset = pd.concat([self.dataset, dataset_df])
            self.dataset = self.dataset.reset_index(drop=True)

            # min max 정규화 (MinMaxScaler) 적용
            self.norm_dataset = self.MinMax(self.dataset)

            # after10 추가
            self.dataset['after10'] = self.add_after10(self.dataset)


        else:
            # min max 정규화 (MinMaxScaler) 적용
            self.norm_dataset = self.MinMax(dataset_df)

            # after10 추가
            dataset_df['after10'] = self.add_after10(dataset_df)

        # 예측될 값(label)인 10분 후 가격
        self.norm_dataset['after10'] = self.add_after10(self.norm_dataset)

        return self.norm_dataset.drop(columns=['after10']), self.norm_dataset['after10'], dataset_df

    def csv_parsing(self, data_path):
        merge_df = pd.DataFrame()
        data_folders = glob(os.path.join(data_path, '*'))

        for data_folder in tqdm(data_folders):
            data_csvs = glob(os.path.join(data_folder,'*.csv'))

            for data_csv in data_csvs :
                csv_df = pd.read_csv(data_csv).drop(columns=["Unnamed: 0"])
                merge_df = pd.concat([merge_df, csv_df], ignore_index=True)

        return self.preprocess(merge_df)


    # dataset에 window 적용
    def windowed_dataset(self, data, label, window_size, batch_size):
        sliced_data = tf.data.Dataset.from_tensor_slices(data)
        sliced_data = sliced_data.window(window_size, shift=1, stride=1, drop_remainder=True)
        sliced_data = sliced_data.flat_map(lambda x: x.batch(window_size))

        sliced_label = tf.data.Dataset.from_tensor_slices(label[window_size:])

        sliced_dataset = tf.data.Dataset.zip((sliced_data, sliced_label))

        return sliced_dataset.batch(batch_size).prefetch(1)





In [6]:
def get_model_BiLSTM(input_shape):
    input = Input(shape=input_shape)
    x = Bidirectional(LSTM(128, return_sequences=True, activation='tanh', dropout=0.3))(input)
    x = Bidirectional(LSTM(64, return_sequences=True, activation='tanh', dropout=0.3))(x)
    x = Bidirectional(LSTM(32, return_sequences=True, activation='tanh', dropout=0.3))(x)
    x = GlobalAveragePooling1D()(x)
    output = Dense(1)(x)
    model = Model(input, output)
    return model

def get_model(input_shape):
    input = Input(shape=input_shape)
    x = LSTM(128, return_sequences=True, activation='tanh', dropout=0.2)(input)
    x = LSTM(64, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = LSTM(32, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = GlobalAveragePooling1D()(x)
    output = Dense(1)(x)
    model = Model(input, output)
    return model


WINDOW_SIZE = 6
feature_n = 6

Bi_WINDOW_SIZE = 12
Bi_feature_n = 7

Input_shape = (Bi_WINDOW_SIZE, Bi_feature_n)
model = get_model_BiLSTM(Input_shape)
model.load_weights('../checkpoints/BiLSTM/ckeckpointer.ckpt')

model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 12, 7)]           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 12, 256)           139264    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 12, 128)           164352    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 12, 64)            41216     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 344,897
Trainable params: 344,897
Non-trainable params: 0
_____________________________________________________

In [139]:
from easydict import EasyDict
import datetime

options = {
    'ticker' : 'KRW-BTC',
    'interval' : 'minute10',
    'to' : '2021-10-01 00:00',
    #'to' : datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
    'count' : 2000
}

args = EasyDict(options)

processed_data = Data_preprocess(args)

{'ticker': 'KRW-BTC', 'interval': 'minute10', 'to': '2021-10-01 00:00', 'count': 2000}


In [140]:
print(processed_data.data)

          open      high       low     close    volume     value  avg_price
0     0.792001  0.783565  0.783430  0.785651  0.021673  0.023991   0.784375
1     0.781751  0.776736  0.780859  0.775178  0.018869  0.020873   0.776669
2     0.775067  0.775463  0.768784  0.762366  0.036609  0.040447   0.768216
3     0.762812  0.769097  0.768784  0.770611  0.019964  0.022050   0.765599
4     0.771836  0.763426  0.770125  0.759804  0.034966  0.038623   0.764075
...        ...       ...       ...       ...       ...       ...        ...
1995  0.282977  0.294792  0.286449  0.321635  0.067799  0.069455   0.284001
1996  0.320521  0.293866  0.299866  0.310160  0.098109  0.100504   0.293950
1997  0.309381  0.282870  0.304114  0.296123  0.063826  0.065371   0.285813
1998  0.296234  0.281481  0.299195  0.309381  0.060492  0.061886   0.284231
1999  0.309381  0.288426  0.303667  0.304479  0.028986  0.029689   0.289235

[2000 rows x 7 columns]


In [141]:
print(processed_data.label)

0       0.775178
1       0.762366
2       0.770611
3       0.759804
4       0.776404
          ...   
1995    0.310160
1996    0.296123
1997    0.309381
1998    0.304479
1999    0.000000
Name: after10, Length: 2000, dtype: float64


In [142]:
def inference(lstm_model, processed_data, window_size, batch_size):
    dataset = processed_data.windowed_dataset(processed_data.data, processed_data.label, window_size, batch_size)
    pred = lstm_model.predict(dataset)
    pred = pred[:, 0]
    
    return pred

In [143]:
Bi_WINDOW_SIZE = 12

WINDOW_SIZE = 6
BATCH_SIZE=1

pred = inference(model, processed_data, Bi_WINDOW_SIZE, BATCH_SIZE)


In [144]:
actual = processed_data.label[WINDOW_SIZE:].reset_index(drop=True)

In [145]:
actual

0       0.782977
1       0.788436
2       0.794786
3       0.802250
4       0.800245
          ...   
1989    0.310160
1990    0.296123
1991    0.309381
1992    0.304479
1993    0.000000
Name: after10, Length: 1994, dtype: float64

In [146]:
pred_rate_val = []
act_rate_val = []
accuracy = []

for i in range(0, len(pred) - 2) :
    rate_pred = pred[i + 1]/pred[i]
    pred_rate_val.append(rate_pred)
    
    if rate_pred < 1:
        # Down
        pred_tmp = 0
    else:
        # Up
        pred_tmp = 1
        
    rate_act = actual[i + 2] / actual[i + 1]
    act_rate_val.append(rate_act)
    
    if rate_act < 1:
        # Down
        act_tmp = 0
    else:
        # Up
        act_tmp = 1
        
    if pred_tmp == act_tmp :
        accuracy.append(1)
    else :
        accuracy.append(0)

  app.launch_new_instance()


In [147]:
# 정확도
sum(accuracy) / len(accuracy)

0.5725075528700906

In [177]:
import numpy as np

np_pred_rate_val = np.array(pred_rate_val)
np_accuracy = np.array(accuracy)

In [178]:
import copy

print(np_pred_rate_val)
print(np_accuracy.shape)

a_np_pred_rate_val =  copy.deepcopy(np_pred_rate_val)
b_np_pred_rate_val =  copy.deepcopy(np_pred_rate_val)

[1.0017225  1.0021765  1.0021061  ... 0.99473625 0.99478745 0.9929305 ]
(1986,)


# 몇 퍼센트 올랐는지 구하기 - 절대값 없이

In [179]:
a_np_pred_rate_val = a_np_pred_rate_val - 1
a_np_pred_rate_val = a_np_pred_rate_val * 100
a_np_pred_rate_val

array([ 0.1722455 ,  0.21765232,  0.21060705, ..., -0.5263746 ,
       -0.52125454, -0.7069528 ], dtype=float32)

### 정답이 값 중에서 가장 큰 값과 가장 작은 값

In [151]:
correct = a_np_pred_rate_val[ np_accuracy[:]==1 ]
print(correct.shape)
print(min(correct))
print(max(correct))
correct

(1137,)
-9.773546
12.804341


array([ 0.1722455  ,  0.21765232 ,  0.008273125, ..., -0.25044084 ,
       -0.5263746  , -0.7069528  ], dtype=float32)

### % 구간 정리 - 대체적으로 -10 ~ +10 사이인듯 (최근 데이터 기준)

In [163]:
correct_down = correct[correct[:] < 0]
print(f"{min(correct_down)} ~ {max(correct_down)}")
print(correct_down.shape)

correct_up = correct[correct[:] > 0]
print(f"{min(correct_up)} ~ {max(correct_up)}")
print(correct_up.shape)

-9.77354621887207 ~ -0.0005602836608886719
(548,)
0.0019788742065429688 ~ 12.804341316223145
(589,)


### 각 1퍼센트 기준으로 구간을 나누고 정답 개수 출력

In [194]:
correct_tmp = correct[correct[:] > 0 ]

correct_1 = correct_tmp[correct_tmp[:] < 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 2 ]
print(f"정답 개수 : {correct_1.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_2 = correct_UP[correct_UP[:] > 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 3 ]
print(f"정답 개수 : {correct_2.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_3 = correct_UP[correct_UP[:] > 2 ]
correct_UP = correct_tmp[correct_tmp[:] < 4 ]
print(f"정답 개수 : {correct_3.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_4 = correct_UP[correct_UP[:] > 3 ]
correct_UP = correct_tmp[correct_tmp[:] < 5 ]
print(f"정답 개수 : {correct_4.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_5 = correct_UP[correct_UP[:] > 4 ]
correct_UP = correct_tmp[correct_tmp[:] < 6 ]
print(f"정답 개수 : {correct_5.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_6 = correct_UP[correct_UP[:] > 5 ]
correct_UP = correct_tmp[correct_tmp[:] < 7 ]
print(f"정답 개수 : {correct_6.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_7 = correct_UP[correct_UP[:] > 6 ]
correct_UP = correct_tmp[correct_tmp[:] < 8 ]
print(f"정답 개수 : {correct_7.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_8 = correct_UP[correct_UP[:] > 7 ]
correct_UP = correct_tmp[correct_tmp[:] < 9 ]
print(f"정답 개수 : {correct_8.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_9 = correct_UP[correct_UP[:] > 8 ]
correct_UP = correct_tmp[correct_tmp[:] < 10 ]
print(f"정답 개수 : {correct_9.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_10 = correct_UP[correct_UP[:] > 9 ]
correct_UP = correct_tmp[correct_tmp[:] < 11 ]
print(f"정답 개수 : {correct_10.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_11 = correct_UP[correct_UP[:] > 10 ]
correct_UP = correct_tmp[correct_tmp[:] < 12 ]
print(f"정답 개수 : {correct_11.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

correct_12 = correct_UP[correct_UP[:] > 11 ]
correct_UP = correct_tmp[correct_tmp[:] < 13 ]
print(f"정답 개수 : {correct_12.shape} / {correct_tmp.shape}")
print(correct_UP.shape)
print()

정답 개수 : (390,) / (589,)
(504,)

정답 개수 : (114,) / (589,)
(547,)

정답 개수 : (43,) / (589,)
(560,)

정답 개수 : (13,) / (589,)
(571,)

정답 개수 : (11,) / (589,)
(573,)

정답 개수 : (2,) / (589,)
(575,)

정답 개수 : (2,) / (589,)
(581,)

정답 개수 : (6,) / (589,)
(585,)

정답 개수 : (4,) / (589,)
(586,)

정답 개수 : (1,) / (589,)
(587,)

정답 개수 : (1,) / (589,)
(588,)

정답 개수 : (1,) / (589,)
(589,)



### 구간 별 정답 개수 찾기

In [None]:
correct_tmp = correct[correct[:] > 0 ]

correct_1 = correct_tmp[correct_tmp[:] < 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 2 ]
print(correct_1.shape)
print(correct_UP.shape)
print()

bool_correct = copy.deepcopy(correct_tmp)
bool_correct[ np_accuracy[:]==1 ] = -1
print(bool_correct.shape)
print(min(bool_correct))
print(max(bool_correct))
print(bool_correct)

correct_1[]


### 모든 값들 중 가장 변화율이 큰 값과 작은 값

In [181]:
print(a_np_pred_rate_val.shape)
print(min(a_np_pred_rate_val))
print(max(a_np_pred_rate_val))
a_correct = copy.deepcopy(a_np_pred_rate_val)

(1986,)
-10.377079
12.804341


### % 구간 정리 - 대체적으로 -10 ~ +10 사이인듯 (최근 데이터 기준)

In [182]:
correct_down = a_correct[a_correct[:] < 0]
print(f"{min(correct_down)} ~ {max(correct_down)}")
print(correct_down.shape)

correct_up = a_correct[a_correct[:] > 0]
print(f"{min(correct_up)} ~ {max(correct_up)}")
print(correct_up.shape)

-10.377079010009766 ~ -0.0005602836608886719
(977,)
0.0010251998901367188 ~ 12.804341316223145
(1009,)


### 각 1퍼센트 기준으로 구간을 나누고 개수 출력

In [185]:
correct_tmp = a_correct[a_correct[:] > 0 ]

correct_1 = correct_tmp[correct_tmp[:] < 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 2 ]
print(correct_1.shape)
print(correct_UP.shape)
print()

correct_2 = correct_UP[correct_UP[:] > 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 3 ]
print(correct_2.shape)
print(correct_UP.shape)
print()

correct_3 = correct_UP[correct_UP[:] > 2 ]
correct_UP = correct_tmp[correct_tmp[:] < 4 ]
print(correct_3.shape)
print(correct_UP.shape)
print()

correct_4 = correct_UP[correct_UP[:] > 3 ]
correct_UP = correct_tmp[correct_tmp[:] < 5 ]
print(correct_4.shape)
print(correct_UP.shape)
print()

correct_5 = correct_UP[correct_UP[:] > 4 ]
correct_UP = correct_tmp[correct_tmp[:] < 6 ]
print(correct_5.shape)
print(correct_UP.shape)
print()

correct_6 = correct_UP[correct_UP[:] > 5 ]
correct_UP = correct_tmp[correct_tmp[:] < 7 ]
print(correct_6.shape)
print(correct_UP.shape)
print()

correct_7 = correct_UP[correct_UP[:] > 6 ]
correct_UP = correct_tmp[correct_tmp[:] < 8 ]
print(correct_7.shape)
print(correct_UP.shape)
print()

correct_8 = correct_UP[correct_UP[:] > 7 ]
correct_UP = correct_tmp[correct_tmp[:] < 9 ]
print(correct_8.shape)
print(correct_UP.shape)
print()

correct_9 = correct_UP[correct_UP[:] > 8 ]
correct_UP = correct_tmp[correct_tmp[:] < 10 ]
print(correct_9.shape)
print(correct_UP.shape)
print()

correct_10 = correct_UP[correct_UP[:] > 9 ]
correct_UP = correct_tmp[correct_tmp[:] < 11 ]
print(correct_10.shape)
print(correct_UP.shape)
print()

correct_11 = correct_UP[correct_UP[:] > 10 ]
correct_UP = correct_tmp[correct_tmp[:] < 12 ]
print(correct_11.shape)
print(correct_UP.shape)
print()

correct_12 = correct_UP[correct_UP[:] > 11 ]
correct_UP = correct_tmp[correct_tmp[:] < 13 ]
print(correct_12.shape)
print(correct_UP.shape)
print()

(699,)
(876,)

(177,)
(943,)

(67,)
(965,)

(22,)
(978,)

(13,)
(983,)

(5,)
(991,)

(8,)
(998,)

(7,)
(1003,)

(5,)
(1005,)

(2,)
(1007,)

(2,)
(1008,)

(1,)
(1009,)



### 구간 별 누적그래프 그려보기

### 각 구간 별 정답 개수 체크

In [192]:
correct_tmp = a_correct[a_correct[:] > 0 ]

correct_1 = correct_tmp[correct_tmp[:] < 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 2 ]
print(correct_1.shape)
print(correct_UP.shape)
print()

bool_correct = copy.deepcopy(correct_tmp)
bool_correct[ np_accuracy[:]==1 ] = -1
print(bool_correct.shape)
print(min(bool_correct))
print(max(bool_correct))
print(bool_correct)

correct_1[]


(699,)
(876,)

(1986,)
-10.377079
10.152971
[-1.         -1.          0.21060705 ... -1.         -0.52125454
 -1.        ]


In [46]:
a_np_pred_rate_val[ np_accuracy[:]==1 ] = 0
a_np_pred_rate_val

array([0.       , 0.       , 0.       , ..., 1.0102553, 1.0171678,
       1.0174936], dtype=float32)

In [19]:
b_np_pred_rate_val[ np_accuracy[:]==False ] = 0
b_np_pred_rate_val

array([0.9923874 , 0.99104947, 0.98621666, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [20]:
np_pred_rate_val[ np_accuracy[:]==1 ] = 0

In [21]:
print(992 - np.count_nonzero(np_pred_rate_val))

155


In [22]:
c = 0
for i in np_pred_rate_val:
    if i == 0:
        c += 1
c

1149

In [23]:
c/993

1.1570996978851964