In [1]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pyupbit

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Lambda, Input, GlobalAveragePooling1D, Bidirectional
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
def get_model(input_shape):
    input = Input(shape=input_shape)
    x = LSTM(128, return_sequences=True, activation='tanh', dropout=0.2)(input)
    x = LSTM(64, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = LSTM(32, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = GlobalAveragePooling1D()(x)
    output = Dense(1)(x)
    model = Model(input, output)
    return model

WINDOW_SIZE = 6
feature_n = 6

Input_shape = (WINDOW_SIZE, feature_n)
model = get_model(Input_shape)
model.load_weights('../checkpoints/ckeckpointer.ckpt')

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6, 6)]            0         
_________________________________________________________________
lstm (LSTM)                  (None, 6, 128)            69120     
_________________________________________________________________
lstm_1 (LSTM)                (None, 6, 64)             49408     
_________________________________________________________________
lstm_2 (LSTM)                (None, 6, 32)             12416     
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 130,977
Trainable params: 130,977
Non-trainable params: 0
_______________________________________________________

In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pyupbit
from sklearn.preprocessing import MinMaxScaler

import os
from glob import glob
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



class Data_preprocess():
    def __init__(self, args):
        if not hasattr(args, 'data') or args.data is None:
            print(args)
            self.data, self.label, self.dataset = self.preprocess(
                pyupbit.get_ohlcv(ticker=args.ticker, interval=args.interval, to=args.to, count=args.count))
        else :
            self.data, self.label, self.dataset = self.csv_parsing(args.data)

    def MinMax(self, dataset_df):
        norm = MinMaxScaler()
        norm_dataset = norm.fit_transform(dataset_df)
        return pd.DataFrame(norm_dataset, columns=list(dataset_df.columns))

    def add_after10(self, dataset_df):
        after10 = np.zeros_like(self.norm_dataset['close'])
        for i in range(len(dataset_df['close']) - 1):
            after10[i] = dataset_df['close'][i + 1]
        return after10

    def drop_feature(self, dataset_df):
        # index(시간) 제거
        dataset_df = dataset_df.reset_index(drop=True)
        # value 제거
        dataset_df = dataset_df.drop(columns=['value'])
        return dataset_df

    def add_avgPrice(self, dataset_df):
        return (dataset_df['high'] + dataset_df['low'] +
                dataset_df['open'] + dataset_df['close']) // 4

    def preprocess(self, dataset, latest=False):

        # drop feature
        #dataset_df = self.drop_feature(dataset)
        dataset_df = dataset

        # avg_price 추가
        dataset_df['avg_price'] = self.add_avgPrice(dataset_df)

        if latest == True:
            # 가장 예전 데이터 삭제 - norm이랑 original 둘 다 적용
            self.dataset = self.dataset.drop([self.dataset.index[0]]).drop(columns=['after10'])
            self.norm_dataset = self.norm_dataset.drop([self.norm_dataset.index[0]])

            # ori dataset에 추가
            self.dataset = pd.concat([self.dataset, dataset_df])
            self.dataset = self.dataset.reset_index(drop=True)

            # min max 정규화 (MinMaxScaler) 적용
            self.norm_dataset = self.MinMax(self.dataset)

            # after10 추가
            self.dataset['after10'] = self.add_after10(self.dataset)


        else:
            # min max 정규화 (MinMaxScaler) 적용
            self.norm_dataset = self.MinMax(dataset_df)

            # after10 추가
            dataset_df['after10'] = self.add_after10(dataset_df)

        # 예측될 값(label)인 10분 후 가격
        self.norm_dataset['after10'] = self.add_after10(self.norm_dataset)

        return self.norm_dataset.drop(columns=['after10']), self.norm_dataset['after10'], dataset_df

    def csv_parsing(self, data_path):
        merge_df = pd.DataFrame()
        data_folders = glob(os.path.join(data_path, '*'))

        for data_folder in tqdm(data_folders):
            data_csvs = glob(os.path.join(data_folder,'*.csv'))

            for data_csv in data_csvs :
                csv_df = pd.read_csv(data_csv).drop(columns=["Unnamed: 0"])
                merge_df = pd.concat([merge_df, csv_df], ignore_index=True)

        return self.preprocess(merge_df)


    # dataset에 window 적용
    def windowed_dataset(self, data, label, window_size, batch_size):
        sliced_data = tf.data.Dataset.from_tensor_slices(data)
        sliced_data = sliced_data.window(window_size, shift=1, stride=1, drop_remainder=True)
        sliced_data = sliced_data.flat_map(lambda x: x.batch(window_size))

        sliced_label = tf.data.Dataset.from_tensor_slices(label[window_size:])

        sliced_dataset = tf.data.Dataset.zip((sliced_data, sliced_label))

        return sliced_dataset.batch(batch_size).prefetch(1)





In [5]:
def get_model_BiLSTM(input_shape):
    input = Input(shape=input_shape)
    x = Bidirectional(LSTM(128, return_sequences=True, activation='tanh', dropout=0.3))(input)
    x = Bidirectional(LSTM(64, return_sequences=True, activation='tanh', dropout=0.3))(x)
    x = Bidirectional(LSTM(32, return_sequences=True, activation='tanh', dropout=0.3))(x)
    x = GlobalAveragePooling1D()(x)
    output = Dense(1)(x)
    model = Model(input, output)
    return model

def get_model(input_shape):
    input = Input(shape=input_shape)
    x = LSTM(128, return_sequences=True, activation='tanh', dropout=0.2)(input)
    x = LSTM(64, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = LSTM(32, return_sequences=True, activation='tanh', dropout=0.2)(x)
    x = GlobalAveragePooling1D()(x)
    output = Dense(1)(x)
    model = Model(input, output)
    return model


WINDOW_SIZE = 6
feature_n = 6

Bi_WINDOW_SIZE = 12
Bi_feature_n = 7

Input_shape = (Bi_WINDOW_SIZE, Bi_feature_n)
model = get_model_BiLSTM(Input_shape)
model.load_weights('../checkpoints/BiLSTM/ckeckpointer.ckpt')

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 12, 7)]           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 12, 256)           139264    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 12, 128)           164352    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 12, 64)            41216     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 344,897
Trainable params: 344,897
Non-trainable params: 0
_____________________________________________________

In [85]:
from easydict import EasyDict
import datetime

options = {
    'ticker' : 'KRW-BTC',
    'interval' : 'minute10',
#     'to' : '2021-10-01 00:00',
    'to' : datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
    'count' : 2000
}

args = EasyDict(options)

processed_data = Data_preprocess(args)

{'ticker': 'KRW-BTC', 'interval': 'minute10', 'to': '2021-11-24 00:40', 'count': 2000}


In [86]:
print(processed_data.data)

          open      high       low     close    volume     value  avg_price
0     0.850951  0.825273  0.847061  0.850486  0.060696  0.066349   0.850742
1     0.852688  0.826131  0.854009  0.842430  0.041252  0.045141   0.851139
2     0.844241  0.822777  0.851589  0.837454  0.069364  0.075852   0.846293
3     0.837531  0.843916  0.853931  0.853566  0.069715  0.076439   0.854635
4     0.853635  0.838534  0.856741  0.842035  0.029963  0.032841   0.855131
...        ...       ...       ...       ...       ...       ...        ...
1995  0.134839  0.151326  0.157936  0.169734  0.101392  0.098817   0.152476
1996  0.170601  0.163027  0.168866  0.166338  0.096907  0.094651   0.166379
1997  0.167048  0.155850  0.172613  0.151568  0.061285  0.059795   0.160897
1998  0.151970  0.143682  0.170583  0.150541  0.061014  0.059442   0.153230
1999  0.157022  0.143058  0.160981  0.150462  0.054595  0.053174   0.151880

[2000 rows x 7 columns]


In [87]:
print(processed_data.label)

0       0.842430
1       0.837454
2       0.853566
3       0.842035
4       0.838243
          ...   
1995    0.166338
1996    0.151568
1997    0.150541
1998    0.150462
1999    0.000000
Name: after10, Length: 2000, dtype: float64


In [88]:
def inference(lstm_model, processed_data, window_size, batch_size):
    dataset = processed_data.windowed_dataset(processed_data.data, processed_data.label, window_size, batch_size)
    pred = lstm_model.predict(dataset)
    pred = pred[:, 0]
    
    return pred

In [89]:
Bi_WINDOW_SIZE = 12

WINDOW_SIZE = 6
BATCH_SIZE=1

pred = inference(model, processed_data, Bi_WINDOW_SIZE, BATCH_SIZE)


In [90]:
actual = processed_data.label[WINDOW_SIZE:].reset_index(drop=True)

In [91]:
actual

0       0.838875
1       0.847484
2       0.847879
3       0.848274
4       0.874654
          ...   
1989    0.166338
1990    0.151568
1991    0.150541
1992    0.150462
1993    0.000000
Name: after10, Length: 1994, dtype: float64

In [92]:
pred_rate_val = []
act_rate_val = []
accuracy = []

for i in range(0, len(pred) - 2) :
    rate_pred = pred[i + 1]/pred[i]
    pred_rate_val.append(rate_pred)
    
    if rate_pred < 1:
        # Down
        pred_tmp = 0
    else:
        # Up
        pred_tmp = 1
        
    rate_act = actual[i + 2] / actual[i + 1]
    act_rate_val.append(rate_act)
    
    if rate_act < 1:
        # Down
        act_tmp = 0
    else:
        # Up
        act_tmp = 1
        
    if pred_tmp == act_tmp :
        accuracy.append(1)
    else :
        accuracy.append(0)

  app.launch_new_instance()


In [93]:
# 정확도
sum(accuracy) / len(accuracy)

0.5936555891238671

In [94]:
import numpy as np

np_pred_rate_val = np.array(pred_rate_val)
np_accuracy = np.array(accuracy)

In [95]:
import copy

print(np_pred_rate_val)
print(np_accuracy.shape)

a_np_pred_rate_val =  copy.deepcopy(np_pred_rate_val)
b_np_pred_rate_val =  copy.deepcopy(np_pred_rate_val)

[1.0018128 1.001249  1.0015295 ... 1.0224224 1.0245037 1.0220371]
(1986,)


# 몇 퍼센트 올랐는지 구하기 - 절대값 없이

In [96]:
a_np_pred_rate_val = a_np_pred_rate_val - 1
a_np_pred_rate_val = a_np_pred_rate_val * 100
a_np_pred_rate_val

array([0.18128157, 0.12489557, 0.15294552, ..., 2.2422433 , 2.4503708 ,
       2.2037148 ], dtype=float32)

### 정답이 값 중에서 가장 큰 값과 가장 작은 값

In [97]:
correct = a_np_pred_rate_val[ np_accuracy[:]==1 ]
print(correct.shape)
print(min(correct))
print(max(correct))
correct

(1179,)
-10.621738
9.616459


array([0.18128157, 0.12489557, 0.15294552, ..., 1.9035697 , 2.2422433 ,
       2.2037148 ], dtype=float32)

### % 구간 정리 - 대체적으로 -10 ~ +10 사이인듯 (최근 데이터 기준)

In [98]:
correct_down = correct[correct[:] < 0]
print(f"{min(correct_down)} ~ {max(correct_down)}")
print(correct_down.shape)

correct_up = correct[correct[:] > 0]
print(f"{min(correct_up)} ~ {max(correct_up)}")
print(correct_up.shape)

-10.62173843383789 ~ -0.0010371208190917969
(639,)
0.0005841255187988281 ~ 9.616458892822266
(540,)


### 각 1퍼센트 기준으로 구간을 나누고 정답 개수 출력
- 0 ~ 1 사이에 모든 정답의 77%가 있음 (2021-11-23 23:55 기준)
- 0 ~ 2 사이에 모든 정답의 87%가 있음 (2021-11-23 23:55 기준)

In [99]:
correct_tmp = correct[correct[:] > 0 ]
tmp_shape = 0
for i in range(1, 11) :
    correct_1 = correct_tmp[correct_tmp[:] < (i) ]
    correct_UP = correct_tmp[correct_tmp[:] < (i + 1) ]
    print(f"{i-1} ~ {i} : {correct_1.shape[0] - tmp_shape}")
    tmp_shape = correct_1.shape[0]
    print(f"누적 개수 : {tmp_shape}")
    

0 ~ 1 : 416
누적 개수 : 416
1 ~ 2 : 54
누적 개수 : 470
2 ~ 3 : 32
누적 개수 : 502
3 ~ 4 : 21
누적 개수 : 523
4 ~ 5 : 5
누적 개수 : 528
5 ~ 6 : 3
누적 개수 : 531
6 ~ 7 : 4
누적 개수 : 535
7 ~ 8 : 3
누적 개수 : 538
8 ~ 9 : 1
누적 개수 : 539
9 ~ 10 : 1
누적 개수 : 540


### 구간 별 개수 찾기
- 0 ~ 1 사이에 모든 값들 기준 78%가 있음. (2021-11-23 23:55 기준)

In [65]:
all_data = copy.deepcopy(a_np_pred_rate_val)
correct_tmp = all_data[all_data[:] > 0 ]
print(f"total = {correct_tmp.shape}")
tmp_shape = 0
for i in range(1 , 50) :
    correct_1 = correct_tmp[correct_tmp[:] < (i/10) ]
    correct_UP = correct_tmp[correct_tmp[:] < (i/10 + 0.1) ]
    print(f"{i/10 - 0.1:.1f} ~ {i/10} : {correct_1.shape[0] - tmp_shape}")
    tmp_shape = correct_1.shape[0]
    print(tmp_shape)

    
# correct_1 = correct_tmp[correct_tmp[:] < 1 ]
# correct_UP = correct_tmp[correct_tmp[:] < 2 ]
# print(correct_1.shape)
# print(correct_UP.shape)


total = (921,)
0.0 ~ 0.1 : 193
193
0.1 ~ 0.2 : 160
353
0.2 ~ 0.3 : 92
445
0.3 ~ 0.4 : 64
509
0.4 ~ 0.5 : 44
553
0.5 ~ 0.6 : 53
606
0.6 ~ 0.7 : 44
650
0.7 ~ 0.8 : 27
677
0.8 ~ 0.9 : 27
704
0.9 ~ 1.0 : 21
725
1.0 ~ 1.1 : 16
741
1.1 ~ 1.2 : 9
750
1.2 ~ 1.3 : 10
760
1.3 ~ 1.4 : 9
769
1.4 ~ 1.5 : 13
782
1.5 ~ 1.6 : 10
792
1.6 ~ 1.7 : 3
795
1.7 ~ 1.8 : 5
800
1.8 ~ 1.9 : 4
804
1.9 ~ 2.0 : 5
809
2.0 ~ 2.1 : 4
813
2.1 ~ 2.2 : 5
818
2.2 ~ 2.3 : 7
825
2.3 ~ 2.4 : 3
828
2.4 ~ 2.5 : 10
838
2.5 ~ 2.6 : 4
842
2.6 ~ 2.7 : 3
845
2.7 ~ 2.8 : 6
851
2.8 ~ 2.9 : 3
854
2.9 ~ 3.0 : 6
860
3.0 ~ 3.1 : 3
863
3.1 ~ 3.2 : 6
869
3.2 ~ 3.3 : 2
871
3.3 ~ 3.4 : 2
873
3.4 ~ 3.5 : 2
875
3.5 ~ 3.6 : 4
879
3.6 ~ 3.7 : 2
881
3.7 ~ 3.8 : 3
884
3.8 ~ 3.9 : 7
891
3.9 ~ 4.0 : 2
893
4.0 ~ 4.1 : 1
894
4.1 ~ 4.2 : 1
895
4.2 ~ 4.3 : 2
897
4.3 ~ 4.4 : 0
897
4.4 ~ 4.5 : 0
897
4.5 ~ 4.6 : 1
898
4.6 ~ 4.7 : 0
898
4.7 ~ 4.8 : 2
900
4.8 ~ 4.9 : 1
901


### 모든 값들 중 가장 변화율이 큰 값과 작은 값

In [37]:
print(a_np_pred_rate_val.shape)
print(min(a_np_pred_rate_val))
print(max(a_np_pred_rate_val))
a_correct = copy.deepcopy(a_np_pred_rate_val)

(1986,)
-11.454636
10.437751


### % 구간 정리 - 대체적으로 -10 ~ +10 사이인듯 (최근 데이터 기준)

In [38]:
correct_down = a_correct[a_correct[:] < 0]
print(f"{min(correct_down)} ~ {max(correct_down)}")
print(correct_down.shape)

correct_up = a_correct[a_correct[:] > 0]
print(f"{min(correct_up)} ~ {max(correct_up)}")
print(correct_up.shape)

-11.454635620117188 ~ -0.0010371208190917969
(1065,)
0.0003457069396972656 ~ 10.437750816345215
(921,)


### 각 1퍼센트 기준으로 구간을 나누고 개수 출력

In [39]:
correct_tmp = a_correct[a_correct[:] > 0 ]

correct_1 = correct_tmp[correct_tmp[:] < 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 2 ]
print(correct_1.shape)
print(correct_UP.shape)
print()

correct_2 = correct_UP[correct_UP[:] > 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 3 ]
print(correct_2.shape)
print(correct_UP.shape)
print()

correct_3 = correct_UP[correct_UP[:] > 2 ]
correct_UP = correct_tmp[correct_tmp[:] < 4 ]
print(correct_3.shape)
print(correct_UP.shape)
print()

correct_4 = correct_UP[correct_UP[:] > 3 ]
correct_UP = correct_tmp[correct_tmp[:] < 5 ]
print(correct_4.shape)
print(correct_UP.shape)
print()

correct_5 = correct_UP[correct_UP[:] > 4 ]
correct_UP = correct_tmp[correct_tmp[:] < 6 ]
print(correct_5.shape)
print(correct_UP.shape)
print()

correct_6 = correct_UP[correct_UP[:] > 5 ]
correct_UP = correct_tmp[correct_tmp[:] < 7 ]
print(correct_6.shape)
print(correct_UP.shape)
print()

correct_7 = correct_UP[correct_UP[:] > 6 ]
correct_UP = correct_tmp[correct_tmp[:] < 8 ]
print(correct_7.shape)
print(correct_UP.shape)
print()

correct_8 = correct_UP[correct_UP[:] > 7 ]
correct_UP = correct_tmp[correct_tmp[:] < 9 ]
print(correct_8.shape)
print(correct_UP.shape)
print()

correct_9 = correct_UP[correct_UP[:] > 8 ]
correct_UP = correct_tmp[correct_tmp[:] < 10 ]
print(correct_9.shape)
print(correct_UP.shape)
print()

correct_10 = correct_UP[correct_UP[:] > 9 ]
correct_UP = correct_tmp[correct_tmp[:] < 11 ]
print(correct_10.shape)
print(correct_UP.shape)
print()

correct_11 = correct_UP[correct_UP[:] > 10 ]
correct_UP = correct_tmp[correct_tmp[:] < 12 ]
print(correct_11.shape)
print(correct_UP.shape)
print()

correct_12 = correct_UP[correct_UP[:] > 11 ]
correct_UP = correct_tmp[correct_tmp[:] < 13 ]
print(correct_12.shape)
print(correct_UP.shape)
print()

(725,)
(809,)

(84,)
(860,)

(51,)
(893,)

(33,)
(902,)

(9,)
(905,)

(3,)
(910,)

(5,)
(914,)

(4,)
(918,)

(4,)
(920,)

(2,)
(921,)

(1,)
(921,)

(0,)
(921,)



### 구간 별 누적그래프 그려보기

### 각 구간 별 정답 개수 체크

In [192]:
correct_tmp = a_correct[a_correct[:] > 0 ]

correct_1 = correct_tmp[correct_tmp[:] < 1 ]
correct_UP = correct_tmp[correct_tmp[:] < 2 ]
print(correct_1.shape)
print(correct_UP.shape)
print()

bool_correct = copy.deepcopy(correct_tmp)
bool_correct[ np_accuracy[:]==1 ] = -1
print(bool_correct.shape)
print(min(bool_correct))
print(max(bool_correct))
print(bool_correct)

correct_1[]


(699,)
(876,)

(1986,)
-10.377079
10.152971
[-1.         -1.          0.21060705 ... -1.         -0.52125454
 -1.        ]


In [46]:
a_np_pred_rate_val[ np_accuracy[:]==1 ] = 0
a_np_pred_rate_val

array([0.       , 0.       , 0.       , ..., 1.0102553, 1.0171678,
       1.0174936], dtype=float32)

In [19]:
b_np_pred_rate_val[ np_accuracy[:]==False ] = 0
b_np_pred_rate_val

array([0.9923874 , 0.99104947, 0.98621666, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [20]:
np_pred_rate_val[ np_accuracy[:]==1 ] = 0

In [21]:
print(992 - np.count_nonzero(np_pred_rate_val))

155


In [22]:
c = 0
for i in np_pred_rate_val:
    if i == 0:
        c += 1
c

1149

In [23]:
c/993

1.1570996978851964