## sentdex - 딥러닝 Pt.11

<h3> 목표 : 드디어 예측!</h3>
<hr>

In [1]:
import warnings

warnings.filterwarnings(action = "ignore")

In [2]:
import tensorflow as tf

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.6)
sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))

In [3]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
# import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization #DuDNNLSTM
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

# 60분을 통해 3분 (미래) 예측

SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "LTC-USD"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
    
def preprocess_df(df):
    
    # "future" 열은 이제 필요 없음
    df = df.drop("future", axis = 1)
    
    # scaling
    for col in df.columns:
        if col != "target":
            
            df[col] = df[col].pct_change()
            df.dropna(inplace = True)
            df[col] = preprocessing.scale(df[col].values)
    
    # 혹시 모르니까..
    df.dropna(inplace = True)
    
    
    sequential_data = []
    prev_mins = deque(maxlen = SEQ_LEN)
    
    for i in df.values:  # df.values: df의 각 행
        prev_mins.append([n for n in i[:-1]])  # why i[:-1]? : target class를 미포함시키기 위해서
        
        if len(prev_mins) == SEQ_LEN:
            sequential_data.append([np.array(prev_mins), i[-1]])  # 지난 60일 ( (i)일 기준 ) + _일의 타겟값(i)
            
    random.shuffle(sequential_data)
    
    # 밸런싱합시다!
    # 사야될 날과 안되는 날을 구분
    buys, sells = [], []
    
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
            
    # 섞어섞어
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))
    
    # 최소 사이즈에 맞춰서 둘 다 잘라버림 - 밸런싱
    buys = buys[:lower]
    sells = sells[:lower]
    
    # 연쇄
    sequential_data = buys + sells
    random.shuffle(sequential_data)
    
    # X와 y로 스플릿
    X = []
    y = []
    
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
    
    return np.array(X), y




main_df = pd.DataFrame()

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]
for ratio in ratios:
    dataset = f"crypto_data/{ratio}.csv"
    
    df = pd.read_csv(dataset,
                     names = ["time", "low", "high", "open", "close", "volume"])
    df.rename(columns = {"close" : f"{ratio}_close", "volume" : f"{ratio}_volume"}, inplace = True)
    
    df.set_index("time", inplace = True)
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)
        
# "future" 값은 사실 "close"가 3일치 밀린 (shifted) 값이다!
main_df["future"] = main_df[ f"{RATIO_TO_PREDICT}_close" ].shift(-FUTURE_PERIOD_PREDICT)        

# 분류합시다 (ML은 아니지만)
main_df['target'] = list(map(classify, main_df[ f"{RATIO_TO_PREDICT}_close"], main_df["future"]))




times = sorted(main_df.index.values)
last_5pct = times[-int(0.05 * len(times))] # threshold

validation_main_df = main_df[ (main_df.index >= last_5pct)] # validation (5%)
main_df = main_df[ (main_df.index < last_5pct)] # the rest

train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 69188 validation: 3062
Dont buys: 34594, buys: 34594
VALIDATION Dont buys: 1531, buys: 1531


### 텐서보드 이용법

1. **모델 피팅 코드 (아래) 돌리기**
2. **돌리는 동안 cmd 열기**
3. **" tensorboard --logdir="DL_Recap/sentdex/logs/" (디렉토리 위치는 컴퓨터 환경에 따라 바뀔 수 있음) 커맨드 입력**
4. **cmd 켜놓은 상태로 http://localhost:6006에 접속**

**로그를 볼 수 없는 오류는?**    https://github.com/tensorflow/tensorboard/issues/3117

# #CuDNNLSTM 이용

In [5]:
from tensorflow.keras.layers import CuDNNLSTM

model = Sequential()
model.add( CuDNNLSTM(128, input_shape = (train_x.shape[1:]), return_sequences = True) )
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add( CuDNNLSTM(128, input_shape = (train_x.shape[1:]), return_sequences = True) )
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add( CuDNNLSTM(128, input_shape = (train_x.shape[1:])) ) # 다음에 오는게 Dense layer라서 return_sequences는 필요 없음
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation = "softmax"))  # 둘 중 하나! (일반 classification)

opt = tf.keras.optimizers.Adam(lr = 0.001, decay = 1e-6)

model.compile(loss = "sparse_categorical_crossentropy",
              optimizer = opt,
              metrics = ["accuracy"])



tensorboard = TensorBoard(log_dir = f"logs\\{NAME}")

filepath = "RNN_Final-{epoch:02d}-{val_acc:3f}" # 에폭에 대해서 에폭-validation 정확도를 저장할 유니크한 저장 경로
checkpoint = ModelCheckpoint("models\\{}.model".format(filepath, moniter = "val_acc",
                                                     verbose = 1, save_best_only = True, mode = "max")) # 최고들만 저장

history = model.fit(
    train_x, train_y,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    validation_data = (validation_x, validation_y),
    callbacks = [tensorboard, checkpoint])

Train on 69188 samples, validate on 3062 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# #LSTM 이용

In [4]:
model = Sequential()
model.add( LSTM(128, input_shape = (train_x.shape[1:]),
                activation = "relu", return_sequences = True) )
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add( LSTM(128, input_shape = (train_x.shape[1:]),
                activation = "relu", return_sequences = True) )
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add( LSTM(128, input_shape = (train_x.shape[1:]),
                activation = "relu") ) # 다음에 오는게 Dense layer라서 return_sequences는 필요 없음
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation = "softmax"))  # 둘 중 하나! (일반 classification)

opt = tf.keras.optimizers.Adam(lr = 0.001, decay = 1e-6)

model.compile(loss = "sparse_categorical_crossentropy",
              optimizer = opt,
              metrics = ["accuracy"])



tensorboard = TensorBoard(log_dir = f"logs\\{NAME}")

filepath = "RNN_Final-{epoch:02d}-{val_acc:3f}" # 에폭에 대해서 에폭-validation 정확도를 저장할 유니크한 저장 경로
checkpoint = ModelCheckpoint("models\\{}.model".format(filepath, moniter = "val_acc",
                                                     verbose = 1, save_best_only = True, mode = "max")) # 최고들만 저장

history = model.fit(
    train_x, train_y,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    validation_data = (validation_x, validation_y),
    callbacks = [tensorboard, checkpoint])




Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 69188 samples, validate on 3062 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10

KeyboardInterrupt: 