# --> Importations

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random
import time

# --> Global variables

In [2]:
# Window size: each input sequence covers 60 minutes of history.
SEQ_LEN = 60
# Prediction horizon: the target looks 3 minutes ahead.
FUTURE_PERIOD_PREDICT = 3
# Pair whose price movement is predicted.
RATIO_TO_PREDICT = "BTC-USD"

# Training hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 64

# Run identifier: the settings above plus the current Unix timestamp (whole seconds).
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

# --> Définition des classes

In [3]:
def classify(current, future):
    """Label a price move as buy (1) or sell (0).

    Both arguments are converted with float() before being compared.

    Args:
        current: Price at the current time step.
        future: Price at the later time step.

    Returns:
        1 (buy) when `future` is strictly greater than `current`,
        otherwise 0 (sell). Equal prices therefore yield 0.
    """
    price_went_up = float(future) > float(current)
    return 1 if price_went_up else 0

# --> Data processing

In [4]:
# Build one frame indexed by "time" holding the close price and volume of every pair.
main_df = pd.DataFrame()
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]
for ratio in ratios:
    ratio = ratio.split('.csv')[0]  # Keep only the text before any ".csv"
    dataset = f"crypto_data/{ratio}.csv"
    # Column names are supplied explicitly through `names`.
    df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"])
    # Prefix the two kept columns with the pair name so they stay distinct after joining.
    df = df.rename(columns={"close": f"{ratio}_close",
                            "volume": f"{ratio}_volume"})
    df = df.set_index("time")
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]
    if len(main_df) == 0:
        main_df = df
    else:
        # Join on the "time" index; rows are aligned to the frame built so far.
        main_df = main_df.join(df)

# Fill gaps with the last known value. `DataFrame.ffill()` replaces the deprecated
# `fillna(method="ffill")` spelling.
main_df = main_df.ffill()
# Rows that still contain NaN (nothing earlier to fill from) are discarded.
main_df = main_df.dropna()

print(main_df.head(3))

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  


In [5]:
# "future" holds the close price FUTURE_PERIOD_PREDICT rows later (negative shift pulls
# later rows upward); the final rows have no later value and become NaN.
close_column = f'{RATIO_TO_PREDICT}_close'
main_df['future'] = main_df[close_column].shift(-FUTURE_PERIOD_PREDICT)
print(main_df[[close_column, "future"]].head(3))

            BTC-USD_close       future
time                                  
1528968720    6487.379883  6479.979980
1528968780    6479.410156  6480.000000
1528968840    6479.410156  6477.220215


In [6]:
# A target of 1 means the price is higher FUTURE_PERIOD_PREDICT rows later.
current_prices = main_df[f'{RATIO_TO_PREDICT}_close']
future_prices = main_df['future']
main_df['target'] = [classify(now, later)
                     for now, later in zip(current_prices, future_prices)]
print(main_df[[f"{RATIO_TO_PREDICT}_close", "future", "target"]].head(10))
# Remove rows with missing values (e.g. rows whose "future" price is NaN).
main_df.dropna(inplace=True)

            BTC-USD_close       future  target
time                                          
1528968720    6487.379883  6479.979980       0
1528968780    6479.410156  6480.000000       1
1528968840    6479.410156  6477.220215       0
1528968900    6479.979980  6480.000000       1
1528968960    6480.000000  6479.990234       0
1528969020    6477.220215  6478.660156       1
1528969080    6480.000000  6478.660156       0
1528969140    6479.990234  6479.339844       0
1528969200    6478.660156  6479.350098       1
1528969260    6478.660156  6479.990234       1


# --> Train/validation split and data normalization

In [7]:
# Timestamps in ascending order.
times = sorted(main_df.index.values)
# Number of trailing timestamps reserved for validation (5% of the data). At least one
# is kept: with fewer than 20 rows int(0.05 * len) is 0, and times[-0] would select the
# FIRST timestamp, sending every row to the validation set.
validation_count = max(1, int(0.05 * len(times)))
# Cut-off timestamp: rows at or after it are validation data. The already sorted list
# is reused instead of sorting the index a second time.
last_5_percent = times[-validation_count]
print(last_5_percent)

1534921920


In [8]:
# Rows at or after the cut-off timestamp form the validation set; earlier rows remain
# as training data (main_df is narrowed to the training part from here on).
in_validation = main_df.index >= last_5_percent
in_training = main_df.index < last_5_percent
validation_main_df = main_df.loc[in_validation]
main_df = main_df.loc[in_training]
print(validation_main_df)
print("========================================================================================")
print(main_df)

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1534921920    6686.250000        2.678847      57.509998        6.070000   
1534921980    6686.250000        0.220156      57.509998       15.697691   
1534922040    6685.000000        6.401611      57.509998        0.212400   
1534922100    6684.500000        0.969366      57.509998       66.463028   
1534922160    6684.500000        0.611018      57.509998        3.616516   
...                   ...             ...            ...             ...   
1535214780    6708.379883        0.975295      58.009998       14.458084   
1535214840    6710.089844        1.293573      58.009998       93.464951   
1535214900    6712.990234        2.330975      58.020000        0.823356   
1535214960    6713.140137        0.769891      58.020000        6.434783   
1535215020    6714.520020        1.002652      58.009998        7.301921   

           

In [9]:
def preprocess_df(df):
    """Turn a price/volume frame into balanced, shuffled sequences for training.

    Steps:
      1. Drop the "future" column so the network cannot see the look-ahead price.
      2. For every column except "target": replace values by their percent change,
         drop rows containing NaN, then standardize with sklearn's `preprocessing.scale`.
         Rows are dropped after each column, so one leading row is lost per feature column.
      3. Slide a window of SEQ_LEN consecutive rows over the frame; each full window is
         paired with the last value of its final row as the label.
      4. Balance the two classes by truncating the larger one, then shuffle.

    Args:
        df: DataFrame with feature columns, a "future" column and a "target" column.
            The windowing code treats the LAST column as the label, so "target" is
            assumed to be the last column once "future" is removed.

    Returns:
        A tuple (X, y): X is a numpy array stacking the selected windows, y is a Python
        list with the matching labels (kept as a list; callers use `list.count`).
    """
    # `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
    df = df.drop(columns="future")
    for col in df.columns:
        if col != "target":  # Targets must stay 0 or 1, so they are not normalized
            # A percent change computed from a zero value is +/-inf; mark it as missing
            # so it is dropped with the NaN rows instead of breaking the scaling step.
            df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
            df.dropna(inplace=True)  # `df` is the local copy returned by drop()
            df[col] = preprocessing.scale(df[col].values)
    df.dropna(inplace=True)

    sequential_data = []
    # Rolling window: once SEQ_LEN rows are stored, appending evicts the oldest one.
    prev_days = deque(maxlen=SEQ_LEN)
    for row in df.values:  # .values drops the time index; the label is still the last item
        prev_days.append(list(row[:-1]))  # Features only, without the trailing label
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), row[-1]])
    random.shuffle(sequential_data)

    # Split by class, then truncate both classes to the size of the smaller one.
    buys = [[seq, target] for seq, target in sequential_data if target == 1]
    sells = [[seq, target] for seq, target in sequential_data if target == 0]
    random.shuffle(buys)
    random.shuffle(sells)
    lower = min(len(buys), len(sells))
    sequential_data = buys[:lower] + sells[:lower]
    # Mix the classes again so buys and sells are interleaved randomly.
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    print(y[:5])
    print("=======================================================")
    return np.array(X), y

In [10]:
# Shapes of the raw training and validation frames before preprocessing.
for frame in (main_df, validation_main_df):
    print(frame.shape)

# Convert both frames into (sequences, labels) pairs.
train_x, train_y = preprocess_df(main_df)
valid_x, valid_y = preprocess_df(validation_main_df)

# Shape of a single sequence and number of labels for each split.
for sequences, labels in ((train_x, train_y), (valid_x, valid_y)):
    print(sequences[0].shape, len(labels))

(92834, 10)
(4886, 10)
[1.0, 0.0, 1.0, 1.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 1.0]
(60, 8) 83156
(60, 8) 4478


In [11]:
# Split sizes and per-class counts for both sets, one summary per output line.
print(
    f"train data: {len(train_x)} validation : {len(valid_x)}",
    f"dont buy: {train_y.count(0)} buy : {train_y.count(1)}",
    f"validation dont buy: {valid_y.count(0)} validation buy : {valid_y.count(1)}",
    sep="\n",
)

train data: 83156 validation : 4478
dont buy: 41578 buy : 41578
validation dont buy: 2239 validation buy : 2239


# --> Building model

>Ce modèle est un RNN (LSTM) qui prédit si le prix de BTC-USD va monter ou baisser, à partir des données de plusieurs cryptomonnaies.

In [None]:
model = Sequential()

# No activation function is specified for the LSTM layers, so the default (tanh) is used;
# per the original author's note this lets the cuDNN LSTM implementation be selected.
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Last recurrent layer returns only its final output, feeding the dense head.
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# Two output units: one per class (0 = sell, 1 = buy).
model.add(Dense(2, activation='softmax'))

# `learning_rate` is the current argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is only accepted by older/legacy Keras optimizers - confirm it
# is supported by the installed TensorFlow version.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=1e-6)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"]
)

# Training logs can be inspected with "tensorboard --logdir=logs/".
tensorboard = TensorBoard(log_dir=f"logs/{NAME}")

# Unique file name containing the epoch number and the validation accuracy.
# With metrics=["accuracy"] TF2 logs the validation metric as "val_accuracy".
file_path = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"
# The monitoring options are arguments of ModelCheckpoint (not of str.format), so that
# only the best models according to validation accuracy are saved.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(file_path),
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)

model.summary()

# Labels are converted to arrays because preprocess_df returns them as Python lists.
history = model.fit(
    np.asarray(train_x), np.asarray(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(np.asarray(valid_x), np.asarray(valid_y)),
    callbacks=[tensorboard, checkpoint],
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 60, 128)           70144     
_________________________________________________________________
dropout (Dropout)            (None, 60, 128)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 60, 128)           512       
_________________________________________________________________
lstm_1 (LSTM)                (None, 60, 128)           131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 128)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 60, 128)           512       
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               1

# --> Évaluation du modèle et sauvegarde après entraînement

In [None]:
# Evaluate on the validation split. The labels are converted to an array, as in the
# fit() call, because preprocess_df returns them as a Python list.
score = model.evaluate(np.asarray(valid_x), np.asarray(valid_y), verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save the trained model under the run identifier.
model.save("models/{}".format(NAME))