## Dataset Preprocessing

we will use the BTC-USD, LTC-USD, ETH-USD and BCH-USD datasets to train our model, however it is necessary to perform
data preprocessing before actually feed our model 

In [17]:
import pandas as pd

# Test if we can actually read the data from the dataset folder:

dataset = pd.read_csv("Datasets/LTC-USD.csv", names=['time', 'low', 'high', 'open', 'close', 'volume'])
print(dataset.head())

         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978


In [18]:
# Define a Function to select only "close" and "volume" columns
# then merge in a unique dataframe
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]
path = "Datasets/"

dataset = pd.DataFrame()

def merge_datasets(ratios, path, dataset_merge):
    
    # Read datasets from the path
    for ratio in ratios:
        #Set the path to each cryptocurency dataset and read them:
        dataset_path = path+f'{ratio}.csv'
        dataset = pd.read_csv(dataset_path, names=['time', f'{ratio}_low', f'{ratio}_high', f'{ratio}_open', f'{ratio}_close', f'{ratio}_volume'])
        
        # take only "close" and "volume" columns from each crypto dataset:
        dataset.set_index("time", inplace=True) #since time is a common value for each dataset, we define it as the index 
        dataset_close_vol = dataset[[f'{ratio}_close', f'{ratio}_volume']] # take close and volume
        
        # merge values:
        if len(dataset_merge)==0:
            dataset_merge = dataset_close_vol
        else:
            dataset_merge = dataset_merge.join(dataset_close_vol)

    dataset_merge.fillna(method="ffill", inplace=True) #fill gaps with the previous know value
    dataset_merge.dropna(inplace=True)
    return dataset_merge
    
        

In [19]:
# Verify if the function works as expected:
main_dataset = merge_datasets(ratios,path,dataset)
print(main_dataset.head())


            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000       77.355759  
1528968960     86

as we can see, we already sucessfully merge the data from the different crypocurrencies datasets, know we need to perform 
define our inputs and targets, and then apply some normalization to the input data

## Target Definition

Since RNN uses sequential data, we need to prepare our target vector considering the length of the sequence that we will use 
as input, the cryptocurrency that we want to predict and finally the temporary window behaviour (up/down) that we want to predict

In [20]:
# Variables used to create a target vector
SEQUENCE_LENGTH = 60 # Take the previous 60 instances to make predictions
FUTURE_PERIOD_PREDICTION = 3 # we want to predict the cryptocurrency behaviour in a period of 3 min
RATIO_TO_PREDICT = "LTC-USD"

now, it is necessary to define a function that take the current value and the future value of the crypto and then
set a binary value of 1 when we should buy and 0 when we have to sell our asset

In [21]:
def buy_sell_classification(current_value, future_value):
    if float(future_value) > float(current_value):
        return 1
    else:
        return 0

In [22]:
def build_target_vector(dataset, ratio_to_predict, future_period_prediction):
    # create a new column with future values:
    dataset[f'future_{ratio_to_predict}'] = dataset[f'{ratio_to_predict}_close'].shift(-future_period_prediction)
    
    # create a target column
    dataset[f'target_{ratio_to_predict}'] = list(map(buy_sell_classification, dataset[f'{ratio_to_predict}_close'], dataset[f'future_{ratio_to_predict}']))
    
    return dataset

In [23]:
main_dataset_with_target = build_target_vector(main_dataset, RATIO_TO_PREDICT, FUTURE_PERIOD_PREDICTION)

# Verify values:
print(main_dataset_with_target.shape)
print(main_dataset_with_target[[f'{RATIO_TO_PREDICT}_close', f'future_{RATIO_TO_PREDICT}', f'target_{RATIO_TO_PREDICT}']].head(10))
print(main_dataset_with_target.head())

(97723, 10)
            LTC-USD_close  future_LTC-USD  target_LTC-USD
time                                                     
1528968720      96.660004       96.389999               0
1528968780      96.570000       96.519997               0
1528968840      96.500000       96.440002               0
1528968900      96.389999       96.470001               1
1528968960      96.519997       96.400002               0
1528969020      96.440002       96.400002               0
1528969080      96.470001       96.400002               0
1528969140      96.400002       96.400002               0
1528969200      96.400002       96.400002               0
1528969260      96.400002       96.449997               1
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       7

## Training Data, Test Data and Normalization

Since we are trying to make predictions based on temporal data, it is necessary to split and sort our dataset into little sequential chuncks of data and apply some normalization (because the range value of all the different assets is way too different between cryptocurrencies)

In [24]:
# Split our dataset into training and validation sets
def split_dataset(dataset):
    # take time as index and sort it
    times = sorted(dataset.index.values)
    last_percentage = sorted(dataset.index.values)[-int(0.25*len(times))]
    # use index to take our validation and training data
    validation_dataset = dataset[(dataset.index >= last_percentage)]
    training_dataset = dataset[(dataset.index < last_percentage)]
    
    return training_dataset, validation_dataset

In [25]:
#verify function:
train_dataset, validation_dataset = split_dataset(main_dataset_with_target)
print(main_dataset_with_target.shape)
print(train_dataset.shape)
print(validation_dataset.shape)

(97723, 10)
(73293, 10)
(24430, 10)


In [26]:
# Lets preprocess the data:
from sklearn import preprocessing
from collections import deque
import numpy as np
import random

def preprocess_dataset(df):
    df = df.drop(f'future_{RATIO_TO_PREDICT}', 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != f'target_{RATIO_TO_PREDICT}':  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic.


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQUENCE_LENGTH)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQUENCE_LENGTH:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array!


In [27]:
X_train, y_train = preprocess_dataset(train_dataset)

X_validation, y_validation = preprocess_dataset(validation_dataset)

print(f"train data: {len(X_train)} validation: {len(X_validation)}")
print(f"Dont buys: {y_train.count(0)}, buys: {y_train.count(1)}")
print(f"VALIDATION Dont buys: {y_validation.count(0)}, buys: {y_validation.count(1)}")

train data: 60656 validation: 21110
Dont buys: 30328, buys: 30328
VALIDATION Dont buys: 10555, buys: 10555


## MODEL DEFINITION, TRAINING AND VALIDATION
we already have a normalized and balanced dataset to be used to train our model, in the following steps we will create a model, we will train it and validate their performance

In [28]:
import time

# SET OF VARIABLES USED TO TRAIN AND SAVE OUR MODEL:
EPOCHS = 10
BATCH_SIZE = 64
NAME = f'{RATIO_TO_PREDICT}-SEQ-{FUTURE_PERIOD_PREDICTION}-PRED-{int(time.time())}'

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

lets create the RNN LSTM Model

In [33]:
model = Sequential()
model.add(CuDNNLSTM(128, input_shape=(X_train.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

In [34]:
# Compile the model:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(
    loss= 'sparse_categorical_crossentropy',
    optimizer= opt,
    metrics= ['accuracy']
)

In [35]:
# Tensorboard callback
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

# Checkpoint
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}.hdf5"
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max'))

In [36]:
# Train the model:

history = model.fit(
    X_train, y_train,
    batch_size= BATCH_SIZE,
    epochs= EPOCHS,
    validation_data= (X_validation, y_validation), 
    callbacks= [tensorboard, checkpoint] 
)

# Score model
score = model.evaluate(X_validation, y_validation, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

Train on 60656 samples, validate on 21110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.6801659889776861
Test accuracy: 0.566508763636079
