In [1]:
# downloading the data
# !wget -P ./data https://pythonprogramming.net/static/downloads/machine-learning-data/crypto_data.zip

In [2]:
# core libs
import os
import time
import random
from collections import deque

# preprocessing libs
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

# tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [3]:
# VARIABLES

# path
CWD = os.getcwd()
DATADIR = os.path.join(CWD, 'data', 'crypto_data')

# ml
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = 'ETH-USD'
EPOCHS = 10
BATCH_SIZE = 64
NAME = '{}-{}-SEQ-{}-PRED-{}'.format(RATIO_TO_PREDICT, SEQ_LEN, FUTURE_PERIOD_PREDICT, int(time.time()))

In [4]:
# if future price is greater than current prices, buy the asset, coded as 1
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [5]:
def preprocess_df(df):
    # drop future column
    df = df.drop(labels='future', axis=1)

    # iterate over all columns
    # except target columns
    # use percent of change instead of absolute values
    # drop first row, because there is not data prior that
    # scale aka normalize data between 0 and 1
    for col in df.columns:
        if col != 'target':
            df.loc[:, col] = df.loc[:, col].pct_change()
            df = df.dropna()
            df.loc[:, col] = scale(df.loc[:, col].values)

    # drop NAs
    df = df.dropna()

    # deque builds a list with fixed length
    # when the list is full, e.g. len(deque) > maxlen
    # deque pops oldeste items
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)

    # iterate over all columns
    # iterate over all values
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])

    # shuffle the data
    random.shuffle(sequential_data)

    # create two lists with buys and sells
    buys = []
    sells = []
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
    random.shuffle(buys)
    random.shuffle(sells)

    # balance the lists
    # find the minimum between both lists
    lower = min(len(buys), len(sells))
    buys = buys[:lower]
    sells = sells[:lower]

    # join both lists
    # shuffle the data
    sequential_data = buys + sells
    random.shuffle(sequential_data)

    # split features and target
    X = []
    y = []
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)

    return np.array(X), y

In [6]:
# name of the columns
names = ['time', 'low', 'high', 'open', 'close', 'volume']
# import the data with pandas
df = pd.read_csv(os.path.join(DATADIR, 'LTC-USD.csv'), names=names)
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [7]:
main_df = pd.DataFrame()

# file names
ratios = ['BCH-USD', 'BTC-USD', 'ETH-USD', 'LTC-USD']

# join all files into one big dataframe called main_df
for ratio in ratios:
    dataset = os.path.join(DATADIR, '{}.csv'.format(ratio))
    df = pd.read_csv(dataset, names=names)
    
    df = df.rename(columns={'close': '{}_close'.format(ratio), 'volume': '{}_volume'.format(ratio)})
    df = df.set_index('time')
    df = df.loc[:, ['{}_close'.format(ratio), '{}_volume'.format(ratio)]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)
        
# make future price by negative shifting the data by FUTURE_PERIOD_PREDICT
main_df.loc[:, 'future'] = main_df.loc[:, '{}_close'.format(RATIO_TO_PREDICT)].shift(-FUTURE_PERIOD_PREDICT)

# make target by using the classify function
main_df.loc[:, 'target'] = list(map(classify,
                                    main_df.loc[:, '{}_close'.format(RATIO_TO_PREDICT)],
                                    main_df.loc[:, 'future']))

In [8]:
# check if everything worked
main_df.loc[:, ['{}_close'.format(RATIO_TO_PREDICT), 'future', 'target']].head(10)

Unnamed: 0_level_0,ETH-USD_close,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1528968660,,485.75,0
1528968720,486.01001,486.0,0
1528968780,486.0,486.0,0
1528968840,485.75,485.98999,1
1528968900,486.0,485.98999,0
1528968960,486.0,485.98999,0
1528969020,485.98999,485.98999,0
1528969080,485.98999,486.0,1
1528969140,485.98999,486.0,1
1528969200,485.98999,486.0,1


In [9]:
# take all indices
# find out what is the beginning of the last 5% of the data
# take the last 5% - most recent 5% - to validate the model
# the rest is train data
times = sorted(main_df.index.values)
last5pct = times[-int(0.05 * len(times))]
validation_main_df = main_df.loc[main_df.index >= last5pct, ]
main_df = main_df.loc[main_df.index < last5pct, ]

In [10]:
%%time
# apply the preprocess_df function
X_train, y_train = preprocess_df(main_df)
X_val, y_val = preprocess_df(validation_main_df)

CPU times: user 8.04 s, sys: 240 ms, total: 8.28 s
Wall time: 5.92 s


In [11]:
print('train data: {} validation: {}'.format(len(X_train), len(X_val)))
print('Dont buys: {}, buys: {}'.format(y_train.count(0), y_train.count(1)))
print('VALIDATION Dont buys: {}, buys: {}'.format(y_val.count(0), y_val.count(1)))

train data: 74464 validation: 3910
Dont buys: 37232, buys: 37232
VALIDATION Dont buys: 1955, buys: 1955


In [12]:
model = Sequential()

# LSTM layer
model.add(CuDNNLSTM(units=128, input_shape=(X_train.shape[1:]), return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(BatchNormalization()) # normalize data between batches

# LSTM layer
model.add(CuDNNLSTM(units=128, input_shape=(X_train.shape[1:]), return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(BatchNormalization())

# LSTM layer
model.add(CuDNNLSTM(units=128, input_shape=(X_train.shape[1:])))
model.add(Dropout(rate=0.2))
model.add(BatchNormalization())

# add a dense layer
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(rate=0.2))

# output layer
model.add(Dense(units=2, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='logs/{}'.format(NAME))

# unique file name that will include the epoch and the validation acc for that epoch
# saves only the best one
filepath = 'RNN_Final-{epoch:02d}-{val_acc:.3f}'
checkpoint = ModelCheckpoint('models/{}.model'.format(filepath),
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

In [13]:
%%time
history = model.fit(X_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard, checkpoint])

Train on 74464 samples, validate on 3910 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.50767, saving model to models/RNN_Final-01-0.508.model
Epoch 2/10

Epoch 00002: val_acc improved from 0.50767 to 0.53887, saving model to models/RNN_Final-02-0.539.model
Epoch 3/10

Epoch 00003: val_acc improved from 0.53887 to 0.55985, saving model to models/RNN_Final-03-0.560.model
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.55985
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.55985
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.55985
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.55985
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.55985
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.55985
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.55985
CPU times: user 2min 48s, sys: 10.3 s, total: 2min 58s
Wall time: 2min 18s


In [14]:
# TensorBoard
# !tensorboard --logdir='logs/'