# Neural Network

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random
import time
from keras import models
from keras import layers
from keras import callbacks
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


We first define the lenght of the sequence (SEQ_LEN) that we will be using to predict a certain cryptocurrency (RATIO_TO_PREDICT) into a future time (FUTURE_PERIOD_PREDICT).

In [2]:
SEQ_LEN = 72  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict
RATIO_TO_PREDICT = "ETHTUSD"

Same as we did for machine learning we will be creating our labels based on a % change in prices for our cryptocurrency.

In [3]:
def buy_sell_hold(future):
    '''Return 1, 0, 2 based on % change
    
       Args:
           future: price timeseries lagged into future
    '''
    change_perc = 0.01
    
    if future > change_perc:
        return 1
    if future < -change_perc:
        return 0
    return 2

In [4]:
def preprocess_df(df):
    '''Define labels X, y creating a column containing the values of the price difference normalized
       based on the hours we want to predict in future
    
       Args:
           df: df containing closing prices for cryptocurrencies we want to predict
           
       Returns:
           np.array(X): sequences we will use as feature to predict
           y: target variable
    '''
    df = df.drop("pct_change", 1)  # don't need this anymore.
    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.fillna(method='ffill')  # remove the nas created by pct_change
            df = df.replace([np.inf, -np.inf], 0)
            df.fillna(0, inplace=True)
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... 


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. 
    
    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!
    
    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets
    holds = []  # list that will store our sell sequences and targets
    
    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!
        else:
            holds.append([seq, target])
    
    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!
    random.shuffle(holds)  # shuffle the holds!
    
    lower = min(len(buys), len(sells), len(holds))  # what's the shorter length?
    
    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.
    holds = holds[:lower]  # make sure both lists are only up to the shortest length.
    
    sequential_data = buys+sells+holds  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy vs holds)

    return np.array(X), y  # return X and y...and make X a numpy array!

Define 5 symbols.

In [5]:
ratios = ['BTCTUSD', "ETHTUSD" , "XRPTUSD", "LTCTUSD", "EOSTUSD"]

Create main df with close prices and volume for each symbol.

In [20]:
crp_df = pd.DataFrame() # begin empty

for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'crypto_dfs/{ratio}.csv'  # get the full path to the file.
    
    df = pd.read_csv(dataset, index_col=0)  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"Close": f"{ratio}_close", "Volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("Time", inplace=True)  # set time as index so we can join them on this shared time
    
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(crp_df)==0:  # if the dataframe is empty
        crp_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        crp_df = crp_df.join(df)

print(crp_df.head())

BTCTUSD
ETHTUSD
XRPTUSD
LTCTUSD
EOSTUSD
               BTCTUSD_close  BTCTUSD_volume  ETHTUSD_close  ETHTUSD_volume  \
Time                                                                          
1581346800000        9914.98       42.847840         224.48       437.23191   
1581343200000        9867.88       10.544541         223.24       921.30562   
1581339600000        9811.44       12.148842         219.41       672.55340   
1581336000000        9838.86       10.340284         219.73       279.35173   
1581332400000        9807.88       22.957431         217.91       203.97147   

               XRPTUSD_close  XRPTUSD_volume  LTCTUSD_close  LTCTUSD_volume  \
Time                                                                          
1581346800000        0.27595        150520.2          74.87      1236.85021   
1581343200000        0.27370         20433.0          74.44       300.63211   
1581339600000        0.27156          9748.0          73.41       227.99520   
15813360000

Lag price into future through pct_change.

In our case 24 hours.

In [7]:
crp_df['pct_change'] = crp_df[[f'{RATIO_TO_PREDICT}_close']].pct_change(24)
crp_df[['pct_change', f'{RATIO_TO_PREDICT}_close']].head()

Unnamed: 0_level_0,pct_change,ETHTUSD_close
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
1581346800000,,224.48
1581343200000,,223.24
1581339600000,,219.41
1581336000000,,219.73
1581332400000,,217.91


Apply our previous function to get our target variables.

In [8]:
crp_df['target'] = crp_df['pct_change'].map(buy_sell_hold)

In [9]:
crp_df['target'].value_counts(normalize=True)

0    0.350962
1    0.326122
2    0.322917
Name: target, dtype: float64

Split df in 2 parts for training and testing.

In [10]:
times = sorted(crp_df.index.values)  # get the times
last_20pct = sorted(crp_df.index.values)[-int(0.2*len(times))]  # get the last 20% of the times

In [11]:
validation_main_df = crp_df[(crp_df.index >= last_20pct)]  # make the validation data where the index is in the last 5%
main_df = crp_df[(crp_df.index < last_20pct)]  # now the main_df is all the data up to the last 5%

In [12]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"Train data: {len(train_x)} Validation: {len(validation_x)}")
print(f"Sells: {train_y.count(0)}, Holds: {train_y.count(2)}, Buys: {train_y.count(1)}")
print(f"VALIDATION Sells: {validation_y.count(0)}, Holds: {validation_y.count(2)}, buys: {validation_y.count(1)}")

Train data: 1779 Validation: 282
Sells: 593, Holds: 593, Buys: 593
VALIDATION Sells: 94, Holds: 94, buys: 94


Let's make a few more constants:

In [13]:
EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model

## Build model

In [14]:
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(3, activation='tanh'))

In [15]:
# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

In [16]:
tensorboard = TensorBoard(log_dir=f"logs/{NAME}")

filepath = "LSTM_Final-{epoch:02d}-{val_acc:.3f}"
# unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) 
# saves only the best ones

## Train model

In [17]:
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)

Train on 1779 samples, validate on 282 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Score model

In [18]:
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 2.2425889157234353
Test accuracy: 0.5248226967263729


## Save model

In [19]:
#model.save("models/{}".format(NAME))

# Comparison table between algorithms 


| Algorithm            | Accuracy |  
|:-------------:       |------:        |  
| Neural network       |    0.60        |    
|  Logistic regression | 0.34        |
|    Decision tree     |   0.38        | 
| Random forest        |    0.48        |   