<a href="https://colab.research.google.com/github/iamdsc/deep_learning/blob/master/13_cryptocurrency_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Download the data
!wget https://github.com/iamdsc/deep_learning/raw/master/crypto_data.zip

--2019-03-23 16:12:51--  https://github.com/iamdsc/deep_learning/raw/master/crypto_data.zip
Resolving github.com (github.com)... 140.82.118.3, 140.82.118.4
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/iamdsc/deep_learning/master/crypto_data.zip [following]
--2019-03-23 16:12:52--  https://raw.githubusercontent.com/iamdsc/deep_learning/master/crypto_data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5998694 (5.7M) [application/zip]
Saving to: ‘crypto_data.zip.1’


2019-03-23 16:12:52 (108 MB/s) - ‘crypto_data.zip.1’ saved [5998694/5998694]



In [0]:
# unpack it
!unzip crypto_data.zip

In [18]:
# read in the csv files
import pandas as pd

# The CSV files carry no header row, so the column names are supplied here.
df = pd.read_csv(
    'crypto_data/LTC-USD.csv',
    names=['time', 'low', 'high', 'open', 'close', 'volume'],
)
print(df.head(5))

         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978


In [19]:
# combining the data into one dataframe

main_df = pd.DataFrame()

# the 4 ratios we want to consider
ratios = ['BTC-USD', 'LTC-USD', 'BCH-USD', 'ETH-USD']

for ratio in ratios:
  print(ratio)
  dataset = f'crypto_data/{ratio}.csv'

  df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])

  # time becomes the index because it is the key shared by every file;
  # only close and volume are kept, renamed to include the ticker so the
  # columns stay distinguishable after the join
  df = (
      df.set_index('time')[['close', 'volume']]
        .rename(columns={'close': f'{ratio}_close', 'volume': f'{ratio}_volume'})
  )

  # the first frame seeds main_df; later ones are left-joined on its index
  main_df = df if main_df.empty else main_df.join(df)

# carry the last known value forward across gaps, then drop the leading rows
# that still have nothing to carry (ffill() replaces the deprecated
# fillna(method='ffill'))
main_df = main_df.ffill().dropna()

print(main_df.head())

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000    

In [0]:
# defining constants

# number of consecutive rows collected into one input sequence for the RNN
SEQ_LEN = 60

# how many rows ahead of the current one the predicted price lies
FUTURE_PERIOD_PREDICT = 3

# ticker whose close price is the prediction target
RATIO_TO_PREDICT = 'LTC-USD'

In [21]:
# classification function to map to target
def classify(current, future):
  """Label a price move: 1 if `future` is strictly greater than `current`, else 0.

  Both arguments are converted with float() before comparing, so numeric
  strings are accepted. A NaN on either side compares as False and yields 0.
  """
  return int(float(future) > float(current))

# close price FUTURE_PERIOD_PREDICT rows ahead of each row; the final
# FUTURE_PERIOD_PREDICT rows have no such price and become NaN
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

# classify() evaluates a comparison against NaN as False, so those trailing
# rows would carry target 0 without any real future price behind it; remove
# them instead of training/validating on made-up labels
main_df = main_df.dropna(subset=['future'])

print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time                                                                       
1528968720     870.859985       26.856577      486.01001       26.019083   
1528968780     870.099976        1.124300      486.00000        8.449400   
1528968840     870.789978        1.749862      485.75000       26.994646   
1528968900     870.000000        1.680500      486.00000       77.355759   
1528968960 

In [0]:
# separating the validation/out of sample data
times = sorted(main_df.index.values)
last_5pct = times[-int(len(times) * 0.05)] # first timestamp of the final 5%

# validation must be sliced first: the next line overwrites main_df
validation_main_df = main_df.loc[main_df.index >= last_5pct]
main_df = main_df.loc[main_df.index < last_5pct]

In [0]:
# preprocessing the dataframe to create sequences and normalize
from sklearn import preprocessing
from collections import deque
import random
import numpy as np

def preprocess_df(df):
  """Turn a labelled price frame into balanced, shuffled RNN sequences.

  Args:
    df: DataFrame with the feature columns, a 'future' column and a 'target'
      column. 'target' must be the last column once 'future' is removed,
      because the sequence builder separates it by position.

  Returns:
    A tuple (X, y). X is a numpy array of sequences, each made of SEQ_LEN
    consecutive rows of scaled features. y is a list holding, for each
    sequence, the target of its last row. Targets 0 and 1 occur equally often.
  """
  # 'future' is not a feature; select it by keyword, since the positional
  # axis argument of drop() is no longer accepted by current pandas
  df = df.drop(columns='future')

  for col in df.columns:
    if col != 'target': # normalize all except target
      df[col] = df[col].pct_change() # puts the different currencies on one scale
      # a zero previous value turns the percentage change into +/-inf, which
      # preprocessing.scale refuses; treat such rows as missing instead
      df[col] = df[col].replace([np.inf, -np.inf], np.nan)
      df.dropna(inplace=True)
      df[col] = preprocessing.scale(df[col].values)

  df.dropna(inplace=True)

  # building the sequences
  sequential_data = []
  prev_days = deque(maxlen=SEQ_LEN) # sliding window; old rows fall out automatically

  for row in df.values:
    prev_days.append(list(row[:-1])) # exclude target
    if len(prev_days) == SEQ_LEN:
      sequential_data.append([np.array(prev_days), row[-1]])

  # balancing the data
  buys = [pair for pair in sequential_data if pair[1] == 1]
  sells = [pair for pair in sequential_data if pair[1] == 0]

  # shuffle each class before truncating, so the surplus of the larger class
  # is discarded at random rather than always from the most recent rows
  random.shuffle(buys)
  random.shuffle(sells)

  lower = min(len(buys), len(sells)) # to get shorter length

  sequential_data = buys[:lower] + sells[:lower]

  # mix the classes; otherwise all buys would precede all sells
  random.shuffle(sequential_data)

  # split the data back to feature sets and targets
  X = [seq for seq, _ in sequential_data]
  y = [target for _, target in sequential_data]

  return np.array(X), y

In [0]:
# preprocessing the data
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

# preprocess_df returns the targets as plain Python lists; newer tf.keras
# versions refuse a list of labels next to ndarray features, so hand the
# model arrays for both
train_y = np.asarray(train_y)
validation_y = np.asarray(validation_y)

In [0]:
# making few more constants
import time


# samples per gradient update
BATCH_SIZE = 64
# full passes over the training data
EPOCHS = 10
# run name; the trailing unix timestamp keeps logs and saved models of
# different runs apart
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [0]:
# build the model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint


# Three stacked recurrent stages, each followed by dropout and batch
# normalization, then a small dense head ending in a 2-way softmax.
model = Sequential([
    # the first two LSTMs return the full sequence so the next LSTM can consume it
    CuDNNLSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    CuDNNLSTM(128, return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    # the last LSTM returns only its final output, which feeds the dense head
    CuDNNLSTM(128),
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(2, activation='softmax'),
])

In [0]:
# compile the model
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# sparse_categorical_crossentropy: targets are class indices, not one-hot rows
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])

tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

# {epoch} and {val_acc} are left unformatted here on purpose: ModelCheckpoint
# fills them in itself each time it saves
filepath = 'RNN_Final-{epoch:02d}-{val_acc:.3f}'
# monitor/verbose/save_best_only/mode have to be arguments of ModelCheckpoint.
# Passed to str.format() they are silently ignored and the callback falls back
# to its defaults instead of keeping only the best val_acc models.
checkpoint = ModelCheckpoint(
    'models/{}.model'.format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

In [28]:
# fit on the training sequences; the validation set is evaluated after every epoch
history = model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)

# final loss and accuracy on the validation set: score is [loss, accuracy]
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# persist the trained model under this run's name
model.save(f'models/{NAME}')

Train on 77922 samples, validate on 3860 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 1.1451953160330421
Test accuracy: 0.5455959
