In [29]:
import pandas as pd
from IPython.display import display
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

df = pd.read_csv("crypto_data/LTC-USD.csv", names=['time', 'low', 'high', 'open', 'close', 'volume'])
display(df.head())

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [30]:
main_df = pd.DataFrame() # begin empty

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider
for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'crypto_data/{ratio}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
display(main_df.head())  # how did we do??

BTC-USD
LTC-USD
BCH-USD
ETH-USD


Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033


In [31]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "ETH-USD"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [32]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [33]:
from sklearn import preprocessing  # pip install sklearn ... if you don't have it!
from collections import deque
import numpy as np
import random

def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic. Those nasty NaNs love to creep in.
    
    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.
    
    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys + sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array! ..import numpy as np

In [34]:
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
display(main_df[[f"{RATIO_TO_PREDICT}_close", "future"]].head())

Unnamed: 0_level_0,ETH-USD_close,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1528968720,486.01001,486.0
1528968780,486.0,486.0
1528968840,485.75,485.98999
1528968900,486.0,485.98999
1528968960,486.0,485.98999


In [35]:
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))
display(main_df[[f"{RATIO_TO_PREDICT}_close", "future", "target"]].head(10))

Unnamed: 0_level_0,ETH-USD_close,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1528968720,486.01001,486.0,0
1528968780,486.0,486.0,0
1528968840,485.75,485.98999,1
1528968900,486.0,485.98999,0
1528968960,486.0,485.98999,0
1528969020,485.98999,485.98999,0
1528969080,485.98999,486.0,1
1528969140,485.98999,486.0,1
1528969200,485.98999,486.0,1
1528969260,486.0,486.0,0


In [36]:
times = sorted(main_df.index.values)  # get the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  # get the last 5% of the times

validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%

In [37]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y =preprocess_df(validation_main_df)

In [38]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 83428 validation: 4116
Dont buys: 41714, buys: 41714
VALIDATION Dont buys: 2058, buys: 2058


In [39]:
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
validation_x = np.asarray(validation_x)
validation_y = np.asarray(validation_y)

In [40]:
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:])))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [41]:
# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)

Epoch 1/10




INFO:tensorflow:Assets written to: models\RNN_Final-01-0.508.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-01-0.508.model\assets


Epoch 2/10




INFO:tensorflow:Assets written to: models\RNN_Final-02-0.542.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-02-0.542.model\assets


Epoch 3/10




INFO:tensorflow:Assets written to: models\RNN_Final-03-0.528.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-03-0.528.model\assets


Epoch 4/10




INFO:tensorflow:Assets written to: models\RNN_Final-04-0.550.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-04-0.550.model\assets


Epoch 5/10




INFO:tensorflow:Assets written to: models\RNN_Final-05-0.565.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-05-0.565.model\assets


Epoch 6/10




INFO:tensorflow:Assets written to: models\RNN_Final-06-0.557.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-06-0.557.model\assets


Epoch 7/10




INFO:tensorflow:Assets written to: models\RNN_Final-07-0.554.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-07-0.554.model\assets


Epoch 8/10




INFO:tensorflow:Assets written to: models\RNN_Final-08-0.554.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-08-0.554.model\assets


Epoch 9/10




INFO:tensorflow:Assets written to: models\RNN_Final-09-0.555.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-09-0.555.model\assets


Epoch 10/10




INFO:tensorflow:Assets written to: models\RNN_Final-10-0.559.model\assets


INFO:tensorflow:Assets written to: models\RNN_Final-10-0.559.model\assets


In [42]:
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

Test loss: 0.6841108798980713
Test accuracy: 0.5592808723449707




INFO:tensorflow:Assets written to: models/ETH-USD-60-SEQ-3-PRED-1636785005\assets


INFO:tensorflow:Assets written to: models/ETH-USD-60-SEQ-3-PRED-1636785005\assets
