In [1]:
# predict future crypto price using LSTM sequence on close and volume. This is binary classification
import pandas as pd
# Load a single pair's raw file. Column names are supplied explicitly, so the
# first row of the CSV is read as data rather than as a header.
df = pd.read_csv(
    "crypto_data/LTC-USD.csv",
    names=['time', 'low', 'high', 'open', 'close', 'volume'],
)

In [2]:
# Pandas DataFrame is two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes 
# (rows and columns). A Data frame is a two-dimensional data structure, 
# i.e., data is aligned in a tabular fashion in rows and columns. Pandas DataFrame consists of three principal 
# components, the data, rows, and columns.
# Combine the close price and volume of every pair into one DataFrame,
# aligned on the shared "time" index.
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider

main_df = pd.DataFrame()  # begin empty
for ratio in ratios:  # begin iteration
    dataset = f'crypto_data/{ratio}.csv'  # path to this pair's file
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])

    # Prefix close/volume with the ticker so the columns stay distinguishable
    # after the join, and index by time so the frames can be joined on it.
    # rename()/set_index() return new frames, so nothing is mutated in place.
    df = df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}).set_index("time")
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore everything except close and volume

    # The first frame seeds main_df; every later frame is joined onto it
    # (DataFrame.join defaults to a left join on the index).
    main_df = df if len(main_df) == 0 else main_df.join(df)

# If there are gaps in the data, carry the previously known value forward,
# then drop any rows that still contain NaN.
main_df = main_df.ffill().dropna()
#print(main_df.head())

In [3]:
# Create a target (how far out we want to predict).
# With a sequence length of 3 (3 minutes of historical data) we can't expect to predict 10 minutes into the future;
# with a sequence length of 300, 10 minutes ahead might be predictable.
# We go with a sequence length of 60 and a future prediction horizon (target) of 3.

SEQ_LEN = 60  # how long of a preceding sequence to collect for the RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "BCH-USD"  # pair whose close price is used to build the target

In [4]:
# function takes future and current value. 1 if future>current, 0 otherwise
# Train network based on these, 1 is good, price increase in future, 0 is bad. (Binary classification)
def classify(current, future):
    """Return 1 when ``future`` is strictly greater than ``current``, else 0.

    Both arguments are converted with ``float()`` before being compared, so
    numeric strings are accepted as well as numbers.
    """
    return int(float(future) > float(current))

In [5]:
# FUTURE_PERIOD_PREDICT is 3.
# A negative .shift moves the column "up", so each row's 'future' holds the close
# price FUTURE_PERIOD_PREDICT rows later (the price 3 minutes in the future).
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)

# map() calls classify(current_close, future_close) row by row for these two columns;
# list() converts the result into something we can assign as a column.
# target is a binary column: 1 = price is higher in the future, 0 = it is not.
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

# The last FUTURE_PERIOD_PREDICT rows have no future price (NaN after the shift).
# classify() compares against NaN and labels them 0, which is not a real observation,
# so drop those rows before they can reach the training/validation data.
# NOTE: re-running this cell on the already-trimmed frame trims the tail again.
main_df = main_df.dropna(subset=['future'])

In [6]:
# Show the first 10 rows of the current close, the shifted future close and the resulting label
print(main_df.loc[:, [f"{RATIO_TO_PREDICT}_close", "future", "target"]].head(10))

            BCH-USD_close      future  target
time                                         
1528968720     870.859985  870.000000       0
1528968780     870.099976  869.989990       0
1528968840     870.789978  869.450012       0
1528968900     870.000000  869.989990       0
1528968960     869.989990  870.000000       1
1528969020     869.450012  870.320007       1
1528969080     869.989990  870.650024       1
1528969140     870.000000  871.219971       1
1528969200     870.320007  871.880005       1
1528969260     870.650024  871.880005       1


In [7]:
# Part 9
# For sequence / LSTM / time-series prediction, do not shuffle the rows and then slice them into
# training and testing sets (the original note: this will cause overfitting). Instead slice the data
# in its time order and take the last 5% as the validation set.
times = sorted(main_df.index.values)  # get the times (sorted once and reused)

# Number of timestamps in the validation tail. At least 1, because an offset of 0 would make
# times[-0] the *first* timestamp and push every row into validation, leaving no training data.
n_validation = max(1, int(0.05 * len(times)))
last_5pct = times[-n_validation]  # first timestamp that belongs to the last 5%

validation_main_df = main_df[main_df.index >= last_5pct]  # validation data: index in the last 5%
main_df = main_df[main_df.index < last_5pct]  # now main_df is all the data up to the last 5%

In [8]:
# normalize and balance the data
# balance - make sure both classes have equal amounts when training (done below by truncating the larger class to the size of the smaller one)

from sklearn import preprocessing  
from collections import deque
import random
import numpy as np

def preprocess_df(df):
    """Turn a merged price/volume frame into balanced, shuffled sequence data.

    Steps:
      1. Drop the helper column ``future`` (only needed to build ``target``).
      2. Replace every feature column by its percent change and standardize it
         with ``preprocessing.scale``.
      3. Slide a window of ``SEQ_LEN`` consecutive rows over the frame; each
         full window is paired with the ``target`` of its last row.
      4. Balance the two classes by truncating the larger one, then shuffle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``future`` and ``target``; every other column
        is treated as a feature. The caller's frame is not modified.

    Returns
    -------
    tuple (numpy.ndarray, list)
        ``X`` with shape (n_samples, SEQ_LEN, n_features) and ``y``, a plain
        list holding the label of each sample in the same order.
    """
    # Keyword form: the positional axis argument of drop() is not accepted by pandas 2.
    df = df.drop(columns="future")

    feature_cols = [col for col in df.columns if col != "target"]  # everything except the label

    # pct_change "normalizes" the different currencies: each coin has vastly different
    # absolute values, and what matters here is the relative movement.
    for col in feature_cols:
        df[col] = df[col].pct_change()

    # pct_change leaves NaN in the first row and +/-inf where the previous value was 0.
    # Clean up ONCE for all columns; dropping inside the per-column loop would throw
    # away one additional row for every feature column.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    for col in feature_cols:
        # scale() standardizes the column (zero mean, unit variance); it does not map to [0, 1].
        df[col] = preprocessing.scale(df[col].values)
    df = df.dropna()  # cleanup again... jic.

    # Select features and labels by name instead of relying on "target" being the last column.
    feature_rows = df[feature_cols].values
    targets = df["target"].values

    sequential_data = []  # list of [sequence, label] pairs
    prev_days = deque(maxlen=SEQ_LEN)  # rolling window; deque discards the oldest row once full

    for features, target in zip(feature_rows, targets):
        prev_days.append(features)  # store all but the target
        if len(prev_days) == SEQ_LEN:  # only emit complete windows
            sequential_data.append([np.array(prev_days), target])

    # part 10
    # Balance the list so that we have an even number of target 0 and target 1.
    buys = [[seq, target] for seq, target in sequential_data if target == 1]
    sells = [[seq, target] for seq, target in sequential_data if target == 0]

    random.shuffle(buys)  # shuffle first so the truncation below keeps a random subset
    random.shuffle(sells)

    lower = min(len(buys), len(sells))  # what's the shorter length?
    sequential_data = buys[:lower] + sells[:lower]  # both classes cut to the shorter length
    random.shuffle(sequential_data)  # mix the classes so the model doesn't see all of one, then the other

    X = [seq for seq, _ in sequential_data]  # X is the sequences
    y = [target for _, target in sequential_data]  # y is the targets/labels (buy vs. not buy)

    return np.array(X), y  # X as a numpy array, y as a list

In [9]:
# Build the balanced (X, y) sets for training and validation with preprocess_df
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

# Sanity check: sample counts and class balance (y is a plain list, hence .count)
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 79532 validation: 3606
Dont buys: 39766, buys: 39766
VALIDATION Dont buys: 1803, buys: 1803


In [10]:
# Part 11
# Building the model
import time

EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # samples per batch (passed to fit as batch_size). Try a smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model (the timestamp suffix distinguishes runs)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
# modelcheckpoint allows the saving of the best epoch, so u can revert back if model is overfitted or something like that
from tensorflow.keras.callbacks import ModelCheckpoint

# Three stacked LSTM blocks (LSTM -> Dropout -> BatchNormalization) followed by a small dense head.
model = Sequential()

# Only the first layer of a Sequential model needs input_shape; here it is
# train_x.shape[1:], i.e. everything after the sample axis. Later layers take
# their input shape from the layer before them.
model.add(LSTM(128, input_shape=train_x.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
# General use case is to use BN between the linear and non-linear layers in your network,
# because it normalizes the input to your activation function, so that you're centered in
# the linear section of the activation function (such as Sigmoid).
# https://www.reddit.com/r/MachineLearning/comments/2x0bq8/some_questions_regarding_batch_normalization/?su=ynbwk&st=iprg6e3w&sh=88bcbe40
model.add(BatchNormalization())  # normalizes activation outputs, same reason you want to normalize your input data.

# return_sequences=True so the next LSTM layer still receives a full sequence
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Last recurrent layer: no return_sequences, so only its final output feeds the dense layers
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))  # one output unit per class (0 / 1)

  from ._conv import register_converters as _register_converters


In [11]:
# Compile the model.
# `learning_rate` is the current argument name for Adam; `lr` is a deprecated alias.
# NOTE(review): `decay` is only accepted by older/legacy optimizer implementations —
# confirm against the installed TensorFlow version before upgrading.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# sparse_categorical_crossentropy takes integer class labels (0/1) against the
# 2-unit softmax output, so the labels do not need to be one-hot encoded.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [12]:
tensorboard = TensorBoard(log_dir="logs\{}".format(NAME))
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models\{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

In [14]:
# train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard]
)

W0723 07:02:15.675613 19276 deprecation.py:323] From C:\Users\Lawrann\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 79532 samples, validate on 3606 samples
Epoch 1/10
   64/79532 [..............................] - ETA: 40:15 - loss: 0.8463 - accuracy: 0.4844

W0723 07:02:23.917753 19276 callbacks.py:241] Method (on_train_batch_end) is slow compared to the batch update (0.951948). Check your callbacks.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Load TENSORBOARD
%load_ext tensorboard
# Start TENSORBOARD
%tensorboard --logdir=C:\Users\Lawrann\tensorflowNN\logs
# if running through cmd prompt, must be at C:\

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 1188), started 21:41:53 ago. (Use '!kill 1188' to kill it.)

W0723 07:44:17.360505 19276 manager.py:321] invalid info file: 'C:\\Users\\Lawrann\\AppData\\Local\\Temp\\.tensorboard-info\\pid-16176.info'
Traceback (most recent call last):
  File "C:\Users\Lawrann\Anaconda3\lib\site-packages\tensorboard\manager.py", line 316, in get_all
    info = _info_from_string(contents)
  File "C:\Users\Lawrann\Anaconda3\lib\site-packages\tensorboard\manager.py", line 155, in _info_from_string
    raise ValueError("incompatible version: %r" % (json_value,))
ValueError: incompatible version: {'cache_key': 'eyJhcmd1bWVudHMiOlsiLS1sb2dkaXIiLCJsb2dzIl0sImNvbmZpZ3VyZV9rd2FyZ3MiOnt9LCJ3b3JraW5nX2RpcmVjdG9yeSI6IkM6XFxVc2Vyc1xcTGF3cmFublxcdGVuc29yZmxvd05OIn0=', 'db': '', 'logdir': 'logs', 'path_prefix': '', 'pid': 16176, 'port': 6006, 'start_time': 1563760101, 'version': '1.14.0'}
W0723 07:44:17.368484 19276 manager.py:321] invalid info file: 'C:\\Users\\Lawrann\\AppData\\Local\\Temp\\.tensorboard-info\\pid-2084.info'
Traceback (most recent call last):
  File "C:\User

W0723 07:44:17.421343 19276 manager.py:321] invalid info file: 'C:\\Users\\Lawrann\\AppData\\Local\\Temp\\.tensorboard-info\\pid-708.info'
Traceback (most recent call last):
  File "C:\Users\Lawrann\Anaconda3\lib\site-packages\tensorboard\manager.py", line 316, in get_all
    info = _info_from_string(contents)
  File "C:\Users\Lawrann\Anaconda3\lib\site-packages\tensorboard\manager.py", line 155, in _info_from_string
    raise ValueError("incompatible version: %r" % (json_value,))
ValueError: incompatible version: {'cache_key': 'eyJhcmd1bWVudHMiOlsiLS1sb2dkaXIiLCJ7QzpVc2Vyc0xhd3Jhbm50ZW5zb3JmbG93Tk5sb2dzfSJdLCJjb25maWd1cmVfa3dhcmdzIjp7fSwid29ya2luZ19kaXJlY3RvcnkiOiJDOlxcVXNlcnNcXExhd3Jhbm5cXHRlbnNvcmZsb3dOTiJ9', 'db': '', 'logdir': '{C:UsersLawranntensorflowNNlogs}', 'path_prefix': '', 'pid': 708, 'port': 6006, 'start_time': 1563760256, 'version': '1.14.0'}
W0723 07:44:17.423341 19276 manager.py:321] invalid info file: 'C:\\Users\\Lawrann\\AppData\\Local\\Temp\\.tensorboard-info\\pid-7