# Part 8

In [1]:
import pandas as pd
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time

In [2]:
# Load a single raw file for a first look. The CSV has no header row, so the
# column labels are supplied through `names=`. Judging by the `time` values,
# consecutive rows are 60 seconds apart (one row per minute).
df = pd.read_csv(
    "Datasets/crypto_data/LTC-USD.csv",
    names=['time', 'low', 'high', 'open', 'close', 'volume'],
)
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [3]:
# (rows, columns) of the sample frame loaded above
df.shape

(101883, 6)

In [4]:
# All the cryptocurrency files share the `time` column, so the per-coin frames
# are joined on it into one wide frame.
main_df = pd.DataFrame()  # begin empty
ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]

for ratio in ratios:
    dataset = f'Datasets/crypto_data/{ratio}.csv'
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    df = (
        df
        # prefix close/volume with the coin name so columns stay distinguishable after the join
        .rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
        # index by time so the frames can be joined on this shared key
        .set_index("time")
        # keep only the two columns we use; the other price columns are ignored
        [[f"{ratio}_close", f"{ratio}_volume"]]
    )

    # The first coin seeds the frame; later coins are left-joined onto its index.
    main_df = df if len(main_df) == 0 else main_df.join(df)

# If there are gaps in the data, carry the previously known value forward, then
# drop any rows that are still incomplete (gaps at the very start).
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
main_df = main_df.ffill().dropna()
main_df.head()
    

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033


In [5]:
# Number of consecutive rows (one row per minute) the RNN sees as a single input sequence.
SEQ_LEN = 60  
# How many rows (minutes) into the future the price we compare against lies.
FUTURE_PERIOD_PREDICT = 3  
# The coin whose price movement we are trying to predict.
RATIO_TO_PREDICT = "LTC-USD"

# Put together: use the last 60 minutes of data to predict whether the LTC-USD
# close will be higher 3 minutes from now.

In [6]:
def classify(current, future):
    """Return 1 when the future price is strictly above the current one, else 0.

    Both arguments are passed through ``float()`` before comparing, so numeric
    strings are accepted. Equal prices, and any comparison involving NaN,
    yield 0.
    """
    return 1 if float(future) > float(current) else 0

In [7]:
# Look-ahead price: shifting the close column up by FUTURE_PERIOD_PREDICT rows puts,
# on every row, the close from that many rows later. The last rows get NaN.
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(periods=-FUTURE_PERIOD_PREDICT)

In [8]:
# Inspect the frame with the new `future` column appended
main_df

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.010010,26.019083,96.389999
1528968780,6479.410156,3.088252,96.570000,77.129799,870.099976,1.124300,486.000000,8.449400,96.519997
1528968840,6479.410156,1.404100,96.500000,7.216067,870.789978,1.749862,485.750000,26.994646,96.440002
1528968900,6479.979980,0.753000,96.389999,524.539978,870.000000,1.680500,486.000000,77.355759,96.470001
1528968960,6480.000000,1.490900,96.519997,16.991997,869.989990,1.669014,486.000000,7.503300,96.400002
1528969020,6477.220215,2.731950,96.440002,95.524078,869.450012,0.865200,485.989990,85.877251,96.400002
1528969080,6480.000000,2.174240,96.470001,175.205307,869.989990,23.534929,485.989990,160.915192,96.400002
1528969140,6479.990234,0.903100,96.400002,43.652802,870.000000,2.300000,485.989990,61.371887,96.400002
1528969200,6478.660156,3.258786,96.400002,8.160000,870.320007,9.255514,485.989990,42.687656,96.400002
1528969260,6478.660156,1.970352,96.400002,20.425900,870.650024,2.795600,486.000000,97.693878,96.449997


In [9]:
# Label every row via classify(): 1 if the `future` price is higher than the
# current close, otherwise 0. The list of labels is assigned as a new column.
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))
# The final FUTURE_PERIOD_PREDICT rows have no look-ahead price (NaN left by the
# shift), and classify() labels a NaN comparison as 0. Drop those rows so that
# no sample carries a made-up target.
main_df = main_df.dropna(subset=['future'])

In [10]:
# Spot-check: `target` should be 1 exactly where `future` exceeds the LTC-USD close
main_df.head(10)

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002,0
1528969020,6477.220215,2.73195,96.440002,95.524078,869.450012,0.8652,485.98999,85.877251,96.400002,0
1528969080,6480.0,2.17424,96.470001,175.205307,869.98999,23.534929,485.98999,160.915192,96.400002,0
1528969140,6479.990234,0.9031,96.400002,43.652802,870.0,2.3,485.98999,61.371887,96.400002,0
1528969200,6478.660156,3.258786,96.400002,8.16,870.320007,9.255514,485.98999,42.687656,96.400002,0
1528969260,6478.660156,1.970352,96.400002,20.4259,870.650024,2.7956,486.0,97.693878,96.449997,1


# Part 9,10

In [11]:
# Sequence data has to stay in time order, so the split is chronological rather
# than random: the most recent 5% of timestamps are held out for validation.
times = sorted(main_df.index.values)

# Timestamp at which the final 5% of the timeline begins.
last_5pct = times[-int(0.05 * len(times))]

# Rows at or after that timestamp form the validation frame ...
validation_main_df = main_df[main_df.index >= last_5pct]

# ... and main_df keeps only the rows that come before it.
main_df = main_df[main_df.index < last_5pct]

In [12]:
def preprocess_df(df):
    """Turn a price/volume frame into balanced, shuffled training sequences.

    Steps:
      1. Drop the helper ``future`` column (only needed to derive ``target``).
      2. Replace every feature column by its percent change and standardize it
         (zero mean, unit variance).
      3. Slide a window of ``SEQ_LEN`` consecutive rows over the frame; each
         full window becomes one sample, labelled with the ``target`` of the
         window's last row.
      4. Down-sample the larger class so that buys (1) and sells (0) are
         equally represented, then shuffle.

    Args:
        df: DataFrame holding the feature columns plus ``future`` and
            ``target``. The caller's frame is not modified.

    Returns:
        Tuple ``(X, y)``. ``X`` is a NumPy array of shape
        ``(n_samples, SEQ_LEN, n_features)`` (an empty array when no sample
        could be built) and ``y`` is a list with the matching targets.
    """
    # Keyword form: the positional axis argument (`df.drop("future", 1)`) is
    # rejected by pandas >= 2.0.
    df = df.drop(columns="future")

    feature_cols = [col for col in df.columns if col != "target"]

    # Percent change puts coins with vastly different price levels on a
    # comparable footing; what matters is the movement, not the absolute value.
    # Doing all columns at once and dropping NaNs a single time loses exactly
    # one leading row (the original per-column dropna lost one row per column
    # and scaled each column over a different set of rows).
    df[feature_cols] = df[feature_cols].pct_change()
    # A zero previous value makes pct_change produce +/-inf, which scale()
    # rejects; treat those rows like missing data.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    if df.empty:
        # Nothing left to build sequences from.
        return np.array([]), []

    for col in feature_cols:
        # Standardize each feature to zero mean and unit variance.
        df[col] = preprocessing.scale(df[col].values)
    df = df.dropna()  # safety net in case scaling produced NaNs

    # Select columns explicitly so that the target is guaranteed to be the
    # last value of every row, regardless of the frame's column order.
    rows = df[feature_cols + ["target"]].values

    sequential_data = []  # [sequence, target] pairs
    # deque with maxlen keeps only the most recent SEQ_LEN rows, discarding the
    # oldest automatically as new ones are appended.
    window = deque(maxlen=SEQ_LEN)
    for row in rows:
        window.append(row[:-1])  # features only, without the target
        if len(window) == SEQ_LEN:  # only emit once the window is full
            sequential_data.append([np.array(window), row[-1]])

    # Balance the classes so the model cannot score well by always predicting
    # the more frequent one.
    buys = [sample for sample in sequential_data if sample[1] == 1]
    sells = [sample for sample in sequential_data if sample[1] == 0]

    # Shuffle before truncating so the kept samples are a random subset.
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))
    sequential_data = buys[:lower] + sells[:lower]
    # Mix the two classes; otherwise the data would be all buys, then all sells.
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y
    

In [13]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

# preprocess_df returns the labels as a plain Python list while the inputs are
# NumPy arrays. Keras expects x and y to be of consistent types (some TF 2.x
# releases refuse an ndarray/list mix), so convert the labels once here.
train_y = np.asarray(train_y)
validation_y = np.asarray(validation_y)

# Part 11

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint
import time

In [27]:
# Number of epochs; passed to model.fit as `epochs`.
EPOCHS = 10  
# Passed to model.fit as `batch_size`.
BATCH_SIZE = 64  
# Run identifier built from the sequence length, the prediction horizon and the
# current Unix timestamp; used for the TensorBoard log directory and for the
# final saved-model path.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [28]:
# Three stacked CuDNN LSTM layers, each followed by dropout and batch
# normalization, then a small dense head with a 2-way softmax output.
model = Sequential([
    # First recurrent layer: input shape is (timesteps, features) taken from
    # the training data; return_sequences=True hands the full sequence on to
    # the next recurrent layer.
    CuDNNLSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),  # normalizes activation outputs, same reason you want to normalize your input data.

    # Second recurrent layer, still emitting a sequence.
    CuDNNLSTM(128, return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    # Last recurrent layer: no return_sequences, so only its final output is
    # passed on to the dense layers.
    CuDNNLSTM(128),
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation='relu'),
    Dropout(0.2),

    # One output unit per class (0 and 1).
    Dense(2, activation='softmax'),
])

In [29]:
# Adam optimizer with a small learning-rate decay.
# NOTE(review): `lr` and `decay` are the keyword names of older Keras releases;
# newer ones use `learning_rate` and may not accept `decay` - confirm against
# the installed TensorFlow version before upgrading.
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
# The targets are integer class ids (0/1) rather than one-hot vectors, which
# matches the sparse variant of categorical cross-entropy and the 2-unit
# softmax output of the model.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

In [30]:
# TensorBoard callback; each run logs to its own directory named after NAME.
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

In [34]:
# File name template; Keras fills in the epoch number and that epoch's
# validation accuracy when it writes a checkpoint.
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
# Keep only checkpoints that improve on the best validation accuracy so far.
# The keyword arguments must go to ModelCheckpoint itself: previously a
# misplaced parenthesis handed them to str.format(), which silently ignored
# them, so the callback ran with its defaults and saved after every epoch.
# NOTE(review): the metric key `val_acc` matches older Keras releases; newer
# ones report `val_accuracy` - confirm against the installed version.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

In [37]:
# Train the model. The validation set is evaluated after every epoch, and the
# callbacks write TensorBoard logs and model checkpoints along the way.
# `history` keeps the object returned by fit() for later inspection.
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)

Train on 77922 samples, validate on 3860 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the model.
# Note: the evaluation runs on the validation split (no separate test set is
# created in this notebook), so the "Test" figures printed below are
# validation loss and accuracy.
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save the final model under the run name.
model.save("models/{}".format(NAME))