
Commit

Initial code and data commit
jaungiers committed Jul 15, 2017
1 parent b0a6905 commit a8aaab3
Showing 8 changed files with 920 additions and 2 deletions.
354 changes: 354 additions & 0 deletions Bitcoin LSTM Prediction.ipynb

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions README.md
@@ -1,2 +1,5 @@
# Multidimensional-LSTM-BitCoin-Time-Series
Using multidimensional LSTM neural networks to create a forecast for Bitcoin price
# Multidimensional LSTM BitCoin Time Series

Using multidimensional LSTM neural networks to create a forecast for Bitcoin price.

For notes on this code and a general explanation of the theory, please see my original article [HERE](http://www.jakob-aungiers.com/articles/a/Multidimensional-LSTM-Networks-to-Predict-Bitcoin-Price).
247 changes: 247 additions & 0 deletions Saved Predictions.ipynb

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions configs.json
@@ -0,0 +1,24 @@
{
"data": {
"filename": "data/bitcoin.csv",
"filename_clean": "data/clean_data.h5",
"filter_columns": [
"Open",
"Close",
"Volume_(BTC)",
"Volume_(Currency)"
],
"batch_size": 100,
"train_test_split": 0.8,
"x_window_size": 150,
"y_window_size": 1,
"y_predict_column": "Close"
},
"model": {
"epochs": 2,
"loss_function": "mse",
"optimiser_function": "Nadam",
"filename_model": "data/model_saved.h5",
"filename_predictions": "data/model_predictions.h5"
}
}
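
For reference, both lstm.py and run.py below read this file with json.loads(open(...).read()). A minimal sketch of loading the same settings (field names taken from the file above):

import json

with open('configs.json') as f:
    configs = json.load(f)

print(configs['data']['x_window_size'])        # 150 input timesteps per window
print(configs['data']['filter_columns'])       # the four price/volume columns kept from bitcoin.csv
print(configs['model']['optimiser_function'])  # "Nadam"
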
Binary file added data/bitcoin.zip
Binary file not shown.
111 changes: 111 additions & 0 deletions etl.py
@@ -0,0 +1,111 @@
import h5py
import numpy as np
import pandas as pd

class ETL:
"""Extract Transform Load class for all data operations pre model inputs. Data is read in generative way to allow for large datafiles and low memory utilisation"""

def generate_clean_data(self, filename, batch_size=1000, start_index=0):
with h5py.File(filename, 'r') as hf:
i = start_index
while True:
data_x = hf['x'][i:i+batch_size]
data_y = hf['y'][i:i+batch_size]
i += batch_size
yield (data_x, data_y)

def create_clean_datafile(self, filename_in, filename_out, batch_size=1000, x_window_size=100, y_window_size=1, y_col=0, filter_cols=None, normalise=True):
"""Incrementally save a datafile of clean data ready for loading straight into model"""
print('> Creating x & y data files...')

data_gen = self.clean_data(
filename_in,
batch_size = batch_size,
x_window_size = x_window_size,
y_window_size = y_window_size,
y_col = y_col,
filter_cols = filter_cols,
            normalise = normalise
)

i = 0
with h5py.File(filename_out, 'w') as hf:
x1, y1 = next(data_gen)
#Initialise hdf5 x, y datasets with first chunk of data
rcount_x = x1.shape[0]
dset_x = hf.create_dataset('x', shape=x1.shape, maxshape=(None, x1.shape[1], x1.shape[2]), chunks=True)
dset_x[:] = x1
rcount_y = y1.shape[0]
dset_y = hf.create_dataset('y', shape=y1.shape, maxshape=(None,), chunks=True)
dset_y[:] = y1

for x_batch, y_batch in data_gen:
#Append batches to x, y hdf5 datasets
print('> Creating x & y data files | Batch:', i, end='\r')
dset_x.resize(rcount_x + x_batch.shape[0], axis=0)
dset_x[rcount_x:] = x_batch
rcount_x += x_batch.shape[0]
dset_y.resize(rcount_y + y_batch.shape[0], axis=0)
dset_y[rcount_y:] = y_batch
rcount_y += y_batch.shape[0]
i += 1

        print('> Clean datasets created in file `' + filename_out + '`')

def clean_data(self, filepath, batch_size, x_window_size, y_window_size, y_col, filter_cols, normalise):
"""Cleans and Normalises the data in batches `batch_size` at a time"""
data = pd.read_csv(filepath, index_col=0)

if(filter_cols):
#Remove any columns from data that we don't need by getting the difference between cols and filter list
rm_cols = set(data.columns) - set(filter_cols)
for col in rm_cols:
del data[col]

#Convert y-predict column name to numerical index
y_col = list(data.columns).index(y_col)

num_rows = len(data)
x_data = []
y_data = []
i = 0
while((i+x_window_size+y_window_size) <= num_rows):
x_window_data = data[i:(i+x_window_size)]
y_window_data = data[(i+x_window_size):(i+x_window_size+y_window_size)]

#Remove any windows that contain NaN
if(x_window_data.isnull().values.any() or y_window_data.isnull().values.any()):
i += 1
continue

if(normalise):
abs_base, x_window_data = self.zero_base_standardise(x_window_data)
_, y_window_data = self.zero_base_standardise(y_window_data, abs_base=abs_base)

            #Average of the desired predictor y column
y_average = np.average(y_window_data.values[:, y_col])
x_data.append(x_window_data.values)
y_data.append(y_average)
i += 1

#Restrict yielding until we have enough in our batch. Then clear x, y data for next batch
if(i % batch_size == 0):
#Convert from list to 3 dimensional numpy array [windows, window_val, val_dimension]
x_np_arr = np.array(x_data)
y_np_arr = np.array(y_data)
x_data = []
y_data = []
yield (x_np_arr, y_np_arr)

def zero_base_standardise(self, data, abs_base=pd.DataFrame()):
"""Standardise dataframe to be zero based percentage returns from i=0"""
if(abs_base.empty): abs_base = data.iloc[0]
data_standardised = (data/abs_base)-1
return (abs_base, data_standardised)

def min_max_normalise(self, data, data_min=pd.DataFrame(), data_max=pd.DataFrame()):
"""Normalise a Pandas dataframe using column-wise min-max normalisation (can use custom min, max if desired)"""
if(data_min.empty): data_min = data.min()
if(data_max.empty): data_max = data.max()
data_normalised = (data-data_min)/(data_max-data_min)
return (data_min, data_max, data_normalised)
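
To make the normalisation concrete, here is a small example of zero_base_standardise on a hypothetical three-row price window (the column name matches configs.json; the prices are invented for illustration). Each value becomes a percentage return relative to the first row of the window, which is the abs_base that the x and y windows share in clean_data.

import pandas as pd
from etl import ETL

#Hypothetical window of Close prices (values invented for illustration)
window = pd.DataFrame({'Close': [2500.0, 2525.0, 2450.0]})

dl = ETL()
abs_base, standardised = dl.zero_base_standardise(window)
#abs_base is the first row of the window (Close = 2500.0)
#standardised['Close'] is [0.0, 0.01, -0.02], i.e. (price / 2500.0) - 1
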
47 changes: 47 additions & 0 deletions lstm.py
@@ -0,0 +1,47 @@
import os
import time
import json
import warnings
import numpy as np
from numpy import newaxis
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.models import load_model

configs = json.loads(open(os.path.join(os.path.dirname(__file__), 'configs.json')).read())
warnings.filterwarnings("ignore") #Hide messy Numpy warnings

def build_network(layers):
model = Sequential()

model.add(LSTM(
input_dim=layers[0],
output_dim=layers[1],
return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(
layers[2],
return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(
output_dim=layers[3]))
model.add(Activation("tanh"))

start = time.time()
model.compile(
loss=configs['model']['loss_function'],
optimizer=configs['model']['optimiser_function'])

print("> Compilation Time : ", time.time() - start)
return model

def load_network(filename):
#Load the h5 saved model and weights
if(os.path.isfile(filename)):
return load_model(filename)
else:
print('ERROR: "' + filename + '" file does not exist as a h5 model')
return None
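
run.py below builds this network as lstm.build_network([ncols, 150, 150, 1]), where ncols is the number of input features, so with the four filter_columns from configs.json the call looks like this sketch (assuming the Keras 1-style input_dim/output_dim arguments used above are supported by the installed Keras version):

from lstm import build_network

#Two stacked LSTM layers of 150 units each, followed by a single tanh output unit
model = build_network([4, 150, 150, 1])
model.summary()
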
132 changes: 132 additions & 0 deletions run.py
@@ -0,0 +1,132 @@
import time
import threading
import lstm, etl, json
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
configs = json.loads(open('configs.json').read())
tstart = time.time()

def plot_results(predicted_data, true_data):
fig=plt.figure(figsize=(18, 12), dpi= 80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.plot(true_data, label='True Data')
plt.plot(predicted_data, label='Prediction')
plt.legend()
plt.show()

def predict_sequences_multiple(model, data, window_size, prediction_len):
    #Predict a sequence of prediction_len steps before shifting the prediction run forward by prediction_len steps
prediction_seqs = []
for i in range(int(len(data)/prediction_len)):
curr_frame = data[i*prediction_len]
predicted = []
for j in range(prediction_len):
predicted.append(model.predict(curr_frame[np.newaxis,:,:])[0,0])
curr_frame = curr_frame[1:]
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
prediction_seqs.append(predicted)
return prediction_seqs

def plot_results_multiple(predicted_data, true_data, prediction_len):
fig=plt.figure(figsize=(18, 12), dpi= 80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.plot(true_data, label='True Data')
    #Pad the list of predictions to shift it in the graph to its correct start
for i, data in enumerate(predicted_data):
padding = [None for p in range(i * prediction_len)]
plt.plot(padding + data, label='Prediction')
plt.legend()
plt.show()

true_values = []
def generator_strip_xy(data_gen, true_values):
    for x, y in data_gen:
true_values += list(y)
yield x

def fit_model_threaded(model, data_gen_train, steps_per_epoch, configs):
"""thread worker for model fitting - so it doesn't freeze on jupyter notebook"""
model = lstm.build_network([ncols, 150, 150, 1])
model.fit_generator(
data_gen_train,
steps_per_epoch=steps_per_epoch,
epochs=configs['model']['epochs']
)
model.save(configs['model']['filename_model'])
print('> Model Trained! Weights saved in', configs['model']['filename_model'])
return

dl = etl.ETL()
dl.create_clean_datafile(
filename_in = configs['data']['filename'],
filename_out = configs['data']['filename_clean'],
batch_size = configs['data']['batch_size'],
x_window_size = configs['data']['x_window_size'],
y_window_size = configs['data']['y_window_size'],
y_col = configs['data']['y_predict_column'],
filter_cols = configs['data']['filter_columns'],
normalise = True
)

print('> Generating clean data from:', configs['data']['filename_clean'], 'with batch_size:', configs['data']['batch_size'])

data_gen_train = dl.generate_clean_data(
configs['data']['filename_clean'],
batch_size=configs['data']['batch_size']
)

with h5py.File(configs['data']['filename_clean'], 'r') as hf:
nrows = hf['x'].shape[0]
ncols = hf['x'].shape[2]

ntrain = int(configs['data']['train_test_split'] * nrows)
steps_per_epoch = int((ntrain / configs['model']['epochs']) / configs['data']['batch_size'])
print('> Clean data has', nrows, 'data rows. Training on', ntrain, 'rows with', steps_per_epoch, 'steps-per-epoch')

model = lstm.build_network([ncols, 150, 150, 1])
t = threading.Thread(target=fit_model_threaded, args=[model, data_gen_train, steps_per_epoch, configs])
t.start()

data_gen_test = dl.generate_clean_data(
configs['data']['filename_clean'],
batch_size=configs['data']['batch_size'],
start_index=ntrain
)

ntest = nrows - ntrain
steps_test = int(ntest / configs['data']['batch_size'])
print('> Testing model on', ntest, 'data rows with', steps_test, 'steps')

predictions = model.predict_generator(
generator_strip_xy(data_gen_test, true_values),
steps=steps_test
)

#Save our predictions
with h5py.File(configs['model']['filename_predictions'], 'w') as hf:
dset_p = hf.create_dataset('predictions', data=predictions)
dset_y = hf.create_dataset('true_values', data=true_values)

plot_results(predictions[:800], true_values[:800])

#Reload the data-generator
data_gen_test = dl.generate_clean_data(
configs['data']['filename_clean'],
batch_size=800,
start_index=ntrain
)
data_x, true_values = next(data_gen_test)
window_size = 50 #number of steps to predict into the future

#We are going to cheat a bit here and just take the next batch of test windows from the generator and predict that data as a whole
predictions_multiple = predict_sequences_multiple(
model,
data_x,
data_x[0].shape[0],
window_size
)

plot_results_multiple(predictions_multiple, true_values, window_size)
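
The training and testing schedule above comes from a little arithmetic on the clean HDF5 file. A worked example with a hypothetical window count (the real nrows depends on bitcoin.csv; batch_size, epochs and train_test_split come from configs.json):

#Hypothetical figures for illustration only
nrows = 200000            #number of (x, y) windows in clean_data.h5 (invented)
train_test_split = 0.8
batch_size = 100
epochs = 2

ntrain = int(train_test_split * nrows)                 #160000 training windows
steps_per_epoch = int((ntrain / epochs) / batch_size)  #800 generator steps per epoch
steps_test = int((nrows - ntrain) / batch_size)        #400 generator steps for testing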
