Commit
Showing 8 changed files with 920 additions and 2 deletions.
README.md
@@ -1,2 +1,5 @@
-# Multidimensional-LSTM-BitCoin-Time-Series
-Using multidimensional LSTM neural networks to create a forecast for Bitcoin price
+# Multidimensional LSTM BitCoin Time Series
+
+Using multidimensional LSTM neural networks to create a forecast for Bitcoin price.
+
+For notes around this code and a general explanation of the theory, please see my original article [HERE](http://www.jakob-aungiers.com/articles/a/Multidimensional-LSTM-Networks-to-Predict-Bitcoin-Price)
configs.json
@@ -0,0 +1,24 @@
{
    "data": {
        "filename": "data/bitcoin.csv",
        "filename_clean": "data/clean_data.h5",
        "filter_columns": [
            "Open",
            "Close",
            "Volume_(BTC)",
            "Volume_(Currency)"
        ],
        "batch_size": 100,
        "train_test_split": 0.8,
        "x_window_size": 150,
        "y_window_size": 1,
        "y_predict_column": "Close"
    },
    "model": {
        "epochs": 2,
        "loss_function": "mse",
        "optimiser_function": "Nadam",
        "filename_model": "data/model_saved.h5",
        "filename_predictions": "data/model_predictions.h5"
    }
}
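A minimal sketch of how these settings are consumed: run.py reads this file with json.loads and indexes into the 'data' and 'model' sections, so any value above can be changed without touching the code. Paths are relative to the repo root.

import json

configs = json.loads(open('configs.json').read())
print(configs['data']['x_window_size'])        #150 input timesteps per training window
print(configs['model']['optimiser_function'])  #'Nadam', passed straight to model.compile()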
Binary file not shown.
etl.py
@@ -0,0 +1,111 @@
import h5py
import numpy as np
import pandas as pd

class ETL:
    """Extract-Transform-Load class for all data operations prior to model input.
    Data is read via generators to allow for large datafiles and low memory utilisation."""

    def generate_clean_data(self, filename, batch_size=1000, start_index=0):
        #Stream (x, y) batches from the clean HDF5 datafile. Yields indefinitely;
        #the consumer (e.g. Keras fit_generator) controls how many steps are drawn
        with h5py.File(filename, 'r') as hf:
            i = start_index
            while True:
                data_x = hf['x'][i:i+batch_size]
                data_y = hf['y'][i:i+batch_size]
                i += batch_size
                yield (data_x, data_y)

    def create_clean_datafile(self, filename_in, filename_out, batch_size=1000, x_window_size=100, y_window_size=1, y_col=0, filter_cols=None, normalise=True):
        """Incrementally save a datafile of clean data ready for loading straight into the model"""
        print('> Creating x & y data files...')

        data_gen = self.clean_data(
            filename_in,
            batch_size=batch_size,
            x_window_size=x_window_size,
            y_window_size=y_window_size,
            y_col=y_col,
            filter_cols=filter_cols,
            normalise=normalise
        )

        i = 0
        with h5py.File(filename_out, 'w') as hf:
            x1, y1 = next(data_gen)
            #Initialise hdf5 x, y datasets with first chunk of data
            rcount_x = x1.shape[0]
            dset_x = hf.create_dataset('x', shape=x1.shape, maxshape=(None, x1.shape[1], x1.shape[2]), chunks=True)
            dset_x[:] = x1
            rcount_y = y1.shape[0]
            dset_y = hf.create_dataset('y', shape=y1.shape, maxshape=(None,), chunks=True)
            dset_y[:] = y1

            for x_batch, y_batch in data_gen:
                #Append batches to x, y hdf5 datasets
                print('> Creating x & y data files | Batch:', i, end='\r')
                dset_x.resize(rcount_x + x_batch.shape[0], axis=0)
                dset_x[rcount_x:] = x_batch
                rcount_x += x_batch.shape[0]
                dset_y.resize(rcount_y + y_batch.shape[0], axis=0)
                dset_y[rcount_y:] = y_batch
                rcount_y += y_batch.shape[0]
                i += 1

        print('> Clean datasets created in file `' + filename_out + '`')

    def clean_data(self, filepath, batch_size, x_window_size, y_window_size, y_col, filter_cols, normalise):
        """Cleans and normalises the data, yielding `batch_size` windows at a time"""
        data = pd.read_csv(filepath, index_col=0)

        if(filter_cols):
            #Remove any columns from data that we don't need by getting the difference between cols and filter list
            rm_cols = set(data.columns) - set(filter_cols)
            for col in rm_cols:
                del data[col]

        #Convert y-predict column name to numerical index
        y_col = list(data.columns).index(y_col)

        num_rows = len(data)
        x_data = []
        y_data = []
        i = 0
        while((i+x_window_size+y_window_size) <= num_rows):
            x_window_data = data[i:(i+x_window_size)]
            y_window_data = data[(i+x_window_size):(i+x_window_size+y_window_size)]

            #Remove any windows that contain NaN
            if(x_window_data.isnull().values.any() or y_window_data.isnull().values.any()):
                i += 1
                continue

            if(normalise):
                abs_base, x_window_data = self.zero_base_standardise(x_window_data)
                _, y_window_data = self.zero_base_standardise(y_window_data, abs_base=abs_base)

            #Average of the desired predictor y column
            y_average = np.average(y_window_data.values[:, y_col])
            x_data.append(x_window_data.values)
            y_data.append(y_average)
            i += 1

            #Restrict yielding until we have enough in our batch. Then clear x, y data for next batch
            if(i % batch_size == 0):
                #Convert from list to 3 dimensional numpy array [windows, window_val, val_dimension]
                x_np_arr = np.array(x_data)
                y_np_arr = np.array(y_data)
                x_data = []
                y_data = []
                yield (x_np_arr, y_np_arr)

        #Yield whatever windows remain as a final, possibly partial, batch
        if(x_data):
            yield (np.array(x_data), np.array(y_data))

    def zero_base_standardise(self, data, abs_base=pd.DataFrame()):
        """Standardise dataframe to be zero based percentage returns from i=0"""
        if(abs_base.empty): abs_base = data.iloc[0]
        data_standardised = (data/abs_base)-1
        return (abs_base, data_standardised)

    def min_max_normalise(self, data, data_min=pd.DataFrame(), data_max=pd.DataFrame()):
        """Normalise a Pandas dataframe using column-wise min-max normalisation (can use custom min, max if desired)"""
        if(data_min.empty): data_min = data.min()
        if(data_max.empty): data_max = data.max()
        data_normalised = (data-data_min)/(data_max-data_min)
        return (data_min, data_max, data_normalised)
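For reference, a minimal usage sketch of the ETL class, mirroring what run.py does with the values from configs.json: build the windowed HDF5 datafile once, then stream (x, y) batches from it. Note that zero_base_standardise divides each window by its own first row and subtracts 1, so every window starts at 0 and values are percentage returns; a Close of 105 against a window base of 100 becomes 0.05.

import etl

dl = etl.ETL()
dl.create_clean_datafile(
    filename_in='data/bitcoin.csv',
    filename_out='data/clean_data.h5',
    batch_size=100,
    x_window_size=150,
    y_window_size=1,
    y_col='Close',
    filter_cols=['Open', 'Close', 'Volume_(BTC)', 'Volume_(Currency)'],
    normalise=True
)

data_gen = dl.generate_clean_data('data/clean_data.h5', batch_size=100)
x_batch, y_batch = next(data_gen)
print(x_batch.shape)  #(100, 150, 4): windows, timesteps per window, feature columns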
lstm.py
@@ -0,0 +1,47 @@
import os
import time
import json
import warnings
import numpy as np
from numpy import newaxis
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.models import load_model

configs = json.loads(open(os.path.join(os.path.dirname(__file__), 'configs.json')).read())
warnings.filterwarnings("ignore")  #Hide messy Numpy warnings

def build_network(layers):
    #layers = [input features, LSTM 1 units, LSTM 2 units, output dimension]
    model = Sequential()

    model.add(LSTM(
        input_dim=layers[0],
        output_dim=layers[1],
        return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(
        layers[2],
        return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(
        output_dim=layers[3]))
    model.add(Activation("tanh"))

    start = time.time()
    model.compile(
        loss=configs['model']['loss_function'],
        optimizer=configs['model']['optimiser_function'])

    print("> Compilation Time : ", time.time() - start)
    return model

def load_network(filename):
    #Load the h5 saved model and weights
    if(os.path.isfile(filename)):
        return load_model(filename)
    else:
        print('ERROR: "' + filename + '" file does not exist as an h5 model')
        return None
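A minimal usage sketch of build_network, matching how run.py calls it: the first element is the number of input features (the 4 filtered columns), the middle two are the LSTM layer sizes, and the last is the output dimension.

import lstm

model = lstm.build_network([4, 150, 150, 1])  #4 input features -> two 150-unit LSTMs -> 1 output
model.summary()
#A previously trained model can be restored from disk with:
#model = lstm.load_network('data/model_saved.h5')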
run.py
@@ -0,0 +1,132 @@
import time
import threading
import lstm, etl, json
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt

configs = json.loads(open('configs.json').read())
tstart = time.time()

def plot_results(predicted_data, true_data):
    fig = plt.figure(figsize=(18, 12), dpi=80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    plt.plot(predicted_data, label='Prediction')
    plt.legend()
    plt.show()

def predict_sequences_multiple(model, data, window_size, prediction_len):
    #Predict a sequence of prediction_len steps before shifting the prediction run forward by prediction_len steps
    prediction_seqs = []
    for i in range(int(len(data)/prediction_len)):
        curr_frame = data[i*prediction_len]
        predicted = []
        for j in range(prediction_len):
            predicted.append(model.predict(curr_frame[np.newaxis,:,:])[0,0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    return prediction_seqs

def plot_results_multiple(predicted_data, true_data, prediction_len):
    fig = plt.figure(figsize=(18, 12), dpi=80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    #Pad the list of predictions to shift it in the graph to its correct start
    for i, data in enumerate(predicted_data):
        padding = [None for p in range(i * prediction_len)]
        plt.plot(padding + data, label='Prediction')
    plt.legend()
    plt.show()

true_values = []
def generator_strip_xy(data_gen, true_values):
    #Strip the y values out of the test generator, recording them for later comparison
    for x, y in data_gen:
        true_values += list(y)
        yield x

def fit_model_threaded(model, data_gen_train, steps_per_epoch, configs):
    """Thread worker for model fitting - so it doesn't freeze on jupyter notebook"""
    model.fit_generator(
        data_gen_train,
        steps_per_epoch=steps_per_epoch,
        epochs=configs['model']['epochs']
    )
    model.save(configs['model']['filename_model'])
    print('> Model Trained! Weights saved in', configs['model']['filename_model'])
    return

dl = etl.ETL()
dl.create_clean_datafile(
    filename_in=configs['data']['filename'],
    filename_out=configs['data']['filename_clean'],
    batch_size=configs['data']['batch_size'],
    x_window_size=configs['data']['x_window_size'],
    y_window_size=configs['data']['y_window_size'],
    y_col=configs['data']['y_predict_column'],
    filter_cols=configs['data']['filter_columns'],
    normalise=True
)

print('> Generating clean data from:', configs['data']['filename_clean'], 'with batch_size:', configs['data']['batch_size'])

data_gen_train = dl.generate_clean_data(
    configs['data']['filename_clean'],
    batch_size=configs['data']['batch_size']
)

with h5py.File(configs['data']['filename_clean'], 'r') as hf:
    nrows = hf['x'].shape[0]
    ncols = hf['x'].shape[2]

ntrain = int(configs['data']['train_test_split'] * nrows)
steps_per_epoch = int((ntrain / configs['model']['epochs']) / configs['data']['batch_size'])
print('> Clean data has', nrows, 'data rows. Training on', ntrain, 'rows with', steps_per_epoch, 'steps-per-epoch')

model = lstm.build_network([ncols, 150, 150, 1])
t = threading.Thread(target=fit_model_threaded, args=[model, data_gen_train, steps_per_epoch, configs])
t.start()
t.join()  #Wait for the training thread to finish before testing

data_gen_test = dl.generate_clean_data(
    configs['data']['filename_clean'],
    batch_size=configs['data']['batch_size'],
    start_index=ntrain
)

ntest = nrows - ntrain
steps_test = int(ntest / configs['data']['batch_size'])
print('> Testing model on', ntest, 'data rows with', steps_test, 'steps')

predictions = model.predict_generator(
    generator_strip_xy(data_gen_test, true_values),
    steps=steps_test
)

#Save our predictions
with h5py.File(configs['model']['filename_predictions'], 'w') as hf:
    dset_p = hf.create_dataset('predictions', data=predictions)
    dset_y = hf.create_dataset('true_values', data=true_values)

plot_results(predictions[:800], true_values[:800])

#Reload the data-generator
data_gen_test = dl.generate_clean_data(
    configs['data']['filename_clean'],
    batch_size=800,
    start_index=ntrain
)
data_x, true_values = next(data_gen_test)
window_size = 50  #number of steps to predict into the future

#We are going to cheat a bit here and just take the next 400 steps from the testing generator and predict that data in its whole
predictions_multiple = predict_sequences_multiple(
    model,
    data_x,
    data_x[0].shape[0],
    window_size
)

plot_results_multiple(predictions_multiple, true_values, window_size)
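To make the rolling-window mechanics in predict_sequences_multiple concrete, here is a tiny standalone trace with toy values (no model involved): each step drops the oldest timestep and appends the latest prediction, so later predictions are made from the model's own prior outputs.

import numpy as np

window_size = 4
curr_frame = np.zeros((window_size, 2))  #one window of x data: (timesteps, features)
fake_prediction = 0.05                   #stand-in for model.predict(...)[0,0]

curr_frame = curr_frame[1:]  #drop the oldest timestep
curr_frame = np.insert(curr_frame, [window_size-1], fake_prediction, axis=0)
print(curr_frame.shape)  #(4, 2) again; the new last row is filled with the prediction
print(curr_frame[-1])    #[0.05 0.05]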