
Commit

Initial code and data commit
jaungiers committed Jul 15, 2017
1 parent b0a6905 commit a8aaab3
Showing 8 changed files with 920 additions and 2 deletions.
354 changes: 354 additions & 0 deletions Bitcoin LSTM Prediction.ipynb

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions README.md
@@ -1,2 +1,5 @@
# Multidimensional-LSTM-BitCoin-Time-Series
Using multidimensional LSTM neural networks to create a forecast for Bitcoin price
# Multidimensional LSTM BitCoin Time Series

Using multidimensional LSTM neural networks to create a forecast for Bitcoin price.

For notes on this code and a general explanation of the theory, please see my original article [HERE](http://www.jakob-aungiers.com/articles/a/Multidimensional-LSTM-Networks-to-Predict-Bitcoin-Price).
247 changes: 247 additions & 0 deletions Saved Predictions.ipynb

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions configs.json
@@ -0,0 +1,24 @@
{
"data": {
"filename": "data/bitcoin.csv",
"filename_clean": "data/clean_data.h5",
"filter_columns": [
"Open",
"Close",
"Volume_(BTC)",
"Volume_(Currency)"
],
"batch_size": 100,
"train_test_split": 0.8,
"x_window_size": 150,
"y_window_size": 1,
"y_predict_column": "Close"
},
"model": {
"epochs": 2,
"loss_function": "mse",
"optimiser_function": "Nadam",
"filename_model": "data/model_saved.h5",
"filename_predictions": "data/model_predictions.h5"
}
}
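
For reference, both lstm.py and run.py below read this file with json.loads(open(...).read()). A minimal sketch of loading the same settings (field names taken from the file above):

import json

with open('configs.json') as f:
    configs = json.load(f)

print(configs['data']['x_window_size'])        # 150 input timesteps per window
print(configs['data']['filter_columns'])       # the four price/volume columns kept from bitcoin.csv
print(configs['model']['optimiser_function'])  # "Nadam"
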
Binary file added data/bitcoin.zip
Binary file not shown.
111 changes: 111 additions & 0 deletions etl.py
@@ -0,0 +1,111 @@
import h5py
import numpy as np
import pandas as pd

class ETL:
"""Extract Transform Load class for all data operations pre model inputs. Data is read in generative way to allow for large datafiles and low memory utilisation"""

def generate_clean_data(self, filename, batch_size=1000, start_index=0):
with h5py.File(filename, 'r') as hf:
i = start_index
while True:
data_x = hf['x'][i:i+batch_size]
data_y = hf['y'][i:i+batch_size]
i += batch_size
yield (data_x, data_y)

def create_clean_datafile(self, filename_in, filename_out, batch_size=1000, x_window_size=100, y_window_size=1, y_col=0, filter_cols=None, normalise=True):
"""Incrementally save a datafile of clean data ready for loading straight into model"""
print('> Creating x & y data files...')

data_gen = self.clean_data(
filename_in,
batch_size = batch_size,
x_window_size = x_window_size,
y_window_size = y_window_size,
y_col = y_col,
filter_cols = filter_cols,
            normalise = normalise
)

i = 0
with h5py.File(filename_out, 'w') as hf:
x1, y1 = next(data_gen)
#Initialise hdf5 x, y datasets with first chunk of data
rcount_x = x1.shape[0]
dset_x = hf.create_dataset('x', shape=x1.shape, maxshape=(None, x1.shape[1], x1.shape[2]), chunks=True)
dset_x[:] = x1
rcount_y = y1.shape[0]
dset_y = hf.create_dataset('y', shape=y1.shape, maxshape=(None,), chunks=True)
dset_y[:] = y1

for x_batch, y_batch in data_gen:
#Append batches to x, y hdf5 datasets
print('> Creating x & y data files | Batch:', i, end='\r')
dset_x.resize(rcount_x + x_batch.shape[0], axis=0)
dset_x[rcount_x:] = x_batch
rcount_x += x_batch.shape[0]
dset_y.resize(rcount_y + y_batch.shape[0], axis=0)
dset_y[rcount_y:] = y_batch
rcount_y += y_batch.shape[0]
i += 1

        print('> Clean datasets created in file `' + filename_out + '`')

def clean_data(self, filepath, batch_size, x_window_size, y_window_size, y_col, filter_cols, normalise):
"""Cleans and Normalises the data in batches `batch_size` at a time"""
data = pd.read_csv(filepath, index_col=0)

if(filter_cols):
#Remove any columns from data that we don't need by getting the difference between cols and filter list
rm_cols = set(data.columns) - set(filter_cols)
for col in rm_cols:
del data[col]

#Convert y-predict column name to numerical index
y_col = list(data.columns).index(y_col)

num_rows = len(data)
x_data = []
y_data = []
i = 0
while((i+x_window_size+y_window_size) <= num_rows):
x_window_data = data[i:(i+x_window_size)]
y_window_data = data[(i+x_window_size):(i+x_window_size+y_window_size)]

#Remove any windows that contain NaN
if(x_window_data.isnull().values.any() or y_window_data.isnull().values.any()):
i += 1
continue

if(normalise):
abs_base, x_window_data = self.zero_base_standardise(x_window_data)
_, y_window_data = self.zero_base_standardise(y_window_data, abs_base=abs_base)

            #Average of the desired predictor y column
y_average = np.average(y_window_data.values[:, y_col])
x_data.append(x_window_data.values)
y_data.append(y_average)
i += 1

#Restrict yielding until we have enough in our batch. Then clear x, y data for next batch
if(i % batch_size == 0):
#Convert from list to 3 dimensional numpy array [windows, window_val, val_dimension]
x_np_arr = np.array(x_data)
y_np_arr = np.array(y_data)
x_data = []
y_data = []
yield (x_np_arr, y_np_arr)

def zero_base_standardise(self, data, abs_base=pd.DataFrame()):
"""Standardise dataframe to be zero based percentage returns from i=0"""
if(abs_base.empty): abs_base = data.iloc[0]
data_standardised = (data/abs_base)-1
return (abs_base, data_standardised)

def min_max_normalise(self, data, data_min=pd.DataFrame(), data_max=pd.DataFrame()):
"""Normalise a Pandas dataframe using column-wise min-max normalisation (can use custom min, max if desired)"""
if(data_min.empty): data_min = data.min()
if(data_max.empty): data_max = data.max()
data_normalised = (data-data_min)/(data_max-data_min)
return (data_min, data_max, data_normalised)
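
To make the normalisation concrete, here is a small example of zero_base_standardise on a hypothetical three-row price window (the column name matches configs.json; the prices are invented for illustration). Each value becomes a percentage return relative to the first row of the window, which is the abs_base that the x and y windows share in clean_data.

import pandas as pd
from etl import ETL

#Hypothetical window of Close prices (values invented for illustration)
window = pd.DataFrame({'Close': [2500.0, 2525.0, 2450.0]})

dl = ETL()
abs_base, standardised = dl.zero_base_standardise(window)
#abs_base is the first row of the window (Close = 2500.0)
#standardised['Close'] is [0.0, 0.01, -0.02], i.e. (price / 2500.0) - 1
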
47 changes: 47 additions & 0 deletions lstm.py
@@ -0,0 +1,47 @@
import os
import time
import json
import warnings
import numpy as np
from numpy import newaxis
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.models import load_model

configs = json.loads(open(os.path.join(os.path.dirname(__file__), 'configs.json')).read())
warnings.filterwarnings("ignore") #Hide messy Numpy warnings

def build_network(layers):
model = Sequential()

model.add(LSTM(
input_dim=layers[0],
output_dim=layers[1],
return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(
layers[2],
return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(
output_dim=layers[3]))
model.add(Activation("tanh"))

start = time.time()
model.compile(
loss=configs['model']['loss_function'],
optimizer=configs['model']['optimiser_function'])

print("> Compilation Time : ", time.time() - start)
return model

def load_network(filename):
#Load the h5 saved model and weights
if(os.path.isfile(filename)):
return load_model(filename)
else:
print('ERROR: "' + filename + '" file does not exist as a h5 model')
return None
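
run.py below builds this network as lstm.build_network([ncols, 150, 150, 1]), where ncols is the number of input features, so with the four filter_columns from configs.json the call looks like this sketch (assuming the Keras 1-style input_dim/output_dim arguments used above are supported by the installed Keras version):

from lstm import build_network

#Two stacked LSTM layers of 150 units each, followed by a single tanh output unit
model = build_network([4, 150, 150, 1])
model.summary()
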
132 changes: 132 additions & 0 deletions run.py
@@ -0,0 +1,132 @@
import time
import threading
import lstm, etl, json
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
configs = json.loads(open('configs.json').read())
tstart = time.time()

def plot_results(predicted_data, true_data):
fig=plt.figure(figsize=(18, 12), dpi= 80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.plot(true_data, label='True Data')
plt.plot(predicted_data, label='Prediction')
plt.legend()
plt.show()

def predict_sequences_multiple(model, data, window_size, prediction_len):
    #Predict a sequence of prediction_len steps before shifting the prediction run forward by prediction_len steps
prediction_seqs = []
for i in range(int(len(data)/prediction_len)):
curr_frame = data[i*prediction_len]
predicted = []
for j in range(prediction_len):
predicted.append(model.predict(curr_frame[np.newaxis,:,:])[0,0])
curr_frame = curr_frame[1:]
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
prediction_seqs.append(predicted)
return prediction_seqs

def plot_results_multiple(predicted_data, true_data, prediction_len):
fig=plt.figure(figsize=(18, 12), dpi= 80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.plot(true_data, label='True Data')
    #Pad the list of predictions to shift it in the graph to its correct start
for i, data in enumerate(predicted_data):
padding = [None for p in range(i * prediction_len)]
plt.plot(padding + data, label='Prediction')
plt.legend()
plt.show()

true_values = []
def generator_strip_xy(data_gen, true_values):
    for x, y in data_gen:
true_values += list(y)
yield x

def fit_model_threaded(model, data_gen_train, steps_per_epoch, configs):
"""thread worker for model fitting - so it doesn't freeze on jupyter notebook"""
model = lstm.build_network([ncols, 150, 150, 1])
model.fit_generator(
data_gen_train,
steps_per_epoch=steps_per_epoch,
epochs=configs['model']['epochs']
)
model.save(configs['model']['filename_model'])
print('> Model Trained! Weights saved in', configs['model']['filename_model'])
return

dl = etl.ETL()
dl.create_clean_datafile(
filename_in = configs['data']['filename'],
filename_out = configs['data']['filename_clean'],
batch_size = configs['data']['batch_size'],
x_window_size = configs['data']['x_window_size'],
y_window_size = configs['data']['y_window_size'],
y_col = configs['data']['y_predict_column'],
filter_cols = configs['data']['filter_columns'],
normalise = True
)

print('> Generating clean data from:', configs['data']['filename_clean'], 'with batch_size:', configs['data']['batch_size'])

data_gen_train = dl.generate_clean_data(
configs['data']['filename_clean'],
batch_size=configs['data']['batch_size']
)

with h5py.File(configs['data']['filename_clean'], 'r') as hf:
nrows = hf['x'].shape[0]
ncols = hf['x'].shape[2]

ntrain = int(configs['data']['train_test_split'] * nrows)
steps_per_epoch = int((ntrain / configs['model']['epochs']) / configs['data']['batch_size'])
print('> Clean data has', nrows, 'data rows. Training on', ntrain, 'rows with', steps_per_epoch, 'steps-per-epoch')

model = lstm.build_network([ncols, 150, 150, 1])
t = threading.Thread(target=fit_model_threaded, args=[model, data_gen_train, steps_per_epoch, configs])
t.start()

data_gen_test = dl.generate_clean_data(
configs['data']['filename_clean'],
batch_size=configs['data']['batch_size'],
start_index=ntrain
)

ntest = nrows - ntrain
steps_test = int(ntest / configs['data']['batch_size'])
print('> Testing model on', ntest, 'data rows with', steps_test, 'steps')

predictions = model.predict_generator(
generator_strip_xy(data_gen_test, true_values),
steps=steps_test
)

#Save our predictions
with h5py.File(configs['model']['filename_predictions'], 'w') as hf:
dset_p = hf.create_dataset('predictions', data=predictions)
dset_y = hf.create_dataset('true_values', data=true_values)

plot_results(predictions[:800], true_values[:800])

#Reload the data-generator
data_gen_test = dl.generate_clean_data(
configs['data']['filename_clean'],
batch_size=800,
start_index=ntrain
)
data_x, true_values = next(data_gen_test)
window_size = 50 #number of steps to predict into the future

#We are going to cheat a bit here and just take the next batch of test windows from the generator and predict that data as a whole
predictions_multiple = predict_sequences_multiple(
model,
data_x,
data_x[0].shape[0],
window_size
)

plot_results_multiple(predictions_multiple, true_values, window_size)
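
The training and testing schedule above comes from a little arithmetic on the clean HDF5 file. A worked example with a hypothetical window count (the real nrows depends on bitcoin.csv; batch_size, epochs and train_test_split come from configs.json):

#Hypothetical figures for illustration only
nrows = 200000            #number of (x, y) windows in clean_data.h5 (invented)
train_test_split = 0.8
batch_size = 100
epochs = 2

ntrain = int(train_test_split * nrows)                 #160000 training windows
steps_per_epoch = int((ntrain / epochs) / batch_size)  #800 generator steps per epoch
steps_test = int((nrows - ntrain) / batch_size)        #400 generator steps for testing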
