In [1]:
import numpy as np #Import relevant packages
import pandas as pd
import math
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import glob
import csv
import matplotlib.pyplot as plt

In [2]:
def unpack_data(filepath):
    """Load every CSV in a directory into a dict of DataFrames, one per run.

    One dictionary is produced per generation (i.e. per directory).

    Parameters
    ----------
    filepath : str
        Directory containing the ``*.csv`` files.

    Returns
    -------
    dict
        Maps each file's base name (directory and ``.csv`` extension
        stripped) to the DataFrame read from it, with the columns
        "Unnamed: 0", "volume", "spread", "10_MA" and "50_MA" removed.

    Raises
    ------
    KeyError
        If a CSV does not contain every one of the dropped columns.
    """
    import os  # local import: only needed for the path handling below

    dropped_columns = ["Unnamed: 0", "volume", "spread", "10_MA", "50_MA"]
    datadict = {}
    # Sort the matches so the run order (and therefore the order of samples
    # built from this dict) does not depend on the filesystem's listing order.
    for csv_path in sorted(glob.glob(os.path.join(filepath, "*.csv"))):
        # Derive the key from the file name itself rather than a fixed
        # character offset, so the function works for any directory.
        run_name = os.path.splitext(os.path.basename(csv_path))[0]
        datadict[run_name] = pd.read_csv(csv_path, header=0).drop(columns=dropped_columns)
    return datadict

def format_data(datadict, window):
    """Turn a dict of run DataFrames into sliding-window samples and targets.

    For every DataFrame in ``datadict`` and every row position ``i`` with
    ``i >= window``, the ``window`` rows preceding ``i`` (all columns) form
    one sample and row ``i``'s "trading_price" is its target.

    Parameters
    ----------
    datadict : dict
        Values are DataFrames containing a "trading_price" column.
    window : int
        Number of preceding rows in each sample.

    Returns
    -------
    tuple
        ``(x_data, y_data)`` as NumPy arrays; ``y_data`` is a column of
        shape ``(n_samples, 1)``.
    """
    samples = []
    targets = []
    for frame in datadict.values():
        # Start at the first row that has a full look-back window behind it.
        for end in range(max(window, 0), len(frame)):
            targets.append(frame.iloc[end]["trading_price"])
            samples.append(frame.iloc[end - window:end].to_numpy())
    x_data = np.array(samples)
    y_data = np.array(targets)
    return x_data, np.reshape(y_data, (y_data.shape[0], 1))

def normalise_data(xarray, yarray, xsc=None, ysc=None):
    """Take the natural log of the data and then standardise it.

    Parameters
    ----------
    xarray : numpy.ndarray
        3-D array unpacked as ``(instances, timesteps, features)``.
    yarray : numpy.ndarray
        2-D target array passed to the scaler as-is after the log.
    xsc, ysc : fitted scaler objects, optional
        Scalers exposing ``transform``. When given they are applied without
        refitting, so a test split can be scaled with the statistics learned
        from the training split. When omitted (the default) a new
        ``StandardScaler`` is fitted on the supplied data.

    Returns
    -------
    tuple
        ``(xarray_norm, yarray_norm, xsc, ysc)`` - the scaled arrays (same
        shapes as the inputs) and the scalers that were used.
    """
    yarray_log = np.log(yarray)
    xarray_log = np.log(xarray)
    instances, timesteps, features = xarray_log.shape
    # The scaler works on 2-D (rows, features) input, so merge the instance
    # and timestep axes for scaling and restore the 3-D shape afterwards.
    x_flat = np.reshape(xarray_log, (-1, features))
    if xsc is None:
        xsc = StandardScaler()
        x_flat_norm = xsc.fit_transform(x_flat)
    else:
        x_flat_norm = xsc.transform(x_flat)
    xarray_norm = np.reshape(x_flat_norm, (instances, timesteps, features))
    if ysc is None:
        ysc = StandardScaler().fit(yarray_log)
    yarray_norm = ysc.transform(yarray_log)
    return xarray_norm, yarray_norm, xsc, ysc

def split_data(xarray, yarray, trainratio):
    """Split features and targets into leading training and trailing test parts.

    The first ``int(len(xarray) * trainratio)`` entries of each array go to
    the training set and the remainder to the test set; order is preserved.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    cut = int(len(xarray) * trainratio)
    x_train, x_test = xarray[:cut], xarray[cut:]
    y_train, y_test = yarray[:cut], yarray[cut:]
    return x_train, y_train, x_test, y_test

In [None]:
# Load every CSV under the gen2 simulation data directory into a dict of DataFrames.
# NOTE(review): this is an absolute, machine-specific path - the notebook will not run elsewhere as written.
gen2 = unpack_data("/Users/karangarg/Documents/Year 3 Modules/EC331/Code/rae_repo/gen2_sims/data")

In [None]:
# Build sliding-window samples with a 20-row look-back: each X3 entry holds the
# 20 rows preceding the row whose trading_price is stored in y3.
X3, y3 = format_data(gen2, 20)

In [None]:
# Keep the first 80% of samples for training and the rest for testing (split by position, no shuffling).
X3_train, y3_train, X3_test, y3_test = split_data(X3, y3, 0.8)

In [None]:
# Fit the scalers on the training split only.
X3_train, y3_train, X3scale, y3scale = normalise_data(X3_train, y3_train)

# Scale the test split with the *training* scalers. Fitting separate scalers on
# the test data would put it on a different scale from the one the model is
# trained on, and would make the later y3scale.inverse_transform(y3_test) wrong.
X3_test_log = np.log(X3_test)
X3_test = X3scale.transform(
    X3_test_log.reshape(-1, X3_test_log.shape[-1])  # scaler expects 2-D (rows, features)
).reshape(X3_test_log.shape)
y3_test = y3scale.transform(np.log(y3_test))

In [None]:
# Inspect the mean and variance stored on the scalers returned by normalise_data
# for the training split (computed on the log-transformed data).
print(X3scale.mean_)
print(X3scale.var_)
print(y3scale.mean_)
print(y3scale.var_)
#print(X3_train)
#print(y3_train)

In [None]:
lstm3 = keras.Sequential()  # Set up the architecture of the model

# First LSTM layer returns the full sequence so it can feed the stacked LSTM below.
lstm3.add(layers.LSTM(units=16, return_sequences=True, input_shape=(X3_train.shape[1], X3_train.shape[2])))
lstm3.add(layers.Dropout(0.2))

# Final LSTM layer returns only its last output, so the network produces one
# value per window and matches the (samples, 1) targets. With
# return_sequences=True the Dense head would emit one value per timestep.
lstm3.add(layers.LSTM(units=16, return_sequences=False))
lstm3.add(layers.Dropout(0.2))

# Single linear unit: the predicted (scaled) trading price.
lstm3.add(layers.Dense(units=1))

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
lstm3.summary()

In [None]:
# Compile and train the model: Nadam optimiser with mean-squared-error loss.
lstm3.compile(optimizer='nadam', loss='mean_squared_error') #Compile and train the model

# Train for 10 epochs in batches of 32, holding out a 0.2 fraction of the training
# data for validation; shuffle=False keeps the samples in their original order.
history3 = lstm3.fit(X3_train, y3_train, epochs = 10, batch_size = 32, validation_split=0.2, shuffle=False)

In [None]:
# Persist the trained model to disk.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
lstm3.save("/Users/karangarg/Documents/Year 3 Modules/EC331/Code/rae_repo/lstms/lstm3")

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(15, 6))
plt.plot(history3.history["loss"], label="train")
plt.plot(history3.history["val_loss"], label="validation")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()  # must be called; a bare `plt.legend` draws nothing

In [None]:
y3_pred = lstm3.predict(X3_test)

# y3scale was fitted on a single target column, so its input must be shaped
# (n_samples, 1); reshape(1, -1) would present the data as one row with
# n_samples features instead.
# NOTE(review): this only undoes the standardisation - the values remain
# natural logs of the price, as produced by normalise_data.
y3_train_inv = y3scale.inverse_transform(y3_train.reshape(-1, 1))
y3_test_inv = y3scale.inverse_transform(y3_test.reshape(-1, 1))
y3_pred_inv = y3scale.inverse_transform(y3_pred)

In [None]:
# Overlay the inverse-scaled test targets and the model's predictions.
# NOTE(review): inverse_transform only undoes the standardisation, so both series
# are presumably still log prices - confirm whether np.exp is wanted before plotting.
plt.figure(figsize=(15,6))
plt.plot(y3_test_inv.flatten(), label="Test data")
plt.plot(y3_pred_inv.flatten(), label="Model prediction")
plt.legend()