In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Change this flag to False to not plot anything
PLOT = True

# Imports dataset
df = pd.read_csv("spx_data.csv")

# Gets rid of time.
# `columns=` is used instead of a positional axis: every argument of
# DataFrame.drop other than `labels` is keyword-only since pandas 2.0.
data = df.drop(columns="Time")

# SPX closing prices as an (n_rows, 1) column vector
closing_price = data['SPX'].to_numpy().reshape(-1,1)

if PLOT:
  # Shows price action (left axis) together with VIX and VOLSPD (right axis)
  fig,ax1 = plt.subplots()

  ax1.plot(data['SPX'])
  ax1.set_ylabel("SPX")
  ax2 = ax1.twinx()
  ax2.plot(data['VIX'],c='r')
  # VOLSPD is divided by 1e8 before plotting -- presumably so it fits the
  # VIX axis range; TODO confirm the intended scale.
  ax2.plot(data['VOLSPD']/100000000,c='orange')
  ax2.set_ylabel("VIX (red) / VOLSPD * 1e-8 (orange)")
  plt.title("S&P 500 10 min tick")

  plt.show()

  # Shows correlation heat map of all remaining columns
  plt.figure(figsize=(10,8))
  corr = data.corr()
  sns.heatmap(corr, center=0)
  plt.show()

## Data pre-processing, package imports, and helper function definitions

In [2]:
# Initial scaling of data.
# Work on a copy: a plain `train_data = data` is only an alias, so the
# assignments below would overwrite `data` itself and every re-run of this
# cell would scale the already-scaled columns again.
pd.options.mode.chained_assignment = None  # default='warn'
train_data = data.copy()
train_data['VIX'] = 1/85 * train_data['VIX']# Highest ever recorded VIX value
train_data['ADSPD'] = 1/505 * train_data['ADSPD']# Highest number possible for $ADSPD

from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, AveragePooling1D, Flatten, RepeatVector, TimeDistributed


# Creates time series dataset to pass into Conv_LSTM NN
# n_prev: how many previous values to include
# n_fut: how many outputs to include (future data)
def create_time_series(x_data, y_data, n_prev, n_fut=1):
  """Slice aligned sequences into (history window, future window) samples.

  For every split index ``i`` the sample is ``x_data[i-n_prev:i]`` (the
  ``n_prev`` rows before ``i``) paired with ``y_data[i:i+n_fut]`` (the
  ``n_fut`` rows starting at ``i``).

  Args:
    x_data: sliceable sequence of inputs; its length sets the number of rows.
    y_data: sliceable sequence of targets, indexed like ``x_data``.
    n_prev: number of past rows in each input window.
    n_fut: number of future rows in each target window.

  Returns:
    Tuple ``(x, y)`` of numpy arrays whose first axis is the sample index.
  """
  # The last usable split index is len - n_fut (its target window ends exactly
  # at the end of the data), so the exclusive range bound needs the +1;
  # without it the final valid sample is silently dropped.
  split_points = range(n_prev, len(x_data) - n_fut + 1)

  x = [x_data[i-n_prev:i] for i in split_points]
  y = [y_data[i:i+n_fut] for i in split_points]

  return np.array(x),np.array(y)


# Phase 1: MLP model declaration

In [3]:
# Phase 1 model factory: small fully-connected binary classifier
def phase1_mlp():
  """Build, compile and summarise the Phase 1 MLP.

  The network takes 5 input features and stacks three 32-unit Dense layers
  (relu, relu, tanh) followed by a single sigmoid output unit. It is compiled
  with binary cross-entropy loss, the Adam optimizer and an accuracy metric.

  Returns:
    The compiled Keras ``Sequential`` model (its summary is printed first).
  """
  layer_stack = [
    Dense(32, activation="relu", input_dim=5),
    Dense(32, activation="relu"),
    Dense(32, activation="tanh"),
    Dense(1, activation="sigmoid"),
  ]
  model = Sequential(layer_stack)

  model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
  )
  model.summary()

  # Training settings (epochs, batch_size=32) are chosen by the caller
  return model

# Execute 1

In [None]:
model = phase1_mlp()

# Drop the columns not used in Phase 1. `columns=` replaces the positional
# axis argument, which DataFrame.drop no longer accepts since pandas 2.0.
train = train_data.drop(columns=["TVOLSP","Return","Return5h","Return1h", "Return30min"])
train_x = train.iloc[:,:5]  # first five remaining columns are the model inputs
train_y = train.iloc[:,5:]  # everything after them is treated as labels

# NOTE(review): the scaler is fitted on every row of train_x, including the
# rows used for validation below, so validation statistics leak into the
# scaling. Behaviour kept as-is; consider fitting on the training rows only.
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(train_x)[:6916]
y = train_y[:6916]  # only the first 6916 rows are used -- TODO document why

assert x_scaled.shape[0] == y.shape[0]

# Chronological split: first 5500 rows train, the remainder validate.
x_train, x_test = x_scaled[:5500], x_scaled[5500:]
# Only the first label column is used as the binary target.
y_train, y_test = y.iloc[:5500,0], y.iloc[5500:,0]

model.fit(x_train,y_train,epochs=50,batch_size=32,verbose=1,validation_data=(x_test,y_test))

# Phase 1.5: Conv 1D RNN model declaration

In [None]:
# Phase 1.5 model factory: Conv1D feature extractor feeding an LSTM classifier
def phase1_5rnn():
  """Build, compile and summarise the Phase 1.5 Conv1D + LSTM classifier.

  Input windows have shape (300, 5). Two Conv1D layers (128 then 64 filters,
  kernel size 2, relu) are each followed by AveragePooling1D(2, 1); the
  result goes through a 64-unit LSTM and a single sigmoid output unit. The
  model is compiled with binary cross-entropy, Adam and an accuracy metric.

  Returns:
    The compiled Keras ``Sequential`` model (its summary is printed first).
  """
  layer_stack = [
    Conv1D(128, 2, input_shape=(300, 5), activation='relu'),
    AveragePooling1D(2, 1),
    Conv1D(64, 2, activation='relu'),
    AveragePooling1D(2, 1),
    LSTM(64),
    Dense(1, activation="sigmoid"),
  ]
  conv_model = Sequential(layer_stack)

  conv_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
  )
  conv_model.summary()

  return conv_model

# Execute 1.5

In [None]:
# Drop the columns not used in Phase 1.5. `columns=` replaces the positional
# axis argument, which DataFrame.drop no longer accepts since pandas 2.0.
train = train_data.drop(columns=["TVOLSP","Return","Return5h","Return1h", "Return30min"])
train_x = train.iloc[:,:5]  # first five remaining columns are the model inputs
train_y = train.iloc[:,5:]  # everything after them is treated as labels

# NOTE(review): the scaler is fitted on every row of train_x, including the
# rows used for validation below. Behaviour kept as-is.
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(train_x)[:6916]
y = train_y[:6916]
y_data = y.to_numpy()

# Chronological split: first 5500 rows train, the remainder validate.
x_train, x_test = x_scaled[:5500], x_scaled[5500:]
y_train, y_test = y_data[:5500], y_data[5500:]

# Build 300-step input windows, each paired with the label row that follows it.
x_train,y_train = create_time_series(x_train, y_train,300)
x_test,y_test = create_time_series(x_test,y_test,300)

# Collapse the length-1 future axis: (samples, 1, 3) -> (samples, 3).
# The literal 3 assumes three label columns -- TODO confirm against the data.
y_train = y_train.reshape(-1,3)
y_test = y_test.reshape(-1,3)

# A bare expression in the middle of a cell is never displayed, so print it.
print(x_train.shape, y_train.shape)

conv_model = phase1_5rnn()
# Only the first label column is used as the binary target.
conv_model.fit(x_train,y_train[:,0],verbose=1,epochs=50,validation_data=(x_test,y_test[:,0]))

# Phase 2: LSTM model declaration

In [None]:
# Creates LSTM model for regression
def phase2_lstm(n_prev=500, n_features=5, n_fut=100):
  """Build, compile and summarise the Phase 2 encoder-decoder LSTM.

  An LSTM encodes the input window into a single vector, RepeatVector copies
  that vector once per output step, a second LSTM decodes the sequence and
  two TimeDistributed Dense layers map each step to one value.

  Args:
    n_prev: number of time steps in each input window.
    n_features: number of features per time step.
    n_fut: number of future steps the model outputs.

  The defaults are the values that were previously hard-coded in this
  function, so ``phase2_lstm()`` builds the same model as before.

  Returns:
    The Keras ``Sequential`` model compiled with mean squared error and Adam
    (its summary is printed first).
  """
  model = Sequential()
  # Encoder: no return_sequences, so only the final state vector is emitted
  model.add(LSTM(100, activation='relu', input_shape=(n_prev, n_features)))
  # Feed that vector to the decoder once per forecast step
  model.add(RepeatVector(n_fut))
  model.add(LSTM(100, activation='relu', return_sequences=True))
  model.add(TimeDistributed(Dense(50, activation='relu')))
  model.add(TimeDistributed(Dense(1)))
  model.compile(loss='mean_squared_error', optimizer='adam')

  model.summary()

  return model


# Execute 2

In [None]:
# Keep only the input feature columns (returns and trend labels are dropped).
# `columns=` replaces the positional axis argument, which DataFrame.drop no
# longer accepts since pandas 2.0.
train = train_data.drop(columns=["TVOLSP","Return","Return5h","Return1h", "Return30min","Trend5h","Trend1h", "Trend30min"])

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train)

# Separate scaler fitted on SPX alone so predictions can be mapped back to
# price units. Only the fitted state is needed, so `fit` replaces the
# original `fit_transform` whose return value was discarded.
y=train['SPX'].to_numpy().reshape(-1,1)
scaleY = MinMaxScaler()
scaleY.fit(y)

n_prev, n_features, n_fut = 500, 5, 100
epochs, batch_size = 1, 256

# Targets are column 0 of the scaled frame -- assumes SPX is the first column
# of `train`; TODO confirm against the CSV column order.
train_x, train_y = create_time_series(scaled_data, scaled_data[:,0], n_prev, n_fut)
# fit network
model = phase2_lstm()
model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=1)


In [None]:
# Plots 3x3 demo plots of projected vs ground truth
test_x, test_y = train_x[6000:], train_y[6000:]

# Single-window prediction mapped back to price units. Not used by the plots
# below; kept so the name stays available for interactive inspection.
y_hat = scaleY.inverse_transform(model.predict(test_x[0].reshape(1,n_prev,-1))[0])

# First feature column of the first input window, inverse-scaled.
# Also not used by the plots below.
x_orig = scaler.inverse_transform(train_x[0])[:,0]

offset=1000   # index of the first window that is plotted
n_shown = 20  # number of forecast steps drawn in each panel

spx = np.array(train_data['SPX'])

plt.figure(figsize=(10,8))
for panel in range(9):
  plt.subplot(3,3,panel+1)
  # Consecutive panels start n_prev rows apart, so their input windows do
  # not overlap. A separate name is used instead of reassigning the loop
  # variable.
  start = panel * n_prev
  y_pred = scaleY.inverse_transform(model.predict(train_x[start+offset].reshape(1,n_prev,-1))[0])
  # Dashed cyan: the n_prev input rows plus the next n_shown actual values
  plt.plot(np.arange(start,start+n_prev+n_shown),spx[start+offset:start+n_prev+n_shown+offset], 'c--')
  # Red: the first n_shown predicted values, drawn right after the input window
  plt.plot(np.arange(start+n_prev,start+n_prev+n_shown),y_pred[:n_shown],'r')

plt.show()





# Phase 2.5: Conv1D+LSTM model declaration

In [None]:
def phase2_5convlstm(n_timesteps=500, n_features=5, n_outputs=100):
  """Build, compile and summarise the Phase 2.5 Conv1D + LSTM regressor.

  A Conv1D layer extracts features from the input window, the result is
  flattened into one vector, RepeatVector copies it once per output step, an
  LSTM decodes the sequence and two TimeDistributed Dense layers map each
  step to one value.

  The sizes used to be read from global names, two of which (``n_timesteps``
  and ``n_outputs``) are never assigned in this notebook, so the function
  failed with NameError on a fresh kernel. They are now parameters.

  Args:
    n_timesteps: number of time steps in each input window.
    n_features: number of features per time step.
    n_outputs: number of future steps the model outputs.

  NOTE(review): the defaults mirror the Phase 2 settings (500 / 5 / 100);
  confirm these are the intended sizes for Phase 2.5.

  Returns:
    The Keras ``Sequential`` model compiled with MSE loss and Adam
    (its summary is printed first).
  """
  model = Sequential()
  model.add(Conv1D(32,3,activation='relu',input_shape=(n_timesteps, n_features)))
  # Collapse the convolution output to one vector so it can be repeated
  model.add(Flatten())
  model.add(RepeatVector(n_outputs))
  model.add(LSTM(50, activation='relu', return_sequences=True))
  model.add(TimeDistributed(Dense(100, activation='relu')))
  model.add(TimeDistributed(Dense(1)))
  model.compile(loss='mse', optimizer='adam')

  model.summary()

  return model

# Execute 2.5

In [None]:
# Builds the Phase 2.5 model, which prints its summary as a side effect.
# The returned model is not assigned to a name, so it is not trained here.
phase2_5convlstm()