In [None]:
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, UpSampling1D, BatchNormalization, LSTM, RepeatVector
from keras.models import Sequential, Model
from keras.models import model_from_json
from tensorflow import keras
from keras import regularizers
import datetime
import time
import requests as req
import json
import pandas as pd
import pickle
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow.compat.v1
from tensorflow.keras.losses import MeanSquaredError
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
%pylab inline

In [None]:
## USE: full log(returns)/returns dataframe
## Risk Adjusted Returns

# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles produced by this project.
# .iloc[1:] skips the first row of each frame — presumably the NaN row left by
# computing returns from prices; confirm against the data pipeline.
df = pd.read_pickle("../Data/risk_adj_returns.pkl").iloc[1:]
df1 = pd.read_pickle("../Data/returns.pkl").iloc[1:]

# Columns without a single observation carry no information. Build a new frame
# instead of dropping in place: an in-place drop on an .iloc slice can trigger
# SettingWithCopyWarning and hides the mutation from the reader.
drop_columns = df1.columns[df1.isnull().all()].tolist()
df1 = df1.drop(columns=drop_columns)

# df['pct_change'] = df.close.pct_change()
# df['log_ret'] = np.log(df.close) - np.log(df.close.shift(1))
# df = df.dropna()

In [None]:
#df = df.dropna(how='any',axis=0) #All rows have NaN

In [None]:
# Preview the last five rows of the returns frame.
df1.tail()

In [None]:
def get_investable(t, n_rows, returns=None):
    """Find stocks in the investable universe at time ``t``.

    A stock is investable when it has a value recorded on every row of the
    look-back window (stocks in the S&P500 that have prices recorded for the
    last ``n_rows`` days).

    Parameters
    ----------
    t : pandas.Timestamp
        Reference date. It is advanced by one calendar day and then forward to
        the next date present in the index; the window consists of the rows
        strictly before that date.
    n_rows : int
        Maximum number of rows in the look-back window.
    returns : pandas.DataFrame, optional
        Date-indexed frame with one column per stock. Defaults to the
        notebook-level ``df1``.

    Returns
    -------
    pandas.DataFrame
        The window sorted newest-first (at most ``n_rows`` rows), restricted to
        the columns that contain no missing value inside the window.

    Raises
    ------
    ValueError
        If the frame is empty or no date after ``t`` exists in the index.
        Without this check the date search would loop forever.
    """
    source = df1 if returns is None else returns
    # sort_index returns a new frame, so the caller's data is never modified
    history = source.sort_index(ascending=False)
    if history.empty:
        raise ValueError("returns frame is empty")
    last_date = history.index.max()

    # add 1 date to get the test features in investable
    t = t + pd.DateOffset(1)

    # if t is now a non-trading day, advance until we reach a valid trading day
    while t not in history.index:
        if t > last_date:
            raise ValueError(
                "no trading day after the requested date (data ends at %s)" % last_date
            )
        t = t + pd.DateOffset(1)

    t_index = history.index.get_loc(t)

    # rows are newest-first, so the n_rows rows after position t_index are the
    # n_rows most recent dates strictly before t
    window = history.iloc[t_index + 1:t_index + n_rows + 1]

    # keep only the stocks with a complete record inside the window
    complete = window.notna().all()
    return window.loc[:, complete]

In [None]:
# Restrict df1 to (at most) the 500 rows up to and including 2018-05-11,
# keeping only the columns with no missing value in that window. The result is
# ordered newest-first, so head() shows the most recent rows.
# NOTE(review): get_investable reads the notebook-level df1 and this cell
# rebinds that same name, so the cell is not idempotent — re-run the data
# loading cell before running it a second time.
df1 = get_investable(pd.to_datetime('2018-05-11'),500)
df1.head()

In [None]:
# Ordered 80/20 split without shuffling: the leading 80% of the rows become
# `train`, the trailing 20% become `test`.
# NOTE(review): df1 comes back from get_investable sorted newest-first, so the
# "leading" rows are presumably the most recent dates — confirm that training
# on the later period and testing on the earlier one is intended.
tts = train_test_split(df1, shuffle=False, test_size=0.2)
train, test = tts

In [None]:
# Last five rows of the training partition.
train.tail()

In [None]:
# Last five rows of the test partition.
test.tail()

In [None]:
# Feature Scaling
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(train)
test_set_scaled = sc.fit_transform(test)
pd.DataFrame(training_set_scaled).head()

## 1. Simple multi-layer perceptron (MLP) autoencoder

In [None]:
# Each row of the scaled training matrix is one sample; its width (the number
# of columns that survived the investable-universe filter) is the input size.
# The inputs were scaled column-wise with MinMaxScaler in the feature-scaling cell.

window_length = training_set_scaled.shape[1]
encoding_dim = 20
epochs = 500

# compress the input to an encoding_dim-dimensional latent space.

# input placeholder
input_window = Input(shape=(window_length,))
# encoded representation of the input
encoded = Dense(encoding_dim, activation='tanh')(input_window) #tanh, linear, leakyrelu
# lossy reconstruction of the input
decoded = Dense(window_length, activation='linear')(encoded) #linear

# model mapping an input to its reconstruction
simple_autoencoder = Model(input_window, decoded)

# model mapping an input to its encoded representation
encoder = Model(input_window, encoded)

# Stop once val_loss has not improved by 1e-5 for 5 epochs and roll the model
# back to the best epoch; otherwise predict() below would use the weights of
# the last (already degraded) epoch while the best ones only sit on disk.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=5, mode='auto', verbose = 1,
                        restore_best_weights=True)
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5",save_best_only=True)

simple_autoencoder.summary()
# compile() configures the model in place and returns None, so its result is not kept.
# 'acc' is not a meaningful metric for a reconstruction task, but plot_history
# reads the "val_acc" key, so it stays in the metric list.
simple_autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc','mae']) #MSE
# The autoencoder is trained to reproduce its own input; Keras holds out the
# last 20% of the training rows for validation.
history = simple_autoencoder.fit(training_set_scaled, training_set_scaled,
                epochs=epochs,
                batch_size=1024,
                shuffle=True,
                validation_split = 0.2,
                callbacks = [monitor, checkpointer])       
#               validation_data=(test_set_scaled, test_set_scaled))

decoded_stocks = simple_autoencoder.predict(test_set_scaled)

In [None]:
# Encoder output (latent representation) for the first five test rows.
pd.DataFrame(encoder.predict(test_set_scaled)).head()

In [None]:
# Autoencoder reconstruction for the first five test rows.
pd.DataFrame(decoded_stocks).head()

In [None]:
def plot_history(history):
    """Plot the per-epoch curves recorded by a Keras ``fit`` call.

    Draws four side-by-side panels: training loss, validation loss,
    validation accuracy and validation mean absolute error.

    Parameters
    ----------
    history : object with a ``history`` dict
        The value returned by ``Model.fit``. The dict must contain the keys
        "loss", "val_loss", "val_acc" and "val_mae", i.e. the model was
        compiled with ``metrics=['acc', 'mae']`` and trained with validation
        data.

    Raises
    ------
    KeyError
        If one of the four keys is missing from ``history.history``.
    """
    # The val_* series come from the validation split, not from the test set,
    # so the panels are labelled "Validation ...".
    panels = [
        ("loss", "Train loss"),
        ("val_loss", "Validation loss"),
        ("val_acc", "Validation accuracy"),
        ("val_mae", "Validation mean absolute error"),
    ]
    fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))
    for ax, (key, title) in zip(axes, panels):
        ax.plot(history.history[key])
        ax.set_title(title)
        ax.set_xlabel("Epoch")
    fig.tight_layout()

In [None]:
# Loss and metric curves vs. epoch for the most recent fit stored in `history`.
plot_history(history)

### 2. 1D convolutional autoencoder
(Kernel size = 3)

In [None]:
# main “event” very well represented while the overall reconstruction is very smooth 

# Sequence lengths below are written in terms of L = window_length and
# b = ceil(ceil(L / 2) / 2); for L = 10 they are 10 -> 5 -> 3 -> 6 -> 5 -> 10.
# NOTE(review): the decoder output has length 4*b - 2, which equals L only when
# L % 4 == 2 — confirm this holds for the current number of columns.
# NOTE(review): the arrays passed to fit/predict are 2-D (samples, L) while the
# model has a trailing channel axis of size 1; this relies on Keras adding /
# squeezing that axis implicitly — confirm on the installed version.
input_window = Input(shape=(window_length,1))
x = Conv1D(16, 3, activation="tanh", padding="same")(input_window) # L
#x = BatchNormalization()(x)
x = MaxPooling1D(2, padding="same")(x) # ceil(L / 2)
x = Conv1D(1, 3, activation="tanh", padding="same")(x) # ceil(L / 2)
#x = BatchNormalization()(x)
encoded = MaxPooling1D(2, padding="same")(x) # b
encoder = Model(input_window, encoded)

# b time steps with a single channel in the encoded layer

x = Conv1D(1, 3, activation="tanh", padding="same")(encoded) # b
#x = BatchNormalization()(x)
x = UpSampling1D(2)(x) # 2b
x = Conv1D(16, 2, activation='tanh')(x) # 2b - 1 (default "valid" padding drops one step)
#x = BatchNormalization()(x)
x = UpSampling1D(2)(x) # 4b - 2
decoded = Conv1D(1, 3, activation='linear', padding='same')(x) # 4b - 2
conv_autoencoder = Model(input_window, decoded)
conv_autoencoder.summary()

conv_autoencoder.compile(optimizer='adam', loss='mean_squared_error',metrics=['acc','mae'])
history = conv_autoencoder.fit(training_set_scaled, training_set_scaled,
                epochs=epochs,
                batch_size=1024,
                shuffle=True,
                validation_split = 0.2)

# Predict on the scaled test matrix: the network was trained on scaled inputs,
# so feeding it the raw `test` DataFrame would put the inputs on a different scale.
decoded_stocks = conv_autoencoder.predict(test_set_scaled)

In [None]:
# Training curves for the most recent fit stored in `history`.
plot_history(history)

In [None]:
#pd.DataFrame(encoder.predict(test_set_scaled))

## 3. Multiple Layers w/ L1 Regularization

In [None]:
import tensorflow as tf

def model(optimizer = "Adam", score = "acc", input_dim = None):
    """Build and compile the multi-layer autoencoder with L1 activity regularization.

    Encoder: Dense 100 -> 50 -> 25 -> 12 (tanh, L1 activity penalty) -> 6 (relu).
    Decoder: Dense 12 -> 25 -> 50 -> 100 -> input_dim, all linear.

    Parameters
    ----------
    optimizer : str or keras optimizer
        Passed unchanged to ``compile``.
    score : str
        Currently unused; it is accepted so that ``param_grid`` can pass it.
    input_dim : int, optional
        Number of features per sample. Defaults to the number of columns of
        ``training_set_scaled``, the matrix the model is fitted on. The raw
        ``df`` frame used previously is a different frame and need not have the
        same number of columns.

    Returns
    -------
    keras.models.Model
        The compiled model (mean squared error loss, metrics 'acc' and 'mae').
    """
    if input_dim is None:
        input_dim = training_set_scaled.shape[1]

    #Input Layer
    input_layer = Input(shape =(input_dim, ))
  
    #Encode
    # 10e-5 is 1e-4: the weight of the L1 penalty on each layer's activations
    encoded = Dense(100, activation ='tanh',
                    activity_regularizer = regularizers.l1(10e-5))(input_layer)
    encoded = Dense(50, activation ='tanh',
                    activity_regularizer = regularizers.l1(10e-5))(encoded)
    encoded = Dense(25, activation ='tanh',
                    activity_regularizer = regularizers.l1(10e-5))(encoded)
    encoded = Dense(12, activation ='tanh',
                    activity_regularizer = regularizers.l1(10e-5))(encoded)
    # 6-unit bottleneck
    encoded = Dense(6, activation ='relu')(encoded)

    #Decoder
    decoded = Dense(12, activation ='linear')(encoded)
    decoded = Dense(25, activation ='linear')(decoded)
    decoded = Dense(50, activation ='linear')(decoded)
    decoded = Dense(100, activation ='linear')(decoded)

    #Output
    output_layer = Dense(input_dim, activation ='linear')(decoded)

    #Parameters
    autoencoder = Model(input_layer, output_layer)
    autoencoder.compile(optimizer =optimizer, loss =tf.keras.losses.MeanSquaredError(), metrics = ['acc','mae'])
    
    return autoencoder

In [None]:
# Training the Auto-encoder network
output = model().fit(training_set_scaled, training_set_scaled,
                batch_size = 16, epochs = 100, 
                shuffle = True, validation_split = 0.2)
output

In [None]:
# Training curves of the multi-layer autoencoder fit stored in `output`.
plot_history(output)

## Hyper-parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

# Wrap the multi-layer autoencoder builder so scikit-learn can construct and
# fit a fresh model for every parameter combination.
model_regressor = KerasRegressor(build_fn=model, verbose=1, batch_size=10, epochs=10)
#define the grid search parameters
#dimensions = []
#dropout = []
batch_size = [10,20]
loss = ['mean_squared_error']
optimizer = ['Adam', 'SGD', 'RMSprop']
# Named epochs_grid so the integer `epochs` used by the earlier fit() cells is
# not overwritten with a list when this cell runs.
epochs_grid = [10, 15]
scoring = ['acc']

# Only optimizer and score are searched so far; the other lists are candidates
# that still need matching parameters in model().
param_grid = dict(optimizer=optimizer,score = scoring)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=5, mode='auto')
# NOTE(review): ".hdf6" is not an HDF5 extension Keras recognises, so the
# checkpoint is presumably written in another format — confirm the intended name.
checkpointer = ModelCheckpoint(filepath="best_weights.hdf6",save_best_only=True)

grid = GridSearchCV(estimator=model_regressor, param_grid=param_grid, 
                    n_jobs=1, cv = 3)

# An autoencoder is trained to reproduce its input, so X and y are the same
# scaled training matrix (the same data every other fit call in this notebook
# uses). validation_split makes val_loss available to both callbacks.
grid_result = grid.fit(training_set_scaled, training_set_scaled,
                       validation_split=0.2, callbacks=[monitor,checkpointer])

grid_best_parameters = grid_result.best_params_
# NOTE(review): best_score_ is the mean cross-validated score of the wrapped
# regressor, presumably a negated loss rather than an accuracy — confirm.
grid_best_accuracy = grid_result.best_score_

In [None]:
# Parameter combination selected by the grid search.
grid_best_parameters

In [None]:
# Best mean cross-validated score reported by the grid search (best_score_).
grid_best_accuracy

In [None]:
# Randomised search over the same parameter space as the grid search.
# NOTE(review): param_grid holds only a handful of combinations, so n_iter=100
# presumably exceeds the size of the search space — confirm how the installed
# scikit-learn version handles that.
random_search = RandomizedSearchCV(estimator=model_regressor, param_distributions=param_grid, n_iter=100)
random_search.fit(training_set_scaled, training_set_scaled)

# Read the results from random_search itself; taking them from grid_result
# would merely repeat the grid-search outcome.
random_best_parameters = random_search.best_params_
random_best_accuracy = random_search.best_score_

In [None]:
# Display the parameters stored in random_best_parameters by the previous cell.
random_best_parameters

In [None]:
# Display the score stored in random_best_accuracy by the previous cell.
random_best_accuracy

## Notes

- The code is working now. There are three different autoencoders here: a simple one, a convolutional one, and one with multiple layers and regularization.
- Hyperparameter tuning was performed on the last autoencoder model (the one with multiple layers).
- The only thing left to do is to add more parameters to `model()` and `param_grid`. So far, only `optimizer` and `score` have been tried.
- Make those changes here:
    - `def model(optimizer = "Adam", score = "acc", epoch = '', batch_size= '',....)`
    - `param_grid = dict(optimizer=optimizer,score = scoring, epoch = ....)`
- Parameters we still need to add: number of dimensions of our dataframe, loss function, epochs, batch_size, dropout.