## Import

In [1]:
import q_learner
import state_string_utils as stringutils
import state_enumerator as stateenum
import NAS 
import netparser
from tensorflow import keras
import numpy as np
import pandas as pd
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from datetime import datetime
import time
import importlib
import os
from sklearn.model_selection import train_test_split

## Loading Data

In [2]:
# Step between consecutive feature points inside one sample: number of days
feat_window = 90

# Performance (label) horizon: number of years
pm_window = 3
# Look-back span covered by one sample: three performance horizons, in days
lb_window = int(3 * pm_window * 365.25) + 1

# Offset between the starts of consecutive samples: number of days
sample_window = 30

# Test period start date
test_start_date = '2020-06-30'

# Filepath of dataset. The default is the original local path; set the
# ER_ARI_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
filepath = os.environ.get(
    'ER_ARI_DATA_PATH',
    '/Users/kz_ke/Documents/Masters/Classes/DL/WM-SecuritySelection-main/data/MF_LargeCap_ExcessReturn_3Y.parquet',
)

In [3]:
def prepare_data_for_er_ari(filepath, label_window=pm_window):
    '''Load the parquet table at `filepath` and build forward-looking labels.

    Cleaning steps, applied per column (ticker):
        1. Missing values are dropped;
        2. Columns left without any data are removed;
        3. Columns whose data span is not longer than `label_window` years
           are removed.
    For every remaining ticker, the label at a date is the ticker's own value
    `label_window` years later; dates without `label_window` years of data
    after them get no label.

    Args:
        filepath: path of the parquet file, one column per ticker.
        label_window: label horizon in years (defaults to `pm_window`).

    Returns:
        (data_dict, label_dict): cleaned series per ticker, and the label
        series per ticker indexed by the date the label belongs to.

    Raises:
        KeyError: if a date shifted by `label_window` years is not present in
            the ticker's index (the lookup is an exact index match).
    '''
    er_ari_df = pd.read_parquet(filepath)

    # Loop-invariant offset, built once from the parameter rather than the
    # module-level constant so that a caller-supplied horizon is honoured.
    horizon = relativedelta(years=label_window)

    data_dict = {ticker: er_ari_df[ticker].dropna() for ticker in er_ari_df.columns}
    tickers_to_remove = []

    label_dict = {}
    for ticker, series in tqdm(data_dict.items()):
        # After dropna() a column that held only missing values is empty.
        if series.empty:
            tickers_to_remove.append(ticker)
            continue

        # Last date that still has a value `label_window` years ahead.
        last_date = series.index[-1] - horizon
        if last_date <= series.index[0]:
            tickers_to_remove.append(ticker)
            continue

        index = series.loc[:last_date].index
        label_dict[ticker] = pd.Series(
            [series[date + horizon] for date in index],
            index=index,
        )

    for ticker in tickers_to_remove:
        del data_dict[ticker]

    return data_dict, label_dict

In [4]:
%%time
# Load the dataset and build the cleaned per-ticker series and their labels.
# This is the slow step of the notebook (several minutes in the recorded run).
data_dict, label_dict = prepare_data_for_er_ari(filepath)

100%|██████████████████████████████████████████████████████████████████████████████| 1330/1330 [03:53<00:00,  5.71it/s]

Wall time: 3min 55s





In [5]:
'''Looping through the dictionary of data, lets the data before checkpoint to be training data and the forward to be testing data. 
Gets list of data and labels according to lb_window and feat_window. '''
%%time

tickers = list(data_dict.keys())

train_data = []
train_labels = []

test_data = []
test_labels = []

# test start date
checkpoint = datetime.strptime(test_start_date, '%Y-%m-%d') - relativedelta(years=pm_window)

for ticker in tqdm(tickers):    
    label = label_dict[ticker]
    if label.shape[0] == 0:
        continue
    ts = data_dict[ticker].loc[:label.index[-1]]

    indices = [np.arange(i, i+lb_window, feat_window) for i in range(0, ts.shape[0] - lb_window + 1, sample_window)]
    
    temp_data = np.array([ts.iloc[sub_indices].values for sub_indices in indices])
    if temp_data.shape[0] == 0:
        continue
    temp_labels = np.array([label.loc[ts.index[sub_indices[-1]]] for sub_indices in indices])
    
    train_indices = [idx for idx in range(temp_data.shape[0]) if ts.index[indices[idx][-1]] <= checkpoint]
    test_indices = [idx for idx in range(temp_data.shape[0]) if ts.index[indices[idx][-1]] > checkpoint]
    
    train_data += [temp_data[train_indices]] 
    train_labels += [temp_labels[train_indices]]
    
    test_data += [temp_data[test_indices]] 
    test_labels += [temp_labels[test_indices]]



100%|█████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:09<00:00, 124.84it/s]

Wall time: 9.06 s





In [6]:
def reshape_input_data(x=None, y=None):
    '''Reshape feature and/or label arrays into the expected ranks.

    x: a 2-D array (n, f) becomes (n, 1, f); a 3-D array (n, a, b) is
       reshaped to (-1, b, a). Any other rank raises ValueError.
    y: a 1-D array (n,) becomes (n, 1); a 2-D array is returned unchanged.
       Any other rank raises ValueError.

    Returns None when neither argument is given, the single processed array
    when only one is given, and the tuple (x, y) when both are given.
    '''
    if x is not None:
        dims = x.shape
        if len(dims) not in (2, 3):
            raise ValueError('Invalid x shape: {}'.format(x.shape))
        # NOTE(review): the 3-D case uses reshape, which keeps the flat element
        # order; it is not an axis swap. Confirm this is the intended layout.
        target = (-1, 1, dims[1]) if len(dims) == 2 else (-1, dims[2], dims[1])
        x = x.reshape(target)

    if y is not None:
        y_rank = len(y.shape)
        if y_rank == 1:
            y = y.reshape(-1, 1)
        elif y_rank != 2:
            raise ValueError('Invalid y shape: {}'.format(y.shape))

    provided = [arr for arr in (x, y) if arr is not None]
    if not provided:
        return None
    if len(provided) == 1:
        return provided[0]
    return x, y

In [7]:
'''
Reshapes and gets x_train, x_test, y_train, y_test
Then uses the first 90% of train data to be the actual training dataset, and the final 10% to be the validation dataset.
'''
%%time
x_train = np.concatenate(train_data)[:, :, np.newaxis]
y_train = np.concatenate(train_labels)[:, np.newaxis]
x_test = np.concatenate(test_data)[:, :, np.newaxis]
y_test = np.concatenate(test_labels)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9)

Wall time: 9.99 ms


## Running NAS

In [11]:
# Initialize the log directory.
# NOTE(review): the directory is created at an absolute path, while the NAS
# factory is given the relative name 'mylogs'; the two only match when the
# notebook runs from the AutoML3 folder - confirm.
path = '/Users/kz_ke/Documents/Masters/Classes/DL/AutoML3/mylogs'
isExist = os.path.exists(path)
if not isExist:
    # exist_ok avoids a failure if the directory appears between the check
    # above and this call.
    os.makedirs(path, exist_ok=True)
    print("The new directory is created!")

In [46]:
'''Import necessary parameters and the module that generates networks'''
# __import__ with a non-empty fromlist returns the `models.AutoML` package
# itself (not the top-level `models` package) and makes the listed names
# available as attributes of it, importing them as submodules if needed.
_model = __import__('models.AutoML',
                    globals(),
                    locals(),
                    ['state_space_parameters', 'hyper_parameters'], 
                    0)

# Build the architecture-search factory from the search-space and
# hyper-parameter definitions loaded above; 'mylogs' is a relative name.
# NOTE(review): the meaning of the trailing positional arguments (1, 0.5) is
# not visible in this notebook - confirm against the NAS.NAS signature.
factory = NAS.NAS('mylogs',
                  _model.state_space_parameters,
                  _model.hyper_parameters,
                  1,
                  0.5)

In [65]:
# At each iteration: sample an architecture, train it, measure loss and RMSE on
# the validation set, then hand the result back to the factory, which stores it
# in the replay database for future use.
# NOTE(review): with epochs=1 the EarlyStopping callback (patience=3) can never
# trigger; raise `epochs` if stopping on a stalled 'val_loss' is wanted.
# NOTE(review): the score handed to the factory is an RMSE (lower is better)
# although the factory logs it as "acc" - confirm how NAS interprets it.
for runtimes in range(50):
    # Every iteration builds a brand-new model. Clearing Keras' global state
    # first keeps memory from growing across the 50 candidate networks.
    keras.backend.clear_session()

    net, i = factory.generate_new_network()
    print(net)
    p = netparser.parse('net', net)
    newnet = netparser.parse_network_structure(p)
    model = keras.Sequential(newnet)

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss='mean_squared_error',
        metrics=[keras.metrics.RootMeanSquaredError()]
    )

    history = model.fit(x_train, y_train, batch_size=40, epochs=1, callbacks=[callback], validation_data=(x_val, y_val))

    # evaluate() returns [loss, rmse]; index 1 is the RMSE metric
    bestval = model.evaluate(x_val, y_val)[1]

    factory.incorporate_trained_net(net, bestval, 1, [i])

[RNN(30,sigmoid), TERMINATE]
Incorporated net, acc: 0.069538, net: [RNN(30,sigmoid), TERMINATE]
[LSTM(30,sigmoid), D(0.300000), LSTM(20,leaky_relu), FC(30, tanh), D(0.400000), TERMINATE]
Incorporated net, acc: 0.072472, net: [LSTM(30,sigmoid), D(0.300000), LSTM(20,leaky_relu), FC(30, tanh), D(0.400000), TERMINATE]
[RNN(30,leaky_relu), LSTM(30,tanh), TERMINATE]
Incorporated net, acc: 0.065860, net: [RNN(30,leaky_relu), LSTM(30,tanh), TERMINATE]
[LSTM(20,relu), FC(30, relu), D(0.300000), TERMINATE]
Incorporated net, acc: 0.066874, net: [LSTM(20,relu), FC(30, relu), D(0.300000), TERMINATE]
[GRU(20,tanh), D(0.300000), LSTM(30,leaky_relu), FC(10, tanh), D(0.500000), TERMINATE]
Incorporated net, acc: 0.067618, net: [GRU(20,tanh), D(0.300000), LSTM(30,leaky_relu), FC(10, tanh), D(0.500000), TERMINATE]
[LSTM(10,tanh), LSTM(30,sigmoid), LSTM(30,leaky_relu), BILSTM(30,relu), TERMINATE]
Incorporated net, acc: 0.069860, net: [LSTM(10,tanh), LSTM(30,sigmoid), LSTM(30,leaky_relu), BILSTM(30,relu), T

Incorporated net, acc: 0.070916, net: [GRU(10,sigmoid), FC(10, leaky_relu), TERMINATE]
[LSTM(10,leaky_relu), D(0.400000), TERMINATE]
Incorporated net, acc: 0.069819, net: [LSTM(10,leaky_relu), D(0.400000), TERMINATE]
[BILSTM(10,leaky_relu), TERMINATE]
Incorporated net, acc: 0.069156, net: [BILSTM(10,leaky_relu), TERMINATE]
[BILSTM(30,leaky_relu), TERMINATE]
Incorporated net, acc: 0.065419, net: [BILSTM(30,leaky_relu), TERMINATE]
[LSTM(10,leaky_relu), RNN(10,relu), BILSTM(30,relu), RNN(10,leaky_relu), LSTM(30,sigmoid), TERMINATE]
Incorporated net, acc: 0.067539, net: [LSTM(10,leaky_relu), RNN(10,relu), BILSTM(30,relu), RNN(10,leaky_relu), LSTM(30,sigmoid), TERMINATE]
[GRU(10,relu), D(0.400000), TERMINATE]
Incorporated net, acc: 0.069514, net: [GRU(10,relu), D(0.400000), TERMINATE]
[BILSTM(30,leaky_relu), GRU(10,relu), LSTM(20,leaky_relu), LSTM(30,leaky_relu), TERMINATE]
Incorporated net, acc: 0.066816, net: [BILSTM(30,leaky_relu), GRU(10,relu), LSTM(20,leaky_relu), LSTM(30,leaky_relu), 

Incorporated net, acc: 0.071983, net: [GRU(10,sigmoid), RNN(30,leaky_relu), RNN(10,leaky_relu), BILSTM(30,relu), GRU(10,leaky_relu), TERMINATE]
[LSTM(30,sigmoid), BILSTM(30,tanh), TERMINATE]
Incorporated net, acc: 0.071564, net: [LSTM(30,sigmoid), BILSTM(30,tanh), TERMINATE]
[LSTM(30,leaky_relu), TERMINATE]
Incorporated net, acc: 0.066624, net: [LSTM(30,leaky_relu), TERMINATE]
[GRU(30,leaky_relu), TERMINATE]
Incorporated net, acc: 0.066534, net: [GRU(30,leaky_relu), TERMINATE]
[BILSTM(10,tanh), FC(30, leaky_relu), TERMINATE]
Incorporated net, acc: 0.066159, net: [BILSTM(10,tanh), FC(30, leaky_relu), TERMINATE]
[LSTM(10,tanh), D(0.100000), LSTM(30,leaky_relu), D(0.400000), GRU(10,sigmoid), D(0.200000), TERMINATE]
Incorporated net, acc: 0.074283, net: [LSTM(10,tanh), D(0.100000), LSTM(30,leaky_relu), D(0.400000), GRU(10,sigmoid), D(0.200000), TERMINATE]
[LSTM(10,leaky_relu), BILSTM(30,leaky_relu), GRU(30,relu), TERMINATE]
Incorporated net, acc: 0.067299, net: [LSTM(10,leaky_relu), BILSTM

## Step by Step Running

In [92]:
# Ask the factory for one candidate architecture; `i` is the second value it
# returns alongside the network description.
net, i = factory.generate_new_network()
print(net)

[RNN(100,sigmoid), GRU(100,tanh), LSTM(200,linear), FC(50, relu), TERMINATE]


In [121]:
# Parse the sampled network description, then convert the parsed structure
# into the list of layer objects used to build the model.
p= netparser.parse('net', net)
newnet = netparser.parse_network_structure(p)

In [122]:
# Display the generated layer objects
newnet

[<keras.layers.recurrent.SimpleRNN at 0x201f6eb2a90>,
 <keras.layers.recurrent_v2.GRU at 0x201f6ebef40>,
 <keras.layers.recurrent_v2.LSTM at 0x201ed672310>,
 <keras.layers.core.Dense at 0x201ed6726d0>,
 <keras.layers.core.Dense at 0x20221112ac0>]

In [128]:
# Stack the generated layers into a Sequential model
model = keras.Sequential(newnet)

In [213]:
# Stop once 'val_loss' has not improved for 3 epochs and restore the best weights
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights = True)

In [129]:
# Compile for regression: MSE loss, RMSE reported as the metric.
# `learning_rate` is the supported keyword (the `lr` alias is deprecated) and
# matches the compile call used in the search loop.
model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-4),
            loss='mean_squared_error',
            metrics=[keras.metrics.RootMeanSquaredError()]
        )

In [214]:
# Train for up to 5 epochs, monitoring the validation set for early stopping
model.fit(x_train, y_train, batch_size = 40, epochs =5, callbacks=[callback], validation_data=(x_val, y_val))



<keras.callbacks.History at 0x201e8f3e640>

In [182]:
# Score the network on the validation set, as the search loop does. The value
# is fed back into the architecture search, so the test set must not be used
# here. evaluate() returns [loss, rmse]; index 1 is the RMSE.
bestval = model.evaluate(x_val, y_val)[1]



In [61]:
# Report the trained network back to the factory. Pass the `i` returned by
# generate_new_network() for this network (as the search loop does) rather
# than a hard-coded 1.
factory.incorporate_trained_net(net, bestval, 1, [i])

Incorporated net, acc: 0.069071, net: [GRU(100,linear), BILSTM(150,relu), FC(50, leaky_relu), FC(1, leaky_relu)]


In [62]:
# Display the replay database kept by the factory (one row per incorporated network)
factory.replay_dictionary

Unnamed: 0,net,accuracy_best_val,epsilon,iteration
0,"[GRU(100,linear), BILSTM(150,relu), FC(50, lea...",0.069071,1,1
