### Rerun the models with model structure enhancement

In [1]:
import sys
sys.path.append("..")

from src.model_training_utils import RNNDataFeeder, RnnAEDataFeeder, ModelConfig, read_data, load_data_columns_config
from src.train_models import setup_gpu, fit_models_with_cross_validation, fit_models_with_cross_validation_v2, update_pretrain_filter, train_NN
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
import tensorflow as tf
import numpy as np
import pickle
from tqdm import tqdm

# Set up the GPU for this notebook session (setup_gpu is imported from src.train_models).
setup_gpu()

GPU devices are already configured, skipping setup.


### [1]. Preparation
Prepare the parameters for the model building and training

In [2]:
# Step 1: Load the pre-training dataset.
# Folder that holds the datasets read by this notebook.
folder_path = "F:/predictors_v2"
# Read the table, then order its rows by date and ISIN and renumber the index
# from zero (the data feeders used below require rows sorted by ["date", "isin"]).
pre_train_dataset = read_data(
    filename="pre_train_dataset",
    folder_path=folder_path,
).sort_values(by=["date", "isin"], ignore_index=True)

In [49]:
# Step 3: set parameters for models 
# predictors_size: the number of predictors
# win_size (for RNN models, win_size>=1): the window size of each data point

def create_lstm_autoencoder(win_size, predictors_size, latent_dim=50):
    """Build an LSTM sequence autoencoder.

    An encoder LSTM compresses each input window into a single vector of
    size ``latent_dim``. That vector is repeated ``win_size`` times and fed
    to a decoder LSTM, whose per-time-step outputs go through a dense layer
    to reconstruct the input window.

    Args:
        win_size: number of time steps in each input window.
        predictors_size: number of predictors per time step.
        latent_dim: number of units of the encoder LSTM (bottleneck size).

    Returns:
        An uncompiled ``tf.keras.Model`` whose input and output both have
        per-sample shape ``(win_size, predictors_size)``.
    """
    sequence_in = tf.keras.layers.Input(shape=(win_size, predictors_size))

    # LSTM encoder (bottleneck): only the last state is kept, and the
    # recurrent weights carry an L1 penalty.
    encoder_lstm = tf.keras.layers.LSTM(
        latent_dim,
        recurrent_regularizer=tf.keras.regularizers.l1(0.0001),
        return_sequences=False,
    )
    latent = encoder_lstm(sequence_in)

    # The decoder needs one input per time step, so the latent vector is
    # repeated to the original sequence length.
    repeated_latent = tf.keras.layers.RepeatVector(win_size)(latent)

    # LSTM decoder that emits a value for every time step.
    decoder_lstm = tf.keras.layers.LSTM(predictors_size, return_sequences=True)
    decoded_sequence = decoder_lstm(repeated_latent)

    # The same dense projection is applied independently at each time step.
    step_projection = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(predictors_size)
    )
    reconstruction = step_projection(decoded_sequence)

    return tf.keras.Model(inputs=sequence_in, outputs=reconstruction)


def create_rnn_model(encoder, win_size, other_input_size):
    """Build the prediction model on top of an encoder (transfer learning version).

    With ``other_input_size == 0`` the model reuses the encoder's own input
    and feeds the encoder output straight into the prediction head. With
    ``other_input_size > 0`` a new input is created whose last axis holds the
    extra predictors first and the encoder's predictors last; the extra
    predictors are concatenated with the encoder output before the head.

    Args:
        encoder: Keras model used as the feature extractor.
            NOTE(review): its output is fed to an LSTM (and concatenated
            with a per-time-step tensor), so it presumably returns a
            sequence -- confirm against the caller.
        win_size: number of time steps per window; only used when
            ``other_input_size > 0``.
        other_input_size: number of extra predictors placed before the
            encoder's predictors on the last axis.

    Returns:
        An uncompiled ``tf.keras.Model`` with one output unit per sample.
    """
    # Default wiring: no extra predictors, the encoder's graph is used as is.
    model_input = encoder.input
    head_features = encoder.output

    if other_input_size > 0:
        model_input = tf.keras.layers.Input(
            shape=(win_size, other_input_size + encoder.input_shape[-1]),
            name='full_input',
        )
        # Leading columns: the extra predictors.
        take_other = tf.keras.layers.Lambda(lambda x: x[:, :, :other_input_size])
        other_part = take_other(model_input)
        # Trailing columns: the predictors the encoder was built for.
        take_encoder = tf.keras.layers.Lambda(lambda x: x[:, :, other_input_size:])
        encoder_part = take_encoder(model_input)
        encoded = encoder(encoder_part)
        head_features = tf.keras.layers.Concatenate()([other_part, encoded])

    # Prediction head: LSTM(32) -> Dense(16, relu) -> Dense(8, relu) -> Dense(1).
    recurrent = tf.keras.layers.LSTM(
        32,
        kernel_regularizer=None,
        recurrent_regularizer=None,
    )
    hidden = recurrent(head_features)
    for units in (16, 8):
        hidden = tf.keras.layers.Dense(units, activation='relu')(hidden)
    prediction = tf.keras.layers.Dense(1)(hidden)

    return tf.keras.Model(inputs=model_input, outputs=prediction)

def create_rnn_model_normal(win_size, predictors_size):
    """Build the plain LSTM prediction model (no encoder).

    Args:
        win_size: number of time steps in each input window.
        predictors_size: number of predictors per time step.

    Returns:
        An uncompiled ``tf.keras.Model``: LSTM(32) -> Dense(16, relu) ->
        Dense(8, relu) -> Dense(1).
    """
    window_in = tf.keras.layers.Input(shape=(win_size, predictors_size))

    recurrent = tf.keras.layers.LSTM(
        32,
        kernel_regularizer=None,
        recurrent_regularizer=None,
    )
    hidden = recurrent(window_in)

    # Two shrinking ReLU layers followed by a single linear output unit.
    for units in (16, 8):
        hidden = tf.keras.layers.Dense(units, activation='relu')(hidden)
    prediction = tf.keras.layers.Dense(1)(hidden)

    return tf.keras.Model(inputs=window_in, outputs=prediction)

In [4]:
# Load the column configuration (version 2) and list the groups it defines.
config_dict = load_data_columns_config(version=2)
print("config_dict has keys: ", config_dict.keys())

# Read the final dataset from the same folder as the pre-training data and
# order its rows by date and ISIN, renumbering the index from zero.
final_dataset = read_data(
    filename="final_dataset",
    folder_path=folder_path,
).sort_values(by=["date", "isin"], ignore_index=True)


# Keys of config_dict that hold the factor column groups.
factors_columns = [
    'tech_factors',
    'calendar_factors',
    'fundamental_factors',
    'industry_factors',
    'release_schedule_factors',
]

config_dict has keys:  dict_keys(['index_columns', 'tech_factors', 'calendar_factors', 'fundamental_factors', 'industry_factors', 'agg_industry_factors', 'release_schedule_factors', 'output_columns'])


In [5]:
# Step 2: Split train, validation (create cross validation spliter) and test datasets
# Distinct dates of final_dataset in ascending order.
dates_list = np.sort(final_dataset["date"].unique())
num_of_days = len(dates_list)

step = 60  # step in window movement
h = 60  # time horizon for validation dataset
# 60% of the dates are used for training and validation.
trainval_test_threshold = int(num_of_days * 0.6)
# The window size of the 1st train dataset: one third of that span.
initial_threshold = int(trainval_test_threshold / 3)
# Shrink the train/validation span so that what follows the first train window
# plus one validation horizon is a whole number of steps.
trainval_test_threshold = initial_threshold + h + step * (
    (trainval_test_threshold - initial_threshold - h) // step
)

# Dates used for training and validation in models.
train_dates = dates_list[:trainval_test_threshold]
# Dates used for testing (out-of-sample datasets).
test_dates = dates_list[trainval_test_threshold:]

# Boolean row mask of final_dataset selecting the test dates (both ends
# inclusive); it is an input for model training.
test_filter = final_dataset["date"].between(test_dates[0], test_dates[-1])

# Cross validation spliter with a sliding window (non-cumulative datasets).
cv_spliter = SlidingWindowForecastCV(h=h, step=step, window_size=initial_threshold)

In [6]:
# Keys of config_dict that hold the factor column groups.
factors_columns = [
    'tech_factors',
    'calendar_factors',
    'fundamental_factors',
    'industry_factors',
    'release_schedule_factors',
]

### [2]. Train models
##### 1. Fit models with both predictor and model structure enhancement

In [None]:
# Predictor columns of the pre-training dataset (everything after "date" and "isin").
input_columns = list(pre_train_dataset.columns[2:])
for num in [3, 4, 5]:
    # Data feeder of the RNN autoencoder. Requirements on data_df:
    #   - its first two columns are ["date", "isin"];
    #   - it is sorted by ["date", "isin"];
    #   - window_size >= 1 is supported.
    ae_data_feeder = RnnAEDataFeeder(
        data_df=pre_train_dataset,
        window_size=10,
        batch_size=1024,
        predictors_size=len(input_columns),
        predictors_dates=pre_train_dataset['date'],
    )

    # Column layout: index columns, the factor groups selected by `num`, the
    # autoencoder inputs, and finally the response variable.
    #   num == 3: tech + calendar
    #   num == 4: tech + calendar + industry
    #   otherwise: tech + calendar + industry + release schedule
    data_columns = ["date", "isin"] + config_dict['tech_factors'] + config_dict['calendar_factors']
    if num != 3:
        data_columns = data_columns + config_dict['industry_factors']
    if num not in (3, 4):
        data_columns = data_columns + config_dict['release_schedule_factors']
    data_columns = data_columns + input_columns + ["log_adj_volume"]

    # Data feeder of the RNN model. Requirements on data_df:
    #   - its first two columns are ["date", "isin"];
    #   - it is sorted by ["date", "isin"];
    #   - its last column is the response variable;
    #   - only a response variable of shape (1, 1) is supported;
    #   - window_size >= 1 is supported.
    # predictors_size leaves out the two index columns and the response column.
    data_feeder = RNNDataFeeder(
        data_df=final_dataset[data_columns],
        window_size=10,
        batch_size=1024,
        predictors_size=len(data_columns) - 3,
        predictors_dates=final_dataset['date'],
    )

    # Model configuration:
    #   - model_name: the name of the model;
    #   - model_structure: the function that generates the model structure;
    #   - verbose: verbosity during model training;
    #   - lr: learning rate.
    # Author's note on both names below: rerun using v6 tag.
    model_config = ModelConfig(
        model_name=f"lstm_l1_{num}_ae_v4",
        model_structure=create_rnn_model,
        verbose=0,
        lr=0.001,
    )
    # Configuration of the pre-trained autoencoder.
    ae_model_config = ModelConfig(
        model_name=f"ae_lstm_l1_{num}_ae_v4",
        model_structure=create_lstm_autoencoder,
        verbose=0,
        lr=0.001,
        encoder_trainable=True,
    )

    tf.random.set_seed(1234)
    # Train the models with cross validation, early stopping and learning rate reducer.
    # Author's recorded runtime: 4 hs 25 mins.
    train_metrics_dict, test_metrics, ae_metrics = fit_models_with_cross_validation_v2(
        data_feeder=data_feeder,
        ae_data_feeder=ae_data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values,  # test_filter should be a numpy array
        model_config=model_config,
        ae_model_config=ae_model_config,
        model_name=f"lstm_l1_v4_ae_{num}",  # author's note: rerun using v6 tag
    )
    # Drop the per-run results and feeders before the next configuration.
    del train_metrics_dict, test_metrics, ae_metrics
    del ae_data_feeder, data_feeder

##### 2. Fit models to test...
(1). trainable vs. non-trainable encoder <br>
(2). other combinations of factor sets

In [7]:
# lstm_l1_3_ae_test2_v6: tech+cal+fun/ encoder trainable
# lstm_l1_3_ae_test3_v6: tech+fun/ encoder non-trainable
# lstm_l1_3_ae_test4_v6: tech+fun/ encoder trainable
# lstm_l1_3_ae_test5_v6: tech+cal+fun+release/ encoder trainable

In [10]:
num = 3

for test_num in [2, 3, 4, 5]:
    # Predictor columns of the pre-training dataset (everything after "date" and "isin").
    input_columns = list(pre_train_dataset.columns[2:])

    # Data feeder of the RNN autoencoder. Requirements on data_df:
    #   - its first two columns are ["date", "isin"];
    #   - it is sorted by ["date", "isin"];
    #   - window_size >= 1 is supported.
    ae_data_feeder = RnnAEDataFeeder(
        data_df=pre_train_dataset,
        window_size=10,
        batch_size=1024,
        predictors_size=len(input_columns),
        predictors_dates=pre_train_dataset['date'],
    )

    # Column layout: index columns, the factor groups selected by `test_num`,
    # the autoencoder inputs, and finally the response variable.
    #   test_num == 2: tech + calendar
    #   test_num in (3, 4): tech only
    #   test_num == 5: tech + calendar + release schedule
    data_columns = ["date", "isin"] + config_dict['tech_factors']
    if test_num in (2, 5):
        data_columns = data_columns + config_dict['calendar_factors']
    if test_num == 5:
        data_columns = data_columns + config_dict['release_schedule_factors']
    data_columns = data_columns + input_columns + ["log_adj_volume"]

    # Data feeder of the RNN model. Requirements on data_df:
    #   - its first two columns are ["date", "isin"];
    #   - it is sorted by ["date", "isin"];
    #   - its last column is the response variable;
    #   - only a response variable of shape (1, 1) is supported;
    #   - window_size >= 1 is supported.
    # predictors_size leaves out the two index columns and the response column.
    data_feeder = RNNDataFeeder(
        data_df=final_dataset[data_columns],
        window_size=10,
        batch_size=1024,
        predictors_size=len(data_columns) - 3,
        predictors_dates=final_dataset['date'],
    )

    # Model configuration:
    #   - model_name: the name of the model;
    #   - model_structure: the function that generates the model structure;
    #   - verbose: verbosity during model training;
    #   - lr: learning rate.
    model_config = ModelConfig(
        model_name=f"lstm_l1_{num}_ae_test{test_num}_v6_3",
        model_structure=create_rnn_model,
        verbose=0,
        lr=0.001,
    )

    # Configuration of the pre-trained autoencoder; test 3 is the only run
    # with a non-trainable encoder.
    ae_model_config = ModelConfig(
        model_name=f"ae_lstm_l1_{num}_ae_test{test_num}_v6_3",
        model_structure=create_lstm_autoencoder,
        verbose=0,
        lr=0.001,
        encoder_trainable=(test_num != 3),
    )

    tf.random.set_seed(1234)
    # Train the models with cross validation, early stopping and learning rate reducer.
    # Author's recorded runtime: 4 hs 25 mins.
    train_metrics_dict, test_metrics, ae_metrics = fit_models_with_cross_validation_v2(
        data_feeder=data_feeder,
        ae_data_feeder=ae_data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values,  # test_filter should be a numpy array
        model_config=model_config,
        ae_model_config=ae_model_config,
        model_name=f"lstm_l1_v6_ae_test{test_num}_{num}",
        skip_cv_list=None,
    )
    # Drop the per-run results and feeders before the next configuration.
    del train_metrics_dict, test_metrics, ae_metrics
    del ae_data_feeder, data_feeder

0it [00:00, ?it/s]

2019-01-03 2020-01-02 len=252


2it [00:01,  1.32it/s]

2019-04-01 2020-03-30 len=252


3it [00:02,  1.01it/s]

2019-06-26 2020-06-24 len=252
2019-09-20 2020-09-18 len=252


5it [1:06:58, 978.03s/it] 

2019-12-16 2020-12-14 len=252


6it [1:07:00, 670.03s/it]

2020-03-13 2021-03-12 len=252



