In [1]:
import os
import pickle

import numpy as np
import tensorflow as tf
from keras import backend as K
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
from tqdm import tqdm

from model_training_utils import RNNDataFeeder, ModelConfig, read_data, load_data_columns_config
from train_models import fit_models_with_cross_validation, make_plot

GPU devices are already configured, skipping setup.


### Target:
##### [1]. Preparation
Prepare the parameters for building and training the RNN models
##### [2]. Train models
1. Fit models by cumulatively adding predictor sets
2. Fit models by each predictor set (factors_columns)

### [1]. Preparation
Prepare the parameters for building and training the models

In [2]:
# Step 1: Load data
folder_path = "F:/predictors"
# Read the dataset, discard the "lag_≥5" column, then order the rows by date and ISIN.
final_dataset = (
    read_data(filename="final_dataset", folder_path=folder_path)
    .drop(columns=["lag_≥5"])
    .sort_values(by=["date", "isin"], ignore_index=True)
)

# Column-group configuration (maps a group name to a list of column names)
config_dict = load_data_columns_config()

# Show the content of config_dict:
print("config_dict has keys: ", config_dict.keys())

# Names of the factor groups (keys of config_dict), in the order used for training
factors_columns = [
    'tech_factors',
    'calendar_factors',
    'fundamental_factors',
    'industry_factors',
    'release_schedule_factors',
]

config_dict has keys:  dict_keys(['index_columns', 'tech_factors', 'calendar_factors', 'fundamental_factors', 'industry_factors', 'release_schedule_factors', 'output_columns'])


In [4]:
# Step 2: Split the dates into train/validation and test ranges, and create the
# cross-validation splitter used inside the train/validation range.
dates_list = np.sort(final_dataset["date"].unique())
num_of_days = len(dates_list)

step = 60  # how far the window moves between folds
h = 60  # time horizon (number of dates) of each validation fold

# Roughly 60% of the dates go to training + validation ...
raw_threshold = int(num_of_days * 0.6)
# ... and a third of those form the window of the 1st train dataset.
initial_threshold = int(raw_threshold / 3)
# Move the train_validation/test boundary down so that the range holds the first
# window, one horizon, and a whole number of `step` moves.
num_of_full_steps = (raw_threshold - initial_threshold - h) // step
trainval_test_threshold = initial_threshold + h + num_of_full_steps * step

# train_dates: dates used for training and validation in models.
# test_dates: dates used for testing (out-of-sample datasets).
train_dates, test_dates = (
    dates_list[:trainval_test_threshold],
    dates_list[trainval_test_threshold:],
)

# test_filter (an input for model training): True for rows whose date lies in
# the test range, both ends included.
test_filter = final_dataset["date"].between(test_dates[0], test_dates[-1])

# Cross-validation splitter with a sliding window (non-cumulative datasets)
cv_spliter = SlidingWindowForecastCV(h=h, step=step, window_size=initial_threshold)

In [5]:
# Step 3: set parameters for models 
# predictors_size: the number of predictors
# win_size (for RNN models, win_size>=1): the window size of each data point

def create_rnn_model(win_size, predictors_size):
    """Build an LSTM network with the Keras functional API.

    The network is: Input -> LSTM(32) -> Dense(16, relu) -> Dense(8, relu) -> Dense(1).
    The returned model is not compiled here.

    Args:
        win_size: number of time steps in each input window.
        predictors_size: number of predictors per time step.

    Returns:
        A ``tf.keras.Model`` taking inputs of shape (win_size, predictors_size)
        and producing a single output value.
    """
    layers = tf.keras.layers
    window_input = layers.Input(shape=(win_size, predictors_size))
    # The LSTM is used without kernel or recurrent regularization.
    hidden = layers.LSTM(
        32, kernel_regularizer=None, recurrent_regularizer=None
    )(window_input)
    # Two ReLU layers of decreasing width, created in this order.
    for units in (16, 8):
        hidden = layers.Dense(units, activation='relu')(hidden)
    prediction = layers.Dense(1)(hidden)
    return tf.keras.Model(inputs=window_input, outputs=prediction)

In [6]:
# Sample RNN (LSTM) model structure: windows of 10 time steps with 104 predictors
create_rnn_model(win_size=10, predictors_size=104).summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10, 104)]         0         
                                                                 
 lstm (LSTM)                 (None, 32)                17536     
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 18,209
Trainable params: 18,209
Non-trainable params: 0
_________________________________________________________________


### [2]. Train models
##### 1. Fit models by cumulatively adding predictor sets

In [None]:
# Step 4: Train models
# Test how cumulatively adding each group of predictors affects the models.
# runtime: ~ 17.5 hours

# Create the output folder before the long training run, so that the metrics
# are not lost to a missing directory once training has finished.
os.makedirs("./metrics", exist_ok=True)

input_columns = ['isin']
# num is the 1-based position of the factor group; it is used in model and file names.
for num, factor in enumerate(tqdm(factors_columns), start=1):
    # input_columns are "isin" + the column names of the predictors accumulated so far.
    # The last column of the 5th factor group is left out.
    # NOTE(review): presumably that column is "lag_≥5", which is dropped when the
    # data is loaded — confirm against config_dict.
    new_predictors = config_dict[factor][:-1] if num == 5 else config_dict[factor]
    input_columns = input_columns + new_predictors
    # data_columns includes "date" + input_columns and response variable column name
    data_columns = ["date"] + input_columns + ["log_adj_volume"]
    # The data feeder of RNN models:
    # (1). it needs ISIN  and date columns. The 1st 2 columns of data_df are ["date", "isin"].
    # (2). the data_df should be sorted by ["date", "isin"]
    # (3). the last column of data_df should be the response variable column name
    # (4). it only supports response variable in shape of (1, 1)
    # (5). it supports window_size >= 1
    data_feeder = RNNDataFeeder(data_df=final_dataset[data_columns],
                                window_size=10,
                                batch_size=1024,
                                predictors_size=len(input_columns) - 1,  # "isin" is not a predictor
                                predictors_dates=final_dataset['date'])
    # The configuration of the model:
    # (1). model_name: the name of the model
    # (2). model_structure: a function to generate a model structure
    # (3). other parameters:
    #      verbose: verbose during model training
    #      lr: learning rate
    model_config = ModelConfig(model_name=f"lstm_{num}_tp",
                               model_structure=create_rnn_model,
                               verbose=0, lr=0.001)

    # Set seed for reproducing the result
    tf.random.set_seed(1234)
    # To train models with cross validation, early stopping and learning rate reducer
    train_metrics_dict, test_metrics = fit_models_with_cross_validation(
        data_feeder=data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values, # test_filter should be a numpy array
        model_config=model_config,
        model_name=f"lstm_{num}"
    )
    # Save the metrics of this model
    with open(f"./metrics/train_metrics_dict_lstm_{num}.pkl", "wb") as pickle_file:
        pickle.dump(train_metrics_dict, pickle_file)
    with open(f"./metrics/test_metrics_lstm_{num}.pkl", "wb") as pickle_file:
        pickle.dump(test_metrics, pickle_file)
    # Drop the references to the large objects of this iteration
    del train_metrics_dict, test_metrics, data_feeder, model_config

##### 2. Fit models by each predictor set (factors_columns)

In [9]:
# 2.2 Fit models by each predictor set (factors_columns)
# runtime: ~5.5 hours
# NOTE(review): the progress output saved with this cell shows a 2-item run, while
# this loop iterates over 4 factor groups — the output looks like it comes from a
# partial re-run; re-run the cell to refresh it.

# Create the output folder before the long training run, so that the metrics
# are not lost to a missing directory once training has finished.
os.makedirs("./metrics", exist_ok=True)

# The 1st factor group is skipped here, so numbering starts at 2 to keep num equal
# to the 1-based position of the group in factors_columns.
for num, factor in enumerate(tqdm(factors_columns[1:]), start=2):
    # input_columns are "isin" + the column names of the predictors of this group only.
    # The last column of the 5th factor group is left out.
    # NOTE(review): presumably that column is "lag_≥5", which is dropped when the
    # data is loaded — confirm against config_dict.
    group_predictors = config_dict[factor][:-1] if num == 5 else config_dict[factor]
    input_columns = ['isin'] + group_predictors
    # data_columns includes "date" + input_columns and response variable column name
    data_columns = ["date"] + input_columns + ["log_adj_volume"]

    data_feeder = RNNDataFeeder(data_df=final_dataset[data_columns],
                                window_size=10,
                                batch_size=1024,
                                predictors_size=len(input_columns) - 1,  # "isin" is not a predictor
                                predictors_dates=final_dataset['date'])
    model_config = ModelConfig(model_name=f"lstm_{num}_tp_sc",
                               model_structure=create_rnn_model,
                               verbose=0, lr=0.001)

    # Set seed for reproducing the result
    tf.random.set_seed(1234)
    # To train models with cross validation, early stopping and learning rate reducer
    train_metrics_dict, test_metrics = fit_models_with_cross_validation(
        data_feeder=data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values, # test_filter should be a numpy array
        model_config=model_config,
        model_name=f"lstm_single_cate_{num}"
    )
    # Save the metrics of this model
    with open(f"./metrics/train_metrics_dict_lstm_single_cate_{num}.pkl", "wb") as pickle_file:
        pickle.dump(train_metrics_dict, pickle_file)
    with open(f"./metrics/test_metrics_lstm_single_cate_{num}.pkl", "wb") as pickle_file:
        pickle.dump(test_metrics, pickle_file)
    # Drop the references to the large objects of this iteration
    del train_metrics_dict, test_metrics, data_feeder, model_config

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
[A [00:00, ?it/s]




[A [23:29, 1409.54s/it]




[A [44:07, 1308.86s/it]




[A [1:04:43, 1275.34s/it]




[A [1:27:35, 1313.51s/it]




[A [1:48:14, 1286.63s/it]




6it [2:08:45, 1287.52s/it]
 50%|███████████████████████████████████████                                       | 1/2 [2:08:46<2:08:46, 7726.25s/it]
[A [00:00, ?it/s]




[A [16:29, 989.07s/it]




[A [32:56, 987.86s/it]




[A [54:26, 1125.85s/it]




[A [1:16:16, 1198.71s/it]




[A [1:33:10, 1132.21s/it]




6it [1:50:31, 1105.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 2/2 [3:59:17<00:00, 7178.97s/it]
