### Rerun the models with predictor enhancement

In [None]:
import os
import pickle
import sys

# Make the repository root importable before the project-local imports below.
sys.path.append("..")

import numpy as np
import tensorflow as tf
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
from tqdm import tqdm

from src.model_training_utils import RNNDataFeeder, RnnAEDataFeeder, ModelConfig, read_data, load_data_columns_config
from src.train_models import setup_gpu, fit_models_with_cross_validation, fit_models_with_cross_validation_v2

# Set up the GPU for this notebook session.
# setup_gpu is the project helper imported from src.train_models.
setup_gpu()

### [1]. Preparation
Prepare the parameters for the model building and training

In [2]:
# Step 1: Load data
# Read the pre-training dataset from `folder_path` and sort it by "date" and
# then "isin", renumbering the index after the sort.
folder_path = "F:/predictors_v2"
pre_train_dataset = read_data(
    filename="pre_train_dataset", folder_path=folder_path
).sort_values(by=["date", "isin"], ignore_index=True)

In [3]:
# Step 3: define the model architecture
# predictors_size: the number of predictors fed to the model
# win_size (for RNN models, win_size >= 1): the window size of each data point

def create_rnn_model_normal(win_size, predictors_size):
    """Build the LSTM regression network used in this notebook.

    Architecture: an input of shape ``(win_size, predictors_size)``, one LSTM
    layer with 32 units (no kernel or recurrent regularizer), two ReLU dense
    layers with 16 and 8 units, and a single-unit dense output layer with no
    activation specified.

    Args:
        win_size: Number of time steps in each input window.
        predictors_size: Number of predictors per time step.

    Returns:
        A ``tf.keras.Model``; it is not compiled inside this function.
    """
    layers = tf.keras.layers

    model_input = layers.Input(shape=(win_size, predictors_size))
    hidden = layers.LSTM(
        32,
        kernel_regularizer=None,
        recurrent_regularizer=None,
    )(model_input)
    # Funnel the LSTM output through progressively narrower dense layers.
    for units in (16, 8):
        hidden = layers.Dense(units, activation="relu")(hidden)
    prediction = layers.Dense(1)(hidden)

    return tf.keras.Model(inputs=model_input, outputs=prediction)

In [4]:
# Load the version-2 column configuration and print the keys it contains.
config_dict = load_data_columns_config(version=2)
print("config_dict has keys: ", config_dict.keys())

# Read the final dataset and sort it by "date" and then "isin" with a fresh index.
final_dataset = read_data(
    filename="final_dataset", folder_path=folder_path
).sort_values(by=["date", "isin"], ignore_index=True)

# Keys of config_dict whose column lists are used as predictor sets below.
# v2 selection:
factors_columns = [
    "tech_factors",
    "calendar_factors",
    "fundamental_factors",
    "industry_factors",
    "release_schedule_factors",
]
# v3 selection: same list with "agg_industry_factors" in place of
# "industry_factors".

config_dict has keys:  dict_keys(['index_columns', 'tech_factors', 'calendar_factors', 'fundamental_factors', 'industry_factors', 'agg_industry_factors', 'release_schedule_factors', 'output_columns'])


In [5]:
# Step 2: Split the dates into a train/validation range and a test range, and
# create the cross-validation splitter.
dates_list = final_dataset["date"].unique()
dates_list.sort()
num_of_days = len(dates_list)

h = 60  # time horizon of each validation fold
step = 60  # how far the window moves between folds

# Provisionally reserve the first 60% of dates for training and validation;
# the first training window covers one third of that range.
trainval_test_threshold = int(num_of_days * 0.6)
initial_threshold = int(trainval_test_threshold / 3)
# Trim the threshold so that, after the first window and its horizon, the
# remaining train/validation dates are a whole number of steps.
trainval_test_threshold -= (trainval_test_threshold - initial_threshold - h) % step

# train_dates feed training and validation; test_dates are held out
# (out-of-sample).
train_dates, test_dates = (
    dates_list[:trainval_test_threshold],
    dates_list[trainval_test_threshold:],
)
# Boolean mask over final_dataset rows whose date lies in the test range
# (both ends inclusive); used as an input for model training.
test_filter = final_dataset["date"].between(test_dates[0], test_dates[-1])

# Sliding-window splitter (non-cumulative training sets).
cv_spliter = SlidingWindowForecastCV(window_size=initial_threshold, step=step, h=h)

### [2]. Train models
##### 1. Fit models by cumulatively adding predictor sets

In [15]:
tf.random.set_seed(1234)

# Create the output folder up front: each model takes a long time to train,
# and without this a missing "./metrics" directory would only surface as a
# FileNotFoundError when the first metrics file is written.
os.makedirs("./metrics", exist_ok=True)

# Train one LSTM per step, adding one more predictor set to the inputs each
# time (model `num` uses the first `num` entries of factors_columns).
input_columns = []
for num, factor in enumerate(factors_columns, start=1):
    # Build a new list rather than extending in place, so the lists stored in
    # config_dict are never mutated.
    input_columns = input_columns + config_dict[factor]
    data_columns = ["date", "isin"] + input_columns + ["log_adj_volume"]
    data_feeder = RNNDataFeeder(
        data_df=final_dataset[data_columns],
        window_size=10,
        batch_size=1024,
        predictors_size=len(input_columns),
        predictors_dates=final_dataset["date"],
    )

    model_config = ModelConfig(
        model_name=f"lstm_{num}_tp_v2",
        model_structure=create_rnn_model_normal,
        verbose=0,
        lr=0.001,
    )

    # Reset the seed before every fit so each model starts from the same
    # TensorFlow random state.
    tf.random.set_seed(1234)
    # Train with cross validation (per the original note: with early stopping
    # and a learning-rate reducer). Original runtime note: 1:55.
    train_metrics_dict, test_metrics = fit_models_with_cross_validation(
        data_feeder=data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values,  # test_filter should be a numpy array
        model_config=model_config,
        model_name=f"lstm_v2_{num}",
    )

    # Save the metrics of this model.
    with open(f"./metrics/train_metrics_dict_lstm_v2_{num}.pkl", "wb") as pickle_file:
        pickle.dump(train_metrics_dict, pickle_file)
    with open(f"./metrics/test_metrics_lstm_v2_{num}.pkl", "wb") as pickle_file:
        pickle.dump(test_metrics, pickle_file)

    # Release the memory held by this iteration before the next model.
    del train_metrics_dict, test_metrics, data_feeder, model_config

0it [00:00, ?it/s]



1it [14:56, 896.09s/it]



2it [31:50, 965.59s/it]



3it [45:27, 898.02s/it]



4it [1:06:23, 1039.35s/it]



5it [1:23:30, 1034.66s/it]



6it [1:40:16, 1002.73s/it]
0it [00:00, ?it/s]



1it [15:15, 915.28s/it]



2it [33:10, 1009.20s/it]



3it [48:36, 971.50s/it] 



4it [1:06:21, 1008.11s/it]



5it [1:24:14, 1031.81s/it]



6it [1:41:38, 1016.36s/it]


##### 2. Fit one model per predictor set (each entry of factors_columns)

In [8]:
# Create the output folder up front: each model takes a long time to train,
# and without this a missing "./metrics" directory would only surface as a
# FileNotFoundError when the first metrics file is written.
os.makedirs("./metrics", exist_ok=True)

# Train one LSTM per predictor set, using only that set's columns as inputs.
# Original runtime note: 6.5 hours.
for num, factor in enumerate(tqdm(factors_columns), start=1):
    # input_columns are the column names of the predictors in this model.
    input_columns = config_dict[factor]
    data_columns = ["date", "isin"] + input_columns + ["log_adj_volume"]
    data_feeder = RNNDataFeeder(
        data_df=final_dataset[data_columns],
        window_size=10,
        batch_size=1024,
        predictors_size=len(input_columns),
        predictors_dates=final_dataset["date"],
    )

    model_config = ModelConfig(
        model_name=f"lstm_{num}_tp_sc_v2",
        model_structure=create_rnn_model_normal,
        verbose=0,
        lr=0.001,
    )

    # Reset the seed before every fit so each model starts from the same
    # TensorFlow random state.
    tf.random.set_seed(1234)
    # Train with cross validation (per the original note: with early stopping
    # and a learning-rate reducer).
    train_metrics_dict, test_metrics = fit_models_with_cross_validation(
        data_feeder=data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values,  # test_filter should be a numpy array
        model_config=model_config,
        model_name=f"lstm_single_cate_v2_{num}",
    )

    # Save the metrics of this model.
    with open(f"./metrics/train_metrics_dict_lstm_single_cate_v2_{num}.pkl", "wb") as pickle_file:
        pickle.dump(train_metrics_dict, pickle_file)
    with open(f"./metrics/test_metrics_lstm_single_cate_v2_{num}.pkl", "wb") as pickle_file:
        pickle.dump(test_metrics, pickle_file)

    # Release the memory held by this iteration before the next model.
    del train_metrics_dict, test_metrics, data_feeder, model_config

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
[A [00:00, ?it/s]




[A [08:55, 535.86s/it]




[A [17:38, 528.04s/it]




[A [26:20, 525.21s/it]




[A [35:06, 525.74s/it]




[A [43:50, 525.04s/it]




6it [52:41, 526.88s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [52:41<00:00, 3161.59s/it]
