In [1]:
import os
import pickle
import sys

# Make the project root importable before importing from `src`
sys.path.append("..")

import numpy as np
import tensorflow as tf
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
from tqdm import tqdm

from src.model_training_utils import NonRNNDataFeeder, ModelConfig, read_data, load_data_columns_config
from src.train_models import setup_gpu, fit_models_with_cross_validation

# Set up the GPU for this notebook session (setup_gpu is imported from src.train_models)
setup_gpu()

### Target:
##### [1]. Preparation
Prepare the parameters for building and training the dense models
##### [2]. Train models
1. Fit models by cumulatively adding predictor sets
2. Fit models by each predictor set (factors_columns)

### [1]. Preparation
Prepare the parameters for building and training the models

In [2]:
# Step 1: Load data
# Read the full dataset and order its rows by date, then by ISIN.
folder_path = "F:/predictors"
final_dataset = read_data(
    filename="final_dataset", folder_path=folder_path
).sort_values(by=["date", "isin"], ignore_index=True)

# Load the column configuration and show which column groups it defines.
config_dict = load_data_columns_config()
print("config_dict has keys: ", config_dict.keys())

# Keys of config_dict whose columns are used as predictors, listed in the
# order in which the predictor sets are fed to the models below.
factors_columns = [
    "tech_factors",
    "calendar_factors",
    "fundamental_factors",
    "industry_factors",
    "release_schedule_factors",
]

config_dict has keys:  dict_keys(['index_columns', 'tech_factors', 'calendar_factors', 'fundamental_factors', 'industry_factors', 'release_schedule_factors', 'output_columns'])


In [3]:
# Step 2: Split train, validation (create cross validation splitter) and test datasets
# Distinct dates of the dataset, sorted in place in ascending order.
dates_list = final_dataset["date"].unique()
dates_list.sort()
num_of_days = len(dates_list)

step = 60  # step in window movement
h = 60  # time horizon for validation dataset

# Provisionally, 60% of the dates are used for training and validation ...
provisional_threshold = int(num_of_days * 0.6)
# ... and one third of them form the window of the 1st train dataset.
initial_threshold = int(provisional_threshold / 3)
# Final train/validation vs. test split point: the first window plus its
# validation horizon, followed by a whole number of `step`-sized moves.
whole_steps = (provisional_threshold - (initial_threshold + h)) // step
trainval_test_threshold = whole_steps * step + h + initial_threshold

# train_dates: dates used for training and validation in models.
# test_dates: dates used for testing (out-of-sample datasets).
train_dates = dates_list[:trainval_test_threshold]
test_dates = dates_list[trainval_test_threshold:]

# test_filter (an input for model training) flags the rows whose date lies
# between the first and the last test date, both ends included.
test_filter = final_dataset["date"].between(test_dates[0], test_dates[-1])

# Cross validation splitter with a sliding window (non-cumulative datasets)
cv_spliter = SlidingWindowForecastCV(window_size=initial_threshold, step=step, h=h)

In [4]:
# Step 3: set parameters for models

def create_dense_model(win_size, predictors_size):
    """Build the (uncompiled) feed-forward network used for the dense models.

    Args:
        win_size: the window size of each data point (for non-RNN models,
            win_size=1).
        predictors_size: the number of predictors.

    Returns:
        A tf.keras.Model whose input has shape (win_size, predictors_size) and
        which stacks three ReLU Dense layers (32, 16 and 8 units) followed by a
        single-unit Dense output layer without activation.
    """
    hidden_units = (32, 16, 8)
    inputs = tf.keras.layers.Input(shape=(win_size, predictors_size))
    # Chain the hidden layers in order; each one consumes the previous output.
    hidden = inputs
    for units in hidden_units:
        hidden = tf.keras.layers.Dense(units, activation='relu')(hidden)
    output = tf.keras.layers.Dense(1)(hidden)
    return tf.keras.Model(inputs=inputs, outputs=output)

In [5]:
# Build a throwaway model with win_size=1 and 104 predictors and print its layer summary
# (104 is presumably the size of the full predictor set — TODO confirm).
create_dense_model(1, 104).summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 104)]          0         
                                                                 
 dense (Dense)               (None, 1, 32)             3360      
                                                                 
 dense_1 (Dense)             (None, 1, 16)             528       
                                                                 
 dense_2 (Dense)             (None, 1, 8)              136       
                                                                 
 dense_3 (Dense)             (None, 1, 1)              9         
                                                                 
Total params: 4,033
Trainable params: 4,033
Non-trainable params: 0
_________________________________________________________________


### [2]. Train models
##### 1. Fit models by cumulatively adding predictor sets

In [6]:
# 2.1. Fit models by cumulatively adding predictor sets
# Create the output folder before training starts: without it, the first pickle
# dump below would fail only after a whole model has been cross-validated.
os.makedirs("./metrics", exist_ok=True)

input_columns = []
# runtime: ~4:59:37 hours
# num is the 1-based model index used in the model and file names (dense_1, dense_2, ...)
for num, factor in enumerate(tqdm(factors_columns), start=1):
    # input_columns are the column names of predictors in this model; they accumulate
    # across iterations, so model `num` uses the first `num` predictor sets.
    new_columns = config_dict[factor]
    if factor == "release_schedule_factors":
        # The last configured column of this set (the 5th one) is left out. The check is
        # keyed on the set name so it does not depend on the loop counter.
        # NOTE(review): why that column is excluded is not visible here — confirm.
        new_columns = new_columns[:-1]
    input_columns = input_columns + new_columns
    # data_columns includes input_columns and response variable column name
    data_columns = input_columns + ["log_adj_volume"]
    # The data feeder of dense models:
    # (1). it doesn't need ISIN column
    # (2). the data_df should be sorted by ["date", "isin"]
    # (3). the last column of data_df should be the response variable column name
    # (4). it only supports response variable in shape of (1, 1)
    # (5). it only supports window_size=1 (when window_size > 1, the logic is incorrect,
    # as different ISINs are combined into a single data point, which doesn't make sense.)
    data_feeder = NonRNNDataFeeder(data_df=final_dataset[data_columns],
                                   window_size=1,
                                   batch_size=1024,
                                   predictors_size=len(input_columns),
                                   predictors_dates=final_dataset['date'])
    # The configuration of the model:
    # (1). model_name: the name of the model
    # (2). create_dense_model: a function to generate a model structure
    # (3). other parameters:
    #      verbose: verbose during model training
    #      lr: learning rate
    model_config = ModelConfig(model_name=f"dense_{num}_tp",
                               model_structure=create_dense_model,
                               verbose=0, lr=0.001)

    # Set seed for reproducing the result
    tf.random.set_seed(1234)
    # To train models with cross validation, early stopping and learning rate reducer
    train_metrics_dict, test_metrics = fit_models_with_cross_validation(
        data_feeder=data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values, # test_filter should be a numpy array
        model_config=model_config,
        model_name=f"dense_{num}"
    )
    # Save the metric of this model
    with open(f"./metrics/train_metrics_dict_dense_{num}.pkl", "wb") as pickle_file:
        pickle.dump(train_metrics_dict, pickle_file)
    with open(f"./metrics/test_metrics_dense_{num}.pkl", "wb") as pickle_file:
        pickle.dump(test_metrics, pickle_file)
    # Release the memory space
    del train_metrics_dict, test_metrics, data_feeder, model_config

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]
[A [00:00, ?it/s]




[A [07:53, 473.15s/it]




[A [13:49, 404.51s/it]




[A [19:26, 373.56s/it]




[A [32:31, 536.20s/it]




[A [45:32, 624.34s/it]




6it [51:31, 515.32s/it]
 20%|████████████████                                                                | 1/5 [51:32<3:26:08, 3092.00s/it]
[A [00:00, ?it/s]




[A [07:54, 474.57s/it]




[A [14:02, 412.12s/it]




[A [19:39, 377.68s/it]




[A [25:48, 374.37s/it]




[A [38:00, 503.08s/it]




6it [44:01, 440.32s/it]
 40%|███████████████████████████████▏                                              | 2/5 [1:35:34<2:21:22, 2827.37s/it]
[A [00:00, ?it/s]




[A [07:52, 472.32s/it]




[A [20:32, 641.90s/it]




[A [31:54, 660.14s/it]




[A [45:09, 713.16s/it]




[A [57:34, 724.77s/it]




6it [1:09:25, 694.26s/it]
 60%|██████████████████████████████████████████████▊                               | 3/5 [2:45:00<1:54:37, 3438.74s/it]
[A [00:00, ?it/s]




[A [07:57, 477.04s/it]




[A [20:49, 650.81s/it]




[A [33:44, 707.54s/it]




[A [46:11, 723.28s/it]




[A [59:03, 740.69s/it]




6it [1:11:25, 714.25s/it]
 80%|██████████████████████████████████████████████████████████████▍               | 4/5 [3:56:27<1:02:53, 3773.78s/it]
[A [00:00, ?it/s]




[A [08:33, 513.53s/it]




[A [22:15, 695.00s/it]




[A [33:56, 697.63s/it]




[A [45:27, 694.92s/it]




[A [57:01, 694.72s/it]




6it [1:03:06, 631.05s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 5/5 [4:59:37<00:00, 3595.42s/it]


In [7]:
# Reset the global Keras state left over from the models trained above before the next
# round of training. The tf.keras backend is used (matching the tf.keras API that builds
# the models) instead of importing the standalone `keras` package mid-notebook.
K = tf.keras.backend  # keep the `K` alias available in case a later cell relies on it
K.clear_session()

##### 2. Fit models by each predictor set (factors_columns)

In [8]:
# 2.2 Fit models by each predictor set (factors_columns)
# Create the output folder before training starts: without it, the first pickle
# dump below would fail only after a whole model has been cross-validated.
os.makedirs("./metrics", exist_ok=True)

# runtime: 3:48 hours
# The first predictor set is skipped and numbering starts at 2, so `num` is the 1-based
# position of the set in factors_columns (a model using only the first set is already
# fitted as the first cumulative model in 2.1).
for num, factor in enumerate(tqdm(factors_columns[1:]), start=2):
    # input_columns are the column names of predictors in this model
    input_columns = config_dict[factor]
    if factor == "release_schedule_factors":
        # The last configured column of this set (the 5th one) is left out. The check is
        # keyed on the set name so it does not depend on the loop counter.
        # NOTE(review): why that column is excluded is not visible here — confirm.
        input_columns = input_columns[:-1]
    # data_columns includes input_columns and response variable column name
    data_columns = input_columns + ["log_adj_volume"]
    # data_feeder: the same as 2.1
    data_feeder = NonRNNDataFeeder(data_df=final_dataset[data_columns],
                                   window_size=1,
                                   batch_size=1024,
                                   predictors_size=len(input_columns),
                                   predictors_dates=final_dataset['date'])
    # model_config: the same as 2.1
    model_config = ModelConfig(model_name=f"dense_{num}_tp_sc",
                               model_structure=create_dense_model,
                               verbose=0, lr=0.001)

    # Set seed for reproducing the result
    tf.random.set_seed(1234)
    # To train models with cross validation, early stopping and learning rate reducer
    train_metrics_dict, test_metrics = fit_models_with_cross_validation(
        data_feeder=data_feeder,
        cv_spliter=cv_spliter,
        train_dates=train_dates,
        test_filter=test_filter.values, # test_filter should be a numpy array
        model_config=model_config,
        model_name=f"dense_single_cate_{num}"
    )
    # Save the metric of this model
    with open(f"./metrics/train_metrics_dict_dense_single_cate_{num}.pkl", "wb") as pickle_file:
        pickle.dump(train_metrics_dict, pickle_file)
    with open(f"./metrics/test_metrics_dense_single_cate_{num}.pkl", "wb") as pickle_file:
        pickle.dump(test_metrics, pickle_file)
    # Release the memory space
    del train_metrics_dict, test_metrics, data_feeder, model_config

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]
[A [00:00, ?it/s]




[A [05:40, 340.30s/it]




[A [12:25, 378.19s/it]




[A [17:45, 351.86s/it]




[A [23:54, 358.52s/it]




[A [35:48, 486.86s/it]




6it [47:03, 470.57s/it]
 25%|████████████████████                                                            | 1/4 [47:03<2:21:10, 2823.61s/it]
[A [00:00, ?it/s]




[A [08:51, 531.14s/it]




[A [17:13, 514.02s/it]




[A [31:10, 661.42s/it]




[A [40:57, 632.24s/it]




[A [50:05, 601.78s/it]




6it [59:26, 594.45s/it]
 50%|███████████████████████████████████████                                       | 2/4 [1:46:31<1:48:42, 3261.20s/it]
[A [00:00, ?it/s]




[A [09:26, 566.30s/it]




[A [18:18, 546.35s/it]




[A [33:17, 707.25s/it]




[A [48:47, 795.32s/it]




[A [1:00:18, 757.54s/it]




6it [1:17:27, 774.52s/it]
 75%|██████████████████████████████████████████████████████████▌                   | 3/4 [3:03:59<1:04:54, 3894.41s/it]
[A [00:00, ?it/s]




[A [07:08, 428.70s/it]




[A [13:18, 394.28s/it]




[A [26:05, 564.27s/it]




[A [31:23, 467.24s/it]




[A [38:16, 447.38s/it]




6it [44:23, 443.89s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 4/4 [3:48:22<00:00, 3425.66s/it]
