### Analyze Results - 2

##### Check the feature importance of LSTM
#### [2]. Major contribution factor
##### 1. Individual factor set test

In [1]:
import pandas as pd
import sys
sys.path.append("..")

from src.model_training_utils import load_data_columns_config
from src.model_analysis import summarize_metrics

# Load the column configuration and keep every key except the first and the
# last one as the list of factor sets.
config_dict = load_data_columns_config()
config_keys = list(config_dict.keys())
factors_columns = config_keys[1:-1]

# Fixed ordering for the 'type' index level (train / validation / test)
custom_order = ['train', 'validation', 'test']
multi_idx = pd.MultiIndex.from_product(
    iterables=[factors_columns, custom_order],
    names=['factor_type', 'type'],
)

GPU devices are already configured, skipping setup.


In [2]:
# File-name prefixes (train metrics, test metrics) for each model family
lstm_filename_roots = ("train_metrics_dict_lstm_", "test_metrics_lstm_")
dense_filename_roots = ("train_metrics_dict_dense_", "test_metrics_dense_")

# Summaries are computed over the same factor sets for both model families
lstm_cum_metrics_summary = summarize_metrics(lstm_filename_roots, factors_columns)
dense_cum_metrics_summary = summarize_metrics(dense_filename_roots, factors_columns)

# Keep the original variable bound to the last roots used (the dense ones)
filename_roots = dense_filename_roots

#### [2]. Major contribution factor
##### 1. Load data and prepare parameters

In [1]:
# Permutation feature importance, for each predictor column k:
# 1. get the validation dataset
# 2. shuffle the values of the k-th column
# 3. make the prediction again and re-evaluate the metric (MSE)

In [1]:
import sys
sys.path.append("..")

from src.model_training_utils import RNNDataFeeder, ModelConfig, read_data, load_data_columns_config, load_metrics
from src.train_models import setup_gpu, fit_models_with_cross_validation
from src.model_analysis import load_model, plot_importance, save_plots_to_html
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
import tensorflow as tf
from keras import backend as K
import numpy as np
import pickle
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

# Set up the GPU within this notebook (setup_gpu comes from src.train_models)
setup_gpu()

GPU devices are already configured, skipping setup.


In [2]:
config_dict = load_data_columns_config()
# Factor sets analysed below. They are listed explicitly here (not derived
# from config_dict); each name is used as a key into config_dict, and the
# list order determines the model number used in the loops further down.
factors_columns = [
    'tech_factors',
    'calendar_factors',
    'fundamental_factors',
    'industry_factors',
    'release_schedule_factors',
]

In [3]:
# Step 1: Load data
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
folder_path = "F:/predictors"
# Read the dataset, discard the "lag_≥5" column, then order rows by date and
# isin with a fresh 0..n-1 index.
final_dataset = (
    read_data(filename="final_dataset", folder_path=folder_path)
    .drop(columns=["lag_≥5"])
    .sort_values(by=["date", "isin"], ignore_index=True)
)

In [4]:
# Step 2: Split train, validation (create cross-validation splitter) and test datasets
dates_list = np.sort(final_dataset["date"].unique())
num_of_days = len(dates_list)

step = 60  # step in window movement
h = 60  # time horizon for validation dataset

# 60% of the dates are used for training and validation
trainval_test_threshold = int(num_of_days * 0.6)
# the window size of the 1st train dataset
initial_threshold = int(trainval_test_threshold / 3)

# Update the train_validation / test split threshold: keep the first
# window + horizon, then only as many whole `step`s as fit below the
# original threshold.
first_fold_end = initial_threshold + h
whole_steps = (trainval_test_threshold - first_fold_end) // step
trainval_test_threshold = whole_steps * step + h + initial_threshold

# train_dates are the dates used for training and validation in models.
train_dates = dates_list[:trainval_test_threshold]
# Create cross-validation splitter with sliding window (non-cumulative datasets)
cv_spliter = SlidingWindowForecastCV(
    h=h,
    step=step,
    window_size=initial_threshold,
)

In [5]:
def create_rnn_model(win_size, predictors_size):
    """Build the LSTM regression network used for the feature-importance runs.

    Architecture (Keras functional API):
    Input(win_size, predictors_size) -> LSTM(32) -> Dense(16, relu)
    -> Dense(8, relu) -> Dense(1).

    Args:
        win_size: First dimension of the per-sample input shape.
        predictors_size: Second dimension of the per-sample input shape.

    Returns:
        A ``tf.keras.Model``; it is not compiled here.
    """
    keras_layers = tf.keras.layers

    window_input = keras_layers.Input(shape=(win_size, predictors_size))
    # No kernel / recurrent regularisation on the LSTM layer
    hidden = keras_layers.LSTM(
        32,
        kernel_regularizer=None,
        recurrent_regularizer=None,
    )(window_input)
    hidden = keras_layers.Dense(16, activation='relu')(hidden)
    hidden = keras_layers.Dense(8, activation='relu')(hidden)
    prediction = keras_layers.Dense(1)(hidden)

    return tf.keras.Model(inputs=window_input, outputs=prediction)

# Only use the last validation split: exhaust the splitter and keep the
# indices of the final (train, validation) pair.
# Reset first so a re-run can never silently reuse a stale `test_idx` left
# over from a previous execution of this cell.
test_idx = None
for _, test_idx in tqdm(cv_spliter.split(train_dates)):
    pass
if test_idx is None:
    # Fail here, with a clear message, instead of with a NameError in the
    # long-running cells below.
    raise ValueError(
        "cv_spliter produced no splits for train_dates; "
        "cannot select a validation window."
    )

6it [00:00, ?it/s]


##### 1. Analyze models by cumulatively adding predictor sets

In [98]:
tf.random.set_seed(4321)

# Start from "isin" plus the first factor set; every iteration below appends
# one more factor set, so model `num` sees the first `num` factor sets.
input_columns = ['isin'] + config_dict[factors_columns[0]]
num = 2
# runtime: ~ 17.5 hours
for factor in tqdm(factors_columns[1:]):
    # input_columns are "isin" + the column names of predictors in this model.
    # For model 5 the last column of the newly added factor set is left out.
    added_columns = config_dict[factor][:-1] if num == 5 else config_dict[factor]
    input_columns = input_columns + added_columns
    # data_columns includes input_columns and the response variable column name
    data_columns = ["date", *input_columns, "log_adj_volume"]

    data_feeder = RNNDataFeeder(
        data_df=final_dataset[data_columns],
        window_size=10,
        batch_size=1024,
        predictors_size=len(input_columns) - 1,  # "isin" is not counted
        predictors_dates=final_dataset['date'],
    )

    checkpoint_path = f"./checkpoints/lstm_{num}_tp_CV6"
    model = load_model(
        data_feeder.window_size,
        data_feeder.predictors_size,
        checkpoint_path,
        create_rnn_model,
    )

    # Rows whose date falls inside the last validation split (bounds inclusive)
    val_start = train_dates[test_idx[0]]
    val_end = train_dates[test_idx[-1]]
    val_filter = (data_feeder.predictors_dates >= val_start) & (
        data_feeder.predictors_dates <= val_end
    )

    # Compute "Permutation Feature Importance"
    # https://www.kaggle.com/code/cdeotte/lstm-feature-importance
    # https://christophm.github.io/interpretable-ml-book/feature-importance.html#feature-importance
    # runtime: 9.4 hours
    baseline_ds = data_feeder.gen_tf_dataset(val_filter)
    fea_import_metrics = {"baseline": model.evaluate(baseline_ds, verbose=1)}
    del baseline_ds

    # One evaluation per predictor; column_idx presumably selects the column
    # that gen_tf_dataset permutes -- confirm against RNNDataFeeder.
    for col_idx in range(data_feeder.predictors_size):
        permuted_ds = data_feeder.gen_tf_dataset(subset_filter=val_filter, column_idx=col_idx)
        # +1 skips "isin", which sits at position 0 of input_columns
        fea_import_metrics[input_columns[col_idx + 1]] = model.evaluate(permuted_ds)
        del permuted_ds

    # Save the metrics of this model
    with open(f"./metrics/fea_import_lstm_{num}.pkl", "wb") as metrics_file:
        pickle.dump(fea_import_metrics, metrics_file)

    # Release the large objects before the next (bigger) model is loaded
    del data_feeder, model, fea_import_metrics
    num += 1

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]



 25%|████████████████████                                                            | 1/4 [22:15<1:06:45, 1335.12s/it]



 50%|███████████████████████████████████████                                       | 2/4 [1:43:24<1:53:48, 3414.30s/it]



 75%|██████████████████████████████████████████████████████████▌                   | 3/4 [5:12:52<2:06:34, 7594.05s/it]



100%|████████████████████████████████████████████████████████████████████████████████| 4/4 [9:21:11<00:00, 8417.82s/it]


##### 2. Analyze models by each predictor set (factors_columns)

In [6]:
tf.random.set_seed(4321)

num = 1
# runtime: 1.8 hours
for factor in tqdm(factors_columns):
    # input_columns are "isin" + the column names of predictors in this model.
    # For model 5 the last column of the factor set is left out.
    set_columns = config_dict[factor][:-1] if num == 5 else config_dict[factor]
    input_columns = ['isin'] + set_columns
    # data_columns includes input_columns and the response variable column name
    data_columns = ["date", *input_columns, "log_adj_volume"]

    data_feeder = RNNDataFeeder(
        data_df=final_dataset[data_columns],
        window_size=10,
        batch_size=1024,
        predictors_size=len(input_columns) - 1,  # "isin" is not counted
        predictors_dates=final_dataset['date'],
    )

    checkpoint_path = f"./checkpoints/lstm_{num}_tp_sc_CV6"
    model = load_model(
        data_feeder.window_size,
        data_feeder.predictors_size,
        checkpoint_path,
        create_rnn_model,
    )

    # Rows whose date falls inside the last validation split (bounds inclusive)
    val_start = train_dates[test_idx[0]]
    val_end = train_dates[test_idx[-1]]
    val_filter = (data_feeder.predictors_dates >= val_start) & (
        data_feeder.predictors_dates <= val_end
    )

    # Compute "Permutation Feature Importance"
    # https://www.kaggle.com/code/cdeotte/lstm-feature-importance
    # https://christophm.github.io/interpretable-ml-book/feature-importance.html#feature-importance
    # runtime: 1.8 hours
    baseline_ds = data_feeder.gen_tf_dataset(val_filter)
    fea_import_metrics = {"baseline": model.evaluate(baseline_ds, verbose=1)}
    del baseline_ds

    # One evaluation per predictor; column_idx presumably selects the column
    # that gen_tf_dataset permutes -- confirm against RNNDataFeeder.
    for col_idx in range(data_feeder.predictors_size):
        permuted_ds = data_feeder.gen_tf_dataset(subset_filter=val_filter, column_idx=col_idx)
        # +1 skips "isin", which sits at position 0 of input_columns
        fea_import_metrics[input_columns[col_idx + 1]] = model.evaluate(permuted_ds)
        del permuted_ds

    # Save the metrics of this model
    with open(f"./metrics/fea_import_lstm_single_cate_{num}.pkl", "wb") as metrics_file:
        pickle.dump(fea_import_metrics, metrics_file)

    # Release the large objects before the next model is loaded
    del data_feeder, model, fea_import_metrics
    num += 1

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]



 25%|████████████████████▊                                                              | 1/4 [04:56<14:48, 296.30s/it]



 50%|█████████████████████████████████████████                                         | 2/4 [30:48<34:29, 1034.85s/it]



 75%|████████████████████████████████████████████████████████████                    | 3/4 [1:30:50<36:47, 2207.34s/it]



100%|████████████████████████████████████████████████████████████████████████████████| 4/4 [1:40:43<00:00, 1510.92s/it]


##### 3. DISPLAY LSTM FEATURE IMPORTANCE

In [106]:
# One importance figure per single-factor-set model; `num` is the model
# number used in the metrics file name.
num = 1
plots = []
for factor in tqdm(factors_columns):
    metrics_path = f"./metrics/fea_import_lstm_single_cate_{num}.pkl"
    plots.append(plot_importance(metrics_path=metrics_path, factor=factor, num=num))
    # Move on to the next model number
    num += 1
# Write all figures to a single HTML file
save_plots_to_html(figures=plots, filename="./feature_importance_plots1.html")

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 19.56it/s]


In [107]:
# One importance figure per cumulative model: model `num` is labelled with
# the first `num` factor sets.
# NOTE(review): the cumulative evaluation cell in this notebook starts at
# num = 2, so ./metrics/fea_import_lstm_1.pkl must already exist on disk --
# confirm where it is produced.
plots = []
included_factors = []
for num, factor in enumerate(tqdm(factors_columns), start=1):
    included_factors.append(factor)
    # Join with " & " so the label has no stray leading separator
    # (previously it started with " & ").
    factors = " & ".join(included_factors)
    fig = plot_importance(metrics_path=f"./metrics/fea_import_lstm_{num}.pkl",
                          factor=factors, num=num)
    plots.append(fig)
# Write all figures to a single HTML file
save_plots_to_html(figures=plots, filename="./feature_importance_plots2.html")

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 17.72it/s]
