### Analyze Results -2

##### Check the feature importance of LSTM

In [13]:
import sys
sys.path.append("..")  # make the project-level `src` package importable from this notebook's folder

from src.model_training_utils import RNNDataFeeder, RnnAEDataFeeder, ModelConfig, read_data, load_data_columns_config
# `sys` is the built-in interpreter module, not a package, so `sys.model_analysis`
# can never be imported; the analysis helpers are imported from the `src` package instead.
from src.model_analysis import load_model, plot_importance, save_plots_to_html
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
import tensorflow as tf
from keras import backend as K
import numpy as np
import pickle
from tqdm import tqdm
import pandas as pd

# Column configuration (version 2); later cells index it by factor-group name.
config_dict = load_data_columns_config(version=2)
# Names of the factor groups used as keys of config_dict.
factors_columns = ['tech_factors',
 'calendar_factors',
 'fundamental_factors',
 'industry_factors',
 'release_schedule_factors']

#### [2]. Major contribution factor
##### 1. Load data and prepare parameters

In [2]:
# Permutation feature importance, outline:
#   1. take the validation dataset
#   2. shuffle the values of the k-th predictor column
#   3. predict again and re-compute the metric (MSE) to see how much it degrades

In [3]:
# Step 1: load the full dataset and order its rows by date, then by ISIN
folder_path = "F:/predictors_v2"
final_dataset = (
    read_data(filename="final_dataset", folder_path=folder_path)
    .sort_values(by=["date", "isin"], ignore_index=True)
)

In [4]:
# Step 2: split the dates into a train/validation range (with a
# cross-validation splitter on top of it) and a remaining test range
dates_list = final_dataset["date"].unique()
dates_list.sort()
num_of_days = len(dates_list)

step = 60  # number of dates the window moves between folds
h = 60  # number of dates in each validation window

# Start from 60% of the dates for training + validation ...
trainval_test_threshold = int(num_of_days * 0.6)
# ... of which the first third is the size of the training window.
initial_threshold = int(trainval_test_threshold / 3)

# Shrink the threshold so that, after the first train + validation window,
# the remaining dates are a whole number of `step`-sized moves.
whole_steps = (trainval_test_threshold - (initial_threshold + h)) // step
trainval_test_threshold = whole_steps * step + h + initial_threshold

# Dates available to the models for training and validation.
train_dates = dates_list[:trainval_test_threshold]

# Sliding-window (fixed-size, non-cumulative) cross-validation splitter.
cv_spliter = SlidingWindowForecastCV(
    h=h,
    step=step,
    window_size=initial_threshold,
)

In [5]:
# First date after the train/validation range, and the last date in the dataset
dates_list[trainval_test_threshold], dates_list[-1]

(datetime.date(2022, 3, 25), datetime.date(2023, 12, 29))

In [9]:
# Print the first and last date of the train and validation windows of every fold
for train, validation in cv_spliter.split(train_dates):
    print("train: ", train_dates[train[0]], train_dates[train[-1]])
    print("validation: ", train_dates[validation[0]], train_dates[validation[-1]])

train:  2020-01-03 2020-10-16
validation:  2020-10-19 2021-01-13
train:  2020-03-31 2021-01-13
validation:  2021-01-14 2021-04-12
train:  2020-06-25 2021-04-12
validation:  2021-04-13 2021-07-08
train:  2020-09-21 2021-07-08
validation:  2021-07-09 2021-10-01
train:  2020-12-15 2021-10-01
validation:  2021-10-04 2021-12-28
train:  2021-03-15 2021-12-28
validation:  2021-12-29 2022-03-24


In [10]:
# Number of dates in the train / validation windows of the last fold
# (`train` and `validation` are left over from the split loop above)
len(train), len(validation)

(200, 60)

In [6]:
def create_lstm_encoder(win_size, predictors_size, latent_dim=50):
    """Build the LSTM encoder that compresses a window of predictors.

    Args:
        win_size: Number of time steps in each input window.
        predictors_size: Number of predictor columns per time step.
        latent_dim: Number of units of the LSTM bottleneck layer.

    Returns:
        A ``tf.keras.Model`` taking ``(win_size, predictors_size)`` windows and
        returning the latent vector repeated ``win_size`` times.
    """
    layers = tf.keras.layers

    window_input = layers.Input(shape=(win_size, predictors_size))

    # LSTM bottleneck layer: only the last output is kept
    # (return_sequences=False) and the recurrent weights carry an L1 penalty.
    bottleneck = layers.LSTM(
        latent_dim,
        recurrent_regularizer=tf.keras.regularizers.l1(0.0001),
        return_sequences=False,
    )
    latent_vector = bottleneck(window_input)

    # Repeat the latent vector so the output has one entry per time step again.
    repeated_latent = layers.RepeatVector(win_size)(latent_vector)

    return tf.keras.Model(inputs=window_input, outputs=repeated_latent)


def create_rnn_model(encoder, win_size, other_input_size):
    """Stack an LSTM regression head on top of an existing encoder.

    Args:
        encoder: Keras model whose output feeds the regression head.
        win_size: Number of time steps in each input window.
        other_input_size: Number of leading input columns that bypass the
            encoder. When it is 0 (or negative) the encoder's own input is
            used as the model input.

    Returns:
        A ``tf.keras.Model`` with a single-unit dense output.
    """
    layers = tf.keras.layers

    if other_input_size > 0:
        total_columns = other_input_size + encoder.input_shape[-1]
        model_input = layers.Input(shape=(win_size, total_columns), name='full_input')
        # The first `other_input_size` columns skip the encoder ...
        bypass_part = layers.Lambda(lambda x: x[:, :, :other_input_size])(model_input)
        # ... and the remaining (last) columns are the encoder's input.
        encoder_part = layers.Lambda(lambda x: x[:, :, other_input_size:])(model_input)
        encoded_part = encoder(encoder_part)
        head_input = layers.Concatenate()([bypass_part, encoded_part])
    else:
        # No extra columns: reuse the encoder's input and output directly.
        model_input = encoder.input
        head_input = encoder.output

    hidden = layers.LSTM(32, kernel_regularizer=None, recurrent_regularizer=None)(head_input)
    hidden = layers.Dense(16, activation='relu')(hidden)
    hidden = layers.Dense(8, activation='relu')(hidden)
    prediction = layers.Dense(1)(hidden)

    return tf.keras.Model(inputs=model_input, outputs=prediction)

# Run through every fold so that `test_idx` ends up holding the validation
# indices of the last split, which is the only one used below.
fold_iterator = tqdm(cv_spliter.split(train_dates))
for _, test_idx in fold_iterator:
    pass

6it [00:00, 849.08it/s]


##### 2. Analyze models with different predictor sets

In [8]:
# Checkpoint folder of each analysed model, keyed by model number.
checkpoint_path_dict = {
    # tech+fun/trainable
    1: "./checkpoints/lstm_l1_3_ae_test4_v6_CV6",
    # all factors
    2: "./checkpoints/lstm_l1_5_ae_v6_CV6",
    # excluding industry factors
    3: "./checkpoints/lstm_l1_3_ae_test5_v6_CV6",
}

In [9]:
# Names of the factor groups, i.e. the config_dict keys whose values are lists of column names.
# NOTE(review): this repeats the definition from the first cell and is not used in this cell — confirm it is still needed.
factors_columns=['tech_factors', 'calendar_factors', 'fundamental_factors', 
                 'industry_factors', 'release_schedule_factors']

def create_model(win_size, predictors_size, encoder_input_size=len(config_dict["fundamental_factors"])):
    """Assemble the full model: LSTM encoder plus the LSTM regression head.

    Args:
        win_size: Number of time steps in each input window.
        predictors_size: Total number of predictor columns fed to the model.
        encoder_input_size: Number of (trailing) columns routed through the
            encoder; defaults to the number of fundamental factors.

    Returns:
        The model produced by ``create_rnn_model``.
    """
    # Columns that are not encoder inputs go straight to the regression head.
    other_input_size = predictors_size - encoder_input_size
    return create_rnn_model(
        create_lstm_encoder(win_size, encoder_input_size),
        win_size,
        other_input_size,
    )


import os

# Factor groups (config_dict keys) placed ahead of the fundamental factors for
# each model. The fundamental factors always come last because the model
# routes the trailing columns through the encoder.
leading_factor_groups = {
    1: ['tech_factors'],
    2: ['tech_factors', 'calendar_factors', 'industry_factors', 'release_schedule_factors'],
    3: ['tech_factors', 'calendar_factors', 'release_schedule_factors'],
}

# Set to True to ignore previously saved metrics and evaluate everything again.
recompute_all = False

input_columns = config_dict['fundamental_factors']

# Create the output folder up front so hours of evaluation are not lost to a
# missing directory at save time.
os.makedirs("./metrics", exist_ok=True)


def _dump_metrics(metrics, path):
    """Write `metrics` to `path` via a temp file so an interrupted run never leaves a truncated pickle."""
    tmp_path = path + ".tmp"
    with open(tmp_path, "wb") as pickle_file:
        pickle.dump(metrics, pickle_file)
    os.replace(tmp_path, path)


for path_num in tqdm([1, 2, 3]):
    leading_columns = [column
                       for group in leading_factor_groups[path_num]
                       for column in config_dict[group]]
    # Layout: 2 id columns, the predictors, then the target column.
    data_columns = ["date", "isin"] + leading_columns + input_columns + ["log_adj_volume"]
    feature_names = data_columns[2:-1]

    # Resume from whatever an earlier (possibly interrupted) run already saved,
    # instead of hard-coding the index of the feature to restart from.
    metrics_path = f"./metrics/fea_import_lstm_final_pathnum{path_num}.pkl"
    fea_import_metrics = {}
    if os.path.exists(metrics_path) and not recompute_all:
        # NOTE: pickle is only acceptable here because the file is produced by
        # this notebook; never load metrics files from an untrusted source.
        with open(metrics_path, "rb") as pickle_file:
            fea_import_metrics = pickle.load(pickle_file)

    pending_idx = [i for i, name in enumerate(feature_names) if name not in fea_import_metrics]
    if "baseline" in fea_import_metrics and not pending_idx:
        # Everything for this model is already on disk.
        continue

    data_feeder = RNNDataFeeder(data_df=final_dataset[data_columns],
                                window_size=10,
                                batch_size=1024,
                                predictors_size=len(data_columns) - 3,
                                predictors_dates=final_dataset['date'])

    checkpoint_path = checkpoint_path_dict[path_num]
    model = load_model(data_feeder.window_size, data_feeder.predictors_size,
                       checkpoint_path, create_model)

    # Restrict the evaluation to the dates of the last validation fold.
    val_filter = (data_feeder.predictors_dates >= train_dates[test_idx[0]]) & (
        data_feeder.predictors_dates <= train_dates[test_idx[-1]]
    )

    # Compute "Permutation Feature Importance"
    # https://www.kaggle.com/code/cdeotte/lstm-feature-importance
    # https://christophm.github.io/interpretable-ml-book/feature-importance.html#feature-importance
    # runtime: 9.4 hours
    if "baseline" not in fea_import_metrics:
        valid_ds = data_feeder.gen_tf_dataset(val_filter)
        fea_import_metrics["baseline"] = model.evaluate(valid_ds, verbose=1)
        del valid_ds
        _dump_metrics(fea_import_metrics, metrics_path)

    tf.random.set_seed(4321)
    for i in pending_idx:
        # Predictor i corresponds to data_columns[i + 2], i.e. feature_names[i].
        valid_ds_i = data_feeder.gen_tf_dataset(subset_filter=val_filter, column_idx=i)
        fea_import_metrics[feature_names[i]] = model.evaluate(valid_ds_i)
        del valid_ds_i
        # Save after every feature so a crash costs at most one evaluation.
        _dump_metrics(fea_import_metrics, metrics_path)

    # Save the metrics of this model: baseline first, then the features in
    # column order, regardless of the order in which they were computed.
    ordered_metrics = {key: fea_import_metrics[key] for key in ["baseline"] + feature_names}
    _dump_metrics(ordered_metrics, metrics_path)
    del data_feeder, model, fea_import_metrics


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]



 50%|██████████████████████████████████████▌                                      | 1/2 [3:42:50<3:42:50, 13370.05s/it]



100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [7:11:12<00:00, 12936.19s/it]


##### 3. Display LSTM feature importance

In [15]:
# `sys` is the built-in interpreter module, not a package, so `sys.model_analysis`
# cannot be imported; the helpers are imported from the project's `src` package.
from src.model_analysis import plot_importance, save_plots_to_html, load_metrics

In [16]:
# 1. plot version:
# Label describing which factor groups each model was trained on.
factor_num_dict = {
    1: "tech&fundamental",
    2: "tech&calendar&fundamental&industry&release",
    3: "tech&calendar&fundamental&release",
}

# One importance figure per model, collected in model order.
plots = [
    plot_importance(
        metrics_path=f"./metrics/fea_import_lstm_final_pathnum{path_num}.pkl",
        factor=factor_num_dict[path_num],
        num=path_num,
    )
    for path_num in [1, 2, 3]
]

save_plots_to_html(figures=plots, filename="./final_feature_importance_plots.html")

In [32]:
# 2. numerical version:
# For each model, list the features whose shuffling changes the root MSE by
# more than 0.1% relative to the baseline, largest change first.
for path_num in [1, 3, 2]:
    metrics_by_feature = load_metrics(f"./metrics/fea_import_lstm_final_pathnum{path_num}.pkl")
    importance_df = pd.DataFrame(metrics_by_feature, index=["MSE", "R2"]).T

    print(f"\n{path_num}: ", factor_num_dict[path_num])
    print("baseline: ", importance_df.loc["baseline"].tolist())

    # Relative change of the root MSE with respect to the baseline.
    baseline_rmse = np.sqrt(importance_df.loc["baseline", 'MSE'])
    importance_df['sqrt_MSE_per_change'] = np.sqrt(importance_df['MSE']) / baseline_rmse - 1
    importance_df["abs_sqrt_MSE_per_change"] = importance_df['sqrt_MSE_per_change'].abs()

    is_notable = importance_df["abs_sqrt_MSE_per_change"] > 0.001
    notable_df = importance_df[is_notable].sort_values(by=["abs_sqrt_MSE_per_change"], ascending=False)
    print(notable_df.to_string())


1:  tech&fundamental
baseline:  [0.1632276177406311, 0.938768744468689]
                       MSE        R2  sqrt_MSE_per_change  abs_sqrt_MSE_per_change
lag_logvol_ma1    0.620996  0.768985             0.950508                 0.950508
lag_logvol_ma22   0.615579  0.770771             0.941981                 0.941981
lag_logvol_ma5    0.447396  0.833275             0.655577                 0.655577
lag_logvol_ma252  0.341693  0.872590             0.446842                 0.446842

3:  tech&calendar&fundamental&release
baseline:  [0.15162834525108337, 0.9434125423431396]
                       MSE        R2  sqrt_MSE_per_change  abs_sqrt_MSE_per_change
lag_logvol_ma5    0.808872  0.699419             1.309669                 1.309669
lag_logvol_ma1    0.567780  0.789116             0.935085                 0.935085
lag_logvol_ma252  0.450066  0.832505             0.722852                 0.722852
lag_logvol_ma22   0.263879  0.901694             0.319204                 0.319204
≥3   