In [None]:
# import packages
import pickle
import os
import re
#from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, f1_score, accuracy_score, roc_auc_score, roc_curve, auc, recall_score

In [None]:
# create function to import models
def get_exported_models(import_dir):
    """
    Load every pickled model stored directly inside ``import_dir``.

    Parameters
    ----------
    import_dir : str
        Path of the directory that holds the exported ``.pkl`` files.

    Returns
    -------
    list of tuple
        ``(file_name, model)`` pairs, where ``file_name`` is the file name
        without its extension. The pairs are ordered by file name.

    Raises
    ------
    Whatever ``os.listdir``, ``open`` or ``pickle.load`` raise, e.g.
    ``FileNotFoundError`` when ``import_dir`` does not exist.
    """
    models = []
    # os.listdir returns entries in arbitrary order, so sort them to make
    # the returned list identical across runs and machines
    for file in sorted(os.listdir(import_dir)):
        # anything that is not a pickle export is ignored
        if not file.endswith('.pkl'):
            continue
        # splitext gives (name, extension); keep only the name
        file_name = os.path.splitext(file)[0]
        # the full path is needed to open the file
        filepath = os.path.join(import_dir, file)
        # SECURITY: unpickling can execute arbitrary code, so only point
        # this function at directories whose contents are trusted
        # the models were exported as binary, hence "rb"
        with open(filepath, "rb") as f:
            model = pickle.load(f)
        models.append((file_name, model))
    return models

In [None]:
# Colab folders/files only exist for the lifetime of the runtime, so the
# directory layout has to be rebuilt every time we reconnect and want to
# run the notebook again.

# one sub-folder per model family, used for importing the models
parent_dir_models = '/content/models'
import_folders = ['gbc-models', 'logit-models', 'nn-models', 'ran-forest-models', 'svc-models']

# separate folders for the summary/threshold data we import
# and for the plots we export
parent_dir_other = '/content/'
other_dirs = ['data', 'cm-roc-plots', 'cm-shap-plots']

# create everything in a single pass; exist_ok makes re-running harmless
for parent_dir, folders in ((parent_dir_models, import_folders), (parent_dir_other, other_dirs)):
    for folder in folders:
        folder_path = os.path.join(parent_dir, folder)
        os.makedirs(folder_path, exist_ok=True)

In [None]:
# import all models
#-----------------#
# Each family follows the same two steps: name the import directory, then
# load every pickle in it as a list of (file_name, model) tuples.
# NOTE(review): the ValueError captured in this notebook's output
# ("node array from the pickle has an incompatible dtype") looks like a
# scikit-learn version mismatch between export and import - confirm, and
# pin the version used to export the models.

# import logit models
logit_import_dir = "/content/models/logit-models"
logit_models = get_exported_models(logit_import_dir)

# import random forest models
random_forest_import_dir = "/content/models/ran-forest-models"
random_forest_models = get_exported_models(random_forest_import_dir)

# import the nn models
nn_import_dir = "/content/models/nn-models"
nn_models = get_exported_models(nn_import_dir)

# import the svc models
svc_import_dir = "/content/models/svc-models"
svc_models = get_exported_models(svc_import_dir)

# import gradient boosting models
# the path gets its own name, like the other families, instead of
# temporarily living in the variable that ends up holding the models
gradient_boosting_import_dir = "/content/models/gbc-models"
gradient_boosting_models = get_exported_models(gradient_boosting_import_dir)

Note: pickled scikit-learn models are only supported when loaded with the same scikit-learn version that exported them; see https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ValueError: node array from the pickle has an incompatible dtype:
- expected: [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]
- got     : {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}

In [None]:
# load the input tables
#---------------------#

# raw data used for testing; the first column of the sheet becomes the index
data_raw = pd.read_excel('/content/data/data.xlsx', index_col=0)

# summary table, used to apply the correct threshold to each model
model_summary = pd.read_excel('/content/data/all-summaries.xlsx')
# keep only the lag, threshold and model-name columns
threshold_lag_data = model_summary.loc[:, ['lag', 'threshold', 'model']]

In [None]:
# helper function to flatten list of nested lists
# the predicted values are returned like this:
# i.e. [[0],[1],[0],[1]]
# we need to convert it to this:
# i.e. [0,1,0,1]
def flatten_list(matrix_list):
    """Return a flat list holding the first element of every row.

    Example: ``[[0], [1], [0], [1]]`` becomes ``[0, 1, 0, 1]``.
    """
    return [inner[0] for inner in matrix_list]

In [None]:
# set up helper function to extract the correct model
# based on the lag; this takes the model list and the lag to match
def extract_correct_lag_model(models, lag):
    """Return the model whose file name refers to the given lag.

    Parameters
    ----------
    models : list of tuple
        ``(file_name, model)`` pairs.
    lag : int
        Lag to look for; a name matches when it contains
        ``"<lag>-month-lag"`` with no digit directly before the number.

    Returns
    -------
    object or None
        The model of the first matching pair, or ``None`` when no name
        matches.
    """
    # the lookbehind stops a short lag from matching inside a longer one,
    # e.g. lag 2 must not pick up a "12-month-lag" file
    lag_match_pattern = rf"(?<!\d){re.escape(str(lag))}-month-lag"
    for model in models:
        # search() scans the whole name, so any prefix/suffix is allowed;
        # model[0] is the name and model[1] the model itself
        if re.search(lag_match_pattern, model[0]):
            return model[1]
    # nothing matched; callers get None rather than an exception
    return None

In [None]:
# create helper function to get the correct threshold for the given lag
def extract_correct_threshold(model, data, lag):
    """Look up the threshold stored for one model at one lag.

    Parameters
    ----------
    model : str
        Model name as it appears in the ``model`` column.
    data : pandas.DataFrame
        Table with ``lag``, ``model`` and ``threshold`` columns.
    lag : int
        Lag value as it appears in the ``lag`` column.

    Returns
    -------
    The ``threshold`` value of the first row matching both ``lag`` and
    ``model``.

    Raises
    ------
    IndexError
        If no row matches the requested lag and model.
    """
    # select the threshold cells of the rows that match both keys
    matches = data.loc[(data['lag'] == lag) & (data['model'] == model), 'threshold']
    if matches.empty:
        # same exception type as indexing an empty array would give,
        # but the message says which lookup failed
        raise IndexError(f"no threshold found for model {model!r} at lag {lag!r}")
    # only one value is expected, so take the first raw number
    return matches.values[0]

In [None]:
# create helper function to make vote classification
# this will be applied to each row in the prediction dataframe
# if the sum is greater than or equal to 3, we assign a model vote
# of 1, otherwise 0
def assign_vote(row):
    """Return 1 when the votes in ``row`` sum to 3 or more, otherwise 0."""
    return 1 if row.sum() >= 3 else 0

In [None]:
# lags (in months) to loop over; each value selects the matching
# "<lag>-month-lag" model and names the shifted recession column
lags = [3, 6, 9, 12, 18]

In [None]:
# the models are already trained so we just need to test the models
# create function to run
def run_concensus_model(data, lag, test_size, summary_data):
    """Evaluate the majority-vote ("concensus") classifier for one lag.

    The recession indicator is shifted back by ``lag`` rows so that each
    row's features are paired with the indicator ``lag`` rows ahead. A test
    split is then drawn and the five already-trained models for that lag
    each cast a 0/1 vote using their own probability threshold.
    ``assign_vote`` turns the five votes into the final class.

    Parameters
    ----------
    data : pandas.DataFrame
        Data with an ``nber_recession`` column plus the feature columns.
        A copy is made, so the caller's frame is not modified.
    lag : int
        Lag used for the shift, for the target column name and for
        selecting the models and thresholds.
    test_size : float or int
        Forwarded to ``train_test_split``.
    summary_data : pandas.DataFrame
        Threshold table with ``lag``, ``threshold`` and ``model`` columns.

    Returns
    -------
    dict
        ``data`` (lagged frame), ``model_votes`` (per-model votes plus the
        ``concensus_vote`` column), ``y_true``, ``predicted_vals_binary``,
        ``confusion_matrix`` and ``model_metrics`` (accuracy, precision,
        recall, f1, roc_auc).

    Notes
    -----
    Reads the module-level lists ``random_forest_models``, ``svc_models``,
    ``gradient_boosting_models``, ``nn_models`` and ``logit_models``.
    """

    # make a copy of the original DataFrame to avoid modifying it
    data_copy = data.copy()

    # set the recession indicator back by the lag so that t0 is aligned with t+lag
    target_col = f"nber_recession_{lag}_month_lag"
    data_copy[target_col] = data_copy['nber_recession'].shift(-lag)

    # drop the original recession column and the rows left without a value
    # (the shift leaves the last `lag` rows without a target)
    data_copy = data_copy.drop(columns=['nber_recession'])
    data_copy = data_copy.dropna()

    # split into features and target
    X = data_copy.drop(columns=[target_col])
    y = data_copy[target_col]

    # the models are already trained, so only the test part is kept
    # NOTE(review): presumably random_state=42 reproduces the split used
    # when the models were trained - confirm, otherwise test rows may have
    # been seen during training
    _, X_test, _, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # extract the relevant model for the given lag
    random_forest = extract_correct_lag_model(random_forest_models, lag)
    svc = extract_correct_lag_model(svc_models, lag)
    gradient_boosting = extract_correct_lag_model(gradient_boosting_models, lag)
    neural_network = extract_correct_lag_model(nn_models, lag)
    logit_model = extract_correct_lag_model(logit_models, lag)

    # get the threshold for the relevant lag
    rf_threshold = extract_correct_threshold('rf', summary_data, lag)
    svc_threshold = extract_correct_threshold('svc', summary_data, lag)
    gbc_threshold = extract_correct_threshold('gbc', summary_data, lag)
    nn_threshold = extract_correct_threshold('nn', summary_data, lag)
    logit_threshold = extract_correct_threshold('logit', summary_data, lag)

    # a model votes 1 when its positive-class probability exceeds its threshold
    rand_forest_pred = (random_forest.predict_proba(X_test)[:,1] > rf_threshold).astype(int)
    svc_pred = (svc.predict_proba(X_test)[:,1] > svc_threshold).astype(int)
    gbc_pred = (gradient_boosting.predict_proba(X_test)[:,1] > gbc_threshold).astype(int)
    logit_pred = (logit_model.predict_proba(X_test)[:,1] > logit_threshold).astype(int)

    # the neural network is handled differently: predict() is called
    # directly and its nested output is flattened first
    nn_predictions_prob_raw = neural_network.predict(X_test)
    # flatten_list returns a plain Python list; convert it to an array so
    # the comparison and .astype work whether the threshold is a NumPy
    # scalar or a built-in float
    nn_predictions_prob = np.asarray(flatten_list(nn_predictions_prob_raw))
    nn_pred = (nn_predictions_prob > nn_threshold).astype(int)

    # create a dataframe of the individual model votes
    model_votes = pd.DataFrame({
    'rand_forest_pred': rand_forest_pred,
    'svc_pred': svc_pred,
    'gbc_pred': gbc_pred,
    'nn_pred': nn_pred,
    'logit_pred': logit_pred
    })

    # simple voting system: assign_vote is applied to each row and returns
    # 1 when three or more models give a positive classification, else 0
    model_votes['concensus_vote'] = model_votes.apply(assign_vote, axis=1)

    # the concensus votes are the final predictions
    y_pred = model_votes['concensus_vote'].values

    # create a confusion matrix to visualize results
    conf_mat = confusion_matrix(y_test, y_pred)

    # all metrics, roc_auc included, are computed from the hard 0/1 votes
    metrics_obj= {
       'accuracy': accuracy_score(y_test, y_pred),
       'precision': precision_score(y_test, y_pred),
       'recall': recall_score(y_test, y_pred),
       'f1': f1_score(y_test, y_pred),
       'roc_auc': roc_auc_score(y_test, y_pred),
       }

    # return summary output
    return {'data': data_copy,
            'model_votes': model_votes,
            'y_true': y_test,
            'predicted_vals_binary': y_pred,
            'confusion_matrix': conf_mat,
            'model_metrics': metrics_obj}

In [None]:
# evaluate the concensus model once per lag, holding out 20% of the rows;
# each entry pairs a "<lag>_month_lag_results" label with that lag's output
concensus_results = [
    (
        f"{lag}_month_lag_results",
        run_concensus_model(data_raw, lag, 0.2, threshold_lag_data),
    )
    for lag in lags
]



[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.0s finished




[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished




[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished


In [None]:
# collect the scores of every lag into a single table
headers_metrics = ['lag', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'conf_matrix']
# one row per lag
iteration_metrics = []
# every result is a (label, output) pair
for lag_label, lag_output in concensus_results:
    # row layout: label first, then the metric values in the order the
    # metrics dict stores them, then the confusion matrix
    metric_row = [lag_label]
    metric_row.extend(lag_output['model_metrics'].values())
    metric_row.append(lag_output['confusion_matrix'])
    iteration_metrics.append(metric_row)
# turn the rows into a dataframe
metric_data = pd.DataFrame(iteration_metrics, columns=headers_metrics)

print(metric_data)

                    lag  accuracy  precision    recall        f1   roc_auc  \
0   3_month_lag_results  0.952703   0.733333  0.785714  0.758621  0.877932   
1   6_month_lag_results  0.939189   0.666667  0.800000  0.727273  0.877444   
2   9_month_lag_results  0.904762   0.619048  0.684211  0.650000  0.810855   
3  12_month_lag_results  0.890411   0.523810  0.647059  0.578947  0.784770   
4  18_month_lag_results  0.910345   0.681818  0.714286  0.697674  0.828917   

            conf_matrix  
0   [[130, 4], [3, 11]]  
1   [[127, 6], [3, 12]]  
2   [[120, 8], [6, 13]]  
3  [[119, 10], [6, 11]]  
4   [[117, 7], [6, 15]]  


In [None]:
# check whether the concensus model over- or underestimates recessions
headers_false_true_summary = ['lag', 'recession_true', 'recession_true_pred', 'recession_false', 'recession_false_pred', 'false_pos_rate', 'false_neg_rate']

# one row of counts and rates per lag
iteration_summaries = []

# every result is a (label, output) pair
for lag_label, lag_output in concensus_results:
    # put actual and predicted classes side by side
    y_true_pred = pd.DataFrame({'y_actual': lag_output['y_true'], 'y_predicted': lag_output['predicted_vals_binary']})

    # boolean masks reused by the counts below
    actual_is_pos = y_true_pred['y_actual'] == 1
    actual_is_neg = y_true_pred['y_actual'] == 0
    pred_is_pos = y_true_pred['y_predicted'] == 1
    pred_is_neg = y_true_pred['y_predicted'] == 0

    # class counts: actual recessions / non-recessions and predicted recessions
    n_actual_pos = np.sum(actual_is_pos)
    n_actual_neg = np.sum(actual_is_neg)
    n_pred_pos = np.sum(pred_is_pos)

    # share of actual negatives flagged as positive, and of actual positives missed
    false_pos_rate = np.sum(actual_is_neg & pred_is_pos) / n_actual_neg
    false_neg_rate = np.sum(actual_is_pos & pred_is_neg) / n_actual_pos

    # the row follows the header order; predicted negatives are whatever
    # is left after removing the predicted positives
    iteration_summaries.append([
        lag_label,
        n_actual_pos,
        n_pred_pos,
        n_actual_neg,
        len(y_true_pred) - n_pred_pos,
        false_pos_rate,
        false_neg_rate,
    ])

# convert to df
complete_summary_stats = pd.DataFrame(iteration_summaries, columns=headers_false_true_summary)

# print results
print(complete_summary_stats)

                    lag  recession_true  recession_true_pred  recession_false  \
0   3_month_lag_results              14                   15              134   
1   6_month_lag_results              15                   18              133   
2   9_month_lag_results              19                   21              128   
3  12_month_lag_results              17                   21              129   
4  18_month_lag_results              21                   22              124   

   recession_false_pred  false_pos_rate  false_neg_rate  
0                   133        0.029851        0.214286  
1                   130        0.045113        0.200000  
2                   126        0.062500        0.315789  
3                   125        0.077519        0.352941  
4                   123        0.056452        0.285714  


In [None]:
# export both summary tables to a single workbook
path = '/content/concensus-summary.xlsx'
# open, write and close in one step: the context manager closes the writer
# even if one of the writes raises, so it is never left open
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    # per-lag metrics
    metric_data.to_excel(writer, sheet_name='summary_stats', index=False)
    # per-lag class counts and false positive/negative rates
    complete_summary_stats.to_excel(writer, sheet_name='pos_neg_acc_summary', index=False)