# Permutation Importance

In [12]:
# Import libraries
import numpy as np
import pandas as pd
import os
import random
import dill
import pickle
from tabulate import tabulate

import sys

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
from hmeasure import h_score
try:
  from catboost import CatBoostClassifier
except:
  !pip install catboost
  from catboost import CatBoostClassifier

import time
from datetime import timedelta

from sklearn.inspection import permutation_importance

In [2]:
# Detect whether the notebook runs on Google Colab: the import only succeeds there.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)

    # Use the copy of the repository stored on the Google Drive
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Not on Colab: the repository root is the parent of the working directory
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission


In [3]:
# Data folder of the repository
path_to_data = os.path.join(
    path_to_repo,
    "data",
    "",  # empty last component -> the path ends with a separator
)
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\


In [4]:
# Folder holding the processed datasets
path_to_processed = os.path.join(
    path_to_data,
    "processed",
    "",  # empty last component -> the path ends with a separator
)
# exist_ok=True: nothing happens when the directory is already there
os.makedirs(path_to_processed, exist_ok=True)
print(path_to_processed)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\processed\


In [5]:
# Folder holding the saved models and the cached importances
path_to_models = os.path.join(
    path_to_data,
    "models",
    "",  # empty last component -> the path ends with a separator
)
# exist_ok=True: nothing happens when the directory is already there
os.makedirs(path_to_models, exist_ok=True)
print(path_to_models)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\models\


In [18]:
# Folder where save_fig writes the figures
path_to_figures = os.path.join(
    path_to_data,
    "figures",
    "",  # empty last component -> the path ends with a separator
)
# exist_ok=True: nothing happens when the directory is already there
os.makedirs(path_to_figures, exist_ok=True)
print(path_to_figures)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\figures\


In [20]:
# PARAMETERS

session_seed = 42    # set seed for our session
include_val = False  # True -> a validation set is handled as well
tune_models = True   # True -> parameter tuning is performed

lemmatize = True     # False -> stemming is done instead
spacy = True

# Suffix appended to the file names: empty without lemmatization,
# otherwise '_lemma_spacy' when spacy is set and '_lemma' when it is not
if not lemmatize:
    lemma_tag = ""
elif spacy:
    lemma_tag = "_lemma_spacy"
else:
    lemma_tag = "_lemma"

seed_tag = "_" + str(session_seed)

tune_tag = '_tuned' if tune_models else ''

random.seed(session_seed)

scoring = 'roc_auc'  # what score should we use for permutation importance

In [7]:
def load_datasets(method, include_val = True, target = False):
    """
    Load the train, test and (optionally) validation set saved for the chosen method.

    Files are read with pandas.read_feather from `path_to_processed`; each name is
    built as '<prefix><split>_<method><seed_tag><lemma_tag>', where prefix is 'y_'
    when target is set and empty otherwise.

    method: string for the processing method we want to load
    include_val: if we want to load also the validation set, default True
    target: if we are importing our target variables, default False

    Returns the tuple (train, test, val); val is an empty list when
    include_val is false.
    """
    prefix = 'y_' if target else ''

    def read_split(split):
        # Read the feather file of one split ('train', 'test' or 'val')
        return pd.read_feather(f'{path_to_processed}{prefix}{split}_{method}{seed_tag}{lemma_tag}')

    train = read_split('train')
    test = read_split('test')
    val = read_split('val') if include_val else []
    return train, test, val

In [8]:
# Load the target variables of each split (target = True selects the 'y_' files, method is passed as '').
# y_val is an empty list when include_val is False.
y_train, y_test, y_val = load_datasets(method = '', include_val = include_val, target = True)

# Permutation Importance

In [16]:
def df_perm_importance(X_valid, result):
    """
    Build a table of permutation importances rescaled so that the best feature scores 100.

    X_valid: dataframe whose columns give the feature names; they are paired
             position by position with the values in `result`
    result: object exposing `importances_mean`, one value per column of X_valid

    Returns a dataframe with the columns "features" and "importance", sorted by
    decreasing importance. When the largest mean importance is not positive the
    values are only multiplied by 100 and not rescaled.
    """
    mean_importances = np.asarray(result.importances_mean, dtype=float)
    top = np.max(mean_importances)
    # Rescale by the best feature only when it is positive: a zero maximum would
    # produce NaN/inf and a negative one would flip the ranking.
    scale = top if top > 0 else 1.0
    table = pd.DataFrame({"features": X_valid.columns,
                          "importance": 100 * mean_importances / scale})
    return table.sort_values("importance", ascending=False)

In [19]:
# define a function that saves figures
def save_fig(fig_id, tight_layout=True):
    """
    Save the current matplotlib figure as <path_to_figures>/<fig_id>.png at 300 dpi.

    fig_id: name of the file, without extension
    tight_layout: if True, plt.tight_layout() is applied before saving
    """
    # NOTE(review): `plt` is not imported in the visible cells - confirm that
    # matplotlib.pyplot is imported as plt before this function is called.
    file_name = fig_id + ".png"
    destination = os.path.join(path_to_figures, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format='png', dpi=300)

## Frequency

In [11]:
# Load the train/test(/val) sets stored for the 'frequency' processing method
method = 'frequency'
train, test, val = load_datasets(method, include_val = include_val)

### GBoosting

In [14]:
model = 'gboost'
tune_models = False

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

### CatBoost

In [14]:
model = 'catboost'
tune_models = False

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

## OneHot

In [11]:
# Load the train/test(/val) sets stored for the 'onehot' processing method
method = 'onehot'
train, test, val = load_datasets(method, include_val = include_val)

### Random Forest

In [14]:
model = 'rand_for'
tune_models = True

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

### LightGBM

In [14]:
model = 'lightgbm'
tune_models = True

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

## TF-IDF

In [11]:
# Load the train/test(/val) sets stored for the 'tf_idf' processing method
method = 'tf_idf'
train, test, val = load_datasets(method, include_val = include_val)

### LightGBM

In [14]:
model = 'lightgbm'
tune_models = True

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

### CatBoost

In [14]:
model = 'catboost'
tune_models = False

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

## SVD

In [11]:
# Load the train/test(/val) sets stored for the 'svd' processing method
method = 'svd'
train, test, val = load_datasets(method, include_val = include_val)

### GBoosting

In [14]:
model = 'gboost'
tune_models = False

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

### LightGBM

In [14]:
model = 'lightgbm'
tune_models = True

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

## LDA

In [11]:
# Load the train/test(/val) sets stored for the 'lda' processing method
method = 'lda'
train, test, val = load_datasets(method, include_val = include_val)

### LightGBM

In [14]:
model = 'lightgbm'
tune_models = True

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)

### CatBoost

In [14]:
model = 'catboost'
tune_models = False

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

In [15]:
with open(f'{path_to_models}_{model}_{method}{seed_tag}{lemma_tag}', 'rb') as file:
    estimator = dill.load(file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_train = pickle.load(file)
    print('Importance loaded')
except:
    %time import_train = permutation_importance(estimator, train, y_train, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(train, import_train)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_train, file)

In [None]:
# Then we save all our results
try:
    with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'rb') as file:
        import_test = pickle.load(file)
    print('Importance loaded')
except:
    %time import_test = permutation_importance(estimator, test, y_val, n_jobs = -1, scoring = scoring, random_state = session_seed)

In [None]:
df_imp = df_perm_importance(test, import_test)
df_imp[:30].plot(x="features", y="importance", kind="barh", figsize = (12,10), legend = False).invert_yaxis()
save_fig(f'perm_imp_{method}_{model}')

In [None]:
# Then we save all our results
with open(f'{path_to_models}test_imp{model}{method}{lemma_tag}.pkl', 'wb') as file:
    pickle.dump(import_test, file)