In [0]:
from IPython import get_ipython


 # part 5: models

 construct gastrointestinal leak and VTE risk prediction models

 * 'analysis populations' refer to the training, validation, and test populations

 ## 1. preliminaries

In [0]:
# Notebook setup (script form of the %matplotlib / %reload_ext / %autoreload magics).
# Render matplotlib figures inline in the cell output.
get_ipython().run_line_magic('matplotlib', 'inline')
# (Re)load the autoreload extension and use mode 2 so edits to imported
# modules (e.g. python_modules.*) are picked up without restarting the kernel.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')



In [0]:
#from fastai.structured import *
#from fastai.column_data import *
import numpy as np
import pandas as pd
# Keep printed arrays short: arrays with more than 50 elements are summarised,
# showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)



In [0]:
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler #ADASYN, SMOTE
import python_modules.constants as constants
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, accuracy_score, auc
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import torch as torch
from torch.utils.data import DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import os



In [0]:
from python_modules.pytorch_tabular import TabularDataset, FeedForwardNN 


In [0]:
# Cap the number of rows pandas displays for a frame at 100.
# NOTE(review): the option key is written 'display.max_row'; presumably pandas
# resolves it to 'display.max_rows' by pattern match - confirm.
pd.set_option('display.max_row', 100)

# Cap the number of columns pandas displays for a frame at 50.
pd.set_option('display.max_columns', 50)



In [0]:
# seed random number generators so that runs are repeatable
# numpy's global generator
np.random.seed(6478)
# torch's generator
torch.manual_seed(2368)
# separate seeds passed to XGBClassifier for the leak and clot models (see run_models)
xgb_seed_leak = 32457
xgb_seed_clot = 21345



In [0]:
# Device configuration - included as a formality. we did not use a GPU so running on GPU is not actually tested. 
# Picks 'cuda' when torch reports an available GPU, otherwise 'cpu'; the trailing
# `; device` echoes the chosen device as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'); device


 ## 2. configure features and outcomes

 specify features to be used in each model and outcome to be predicted. these specifications dictate data processing and model building.

In [0]:
# Feature configuration shared (as module-level globals) by the functions below.
# cat_vars: names of the categorical features
cat_vars = constants.CATEGORICAL_PRE
# con_vars: names of the continuous features
con_vars = constants.CONTINUOUS_PRE
# cat_ord: indexed by feature name to get that feature's ordered category list
# (used as cat_ord[v] in change_type and get_variable_importance)
cat_ord  = constants.CATEGORICAL_ORDER


 ## 3.  data processing

 * note: 'analysis populations' refer to the training, validation, and test populations

 ### 3.1 format and resample data for handling by models

 get data into correct types

In [0]:
def change_type(df, cats, cons, cat_ord, out, out_set):
    """
    select the model columns and cast them to the dtypes the models expect.

    works on a copy, so the frame passed in is left untouched and the call can
    be repeated on the same input with the same result. an empty `cats` list
    is accepted.

    Parameters
    ----------
    df: pandas DataFrame
      study data; must contain every column named below.
    cats: list of str
      categorical feature names. each is replaced by its integer category
      code (int64); values not listed in `cat_ord[v]`, including missing
      values, get code -1.
    cons: list of str
      continuous feature names; cast to float32.
    cat_ord: dict
      maps each categorical feature name to its ordered list of categories.
    out: str
      name of the outcome column.
    out_set: str
      name of the column holding the analysis population label.

    Returns
    -------
    pandas DataFrame restricted to cats + cons + [out, out_set].
    """
    # select first, then copy: only the columns we need are duplicated
    typed = df[cats + cons + [out, out_set]].copy()
    for v in cats:
        ordered = typed[v].astype(CategoricalDtype(cat_ord[v], ordered=True))
        typed[v] = ordered.cat.codes.astype(np.int64)
    for v in cons:
        typed[v] = typed[v].astype('float32')
    return typed


 reindex dataframe by analysis population

In [0]:
def re_index(df, out_set):
    """
    append the analysis-population column to the index.

    returns a new frame instead of modifying `df` in place, so re-running the
    call on the same input does not fail on an already-consumed column.

    Parameters
    ----------
    df: pandas DataFrame
      frame containing the column `out_set`.
    out_set: str
      name of the column holding the analysis population label; it becomes
      the innermost index level and is removed from the columns.
    """
    return df.set_index(out_set, append=True, verify_integrity=False)


 standardize  continuous data, get a scaling function to map  validation and testing data

 * group by analysis population
 * scale and normalize the training population; generate a standardization function to map the same transform onto the other analysis populations
 * assume population mean for missing continuous data
 * apply standardization to validation and testing populations
 * set `nan` to training population average, which is zero after standardization
 * confirm no `nan` remain - we set null continuous feature instances to the training population mean in the last step of the standardization function.

In [0]:
def standardize(df, scaler_fn, cons, out_set):
    """
    split the data by analysis population and standardize continuous features.

    the scaler is fit on the training rows only (populations 'train' and
    'val_1' combined) and then applied to the 'val_2' and 'test' rows.
    missing continuous values are set to 0 after scaling.

    Parameters
    ----------
    df: pandas DataFrame
      data whose index has a level named `out_set` holding the population
      labels 'train', 'val_1', 'val_2' and 'test'.
    scaler_fn: object with fit_transform / transform methods
    cons: list of str
      continuous feature names.
    out_set: str
      name of the index level holding the population label.

    Returns
    -------
    (training, validation, testing) DataFrames
    """
    def _population(label):
        # copy so that the assignments below never write back into `df`
        return df.xs(label, level=out_set).copy()

    training = pd.concat([_population('train'), _population('val_1')])
    validation = _population('val_2')
    testing = _population('test')

    # fit on training rows only, then reuse the fitted transform elsewhere
    training[cons] = scaler_fn.fit_transform(training[cons].values)
    validation[cons] = scaler_fn.transform(validation[cons].values)
    testing[cons] = scaler_fn.transform(testing[cons].values)

    # unknown continuous values become 0 on the scaled axis
    for part in (training, validation, testing):
        for v in cons:
            part[v] = part[v].fillna(0).astype('float32')

    # report nan left in the training frame (all columns)
    print('the number of nan remaining is: ' + str(training.isnull().values.sum()))

    return training, validation, testing


 balance
 * we can use pandas for simple oversampling. imbalanced-learn library implements advanced oversampling algorithms.
 * if using a synthetic oversampler, choose one that can synthesize categorical variables since most of the features in our model are categorical
 * [imblearn oversampling algorithms](https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-adasyn)
 * [good blog post on how to do this](https://beckernick.github.io/oversampling-modeling/)

 <b>Oversample only the training set. There is no reason to oversample any other set. Do not validate or test against oversampled data. Doing so will invalidate the model. </b>

 imbalanced-learn appends resampled instances to the end of the data. (to prove this, run `df_tr_resam[-50:-1]['LEAK']` before and after shuffling). therefore, after resampling, reshuffle the oversampled data

In [0]:
def balance(df_tr, oversamp_fn, cats, cons, out_v):
    """
    oversample the training data and return it shuffled.

    Parameters
    ----------
    df_tr: pandas DataFrame
      training data, features plus the outcome column.
    oversamp_fn: object with a fit_resample(X, y) method
    cats: list of str
      categorical feature names; cast to int64 in the result.
    cons: list of str
      continuous feature names; cast to float32 in the result.
    out_v: str
      name of the outcome column.

    Returns
    -------
    pandas DataFrame with a fresh 0..n-1 index; the outcome is the last column.
    """
    features = df_tr.drop([out_v], axis=1)
    labels = df_tr[out_v].copy()
    resampled_X, resampled_y = oversamp_fn.fit_resample(features, labels)

    # rebuild a frame from the resampler output and re-attach the outcome
    rebalanced = pd.DataFrame(resampled_X, columns=features.columns)
    rebalanced[out_v] = resampled_y

    # shuffle the rows so resampled instances are not grouped together
    rebalanced = rebalanced.sample(frac=1).reset_index(drop=True)

    # restore the dtypes, which the resampling step may have changed
    rebalanced = rebalanced.astype({v: 'float32' for v in cons})
    rebalanced = rebalanced.astype({v: 'int64' for v in cats})
    return rebalanced



In [0]:
def get_incidence_of_outcome(s):
    """
    Calculates incidence of outcome variable in a series
    Parameters
    ----------
    s: pandas series
      the series object containing boolean (or 0/1) values for
      outcome of interest. missing values are ignored.

    Returns
    -------
    (cases, incidence): the number of entries equal to 1/True and that number
    divided by the count of non-missing entries. a series without any case
    gives (0, 0.0) rather than raising; a series without any non-missing
    entry gives (0, nan).
    """
    total = int(s.count())
    # compare instead of indexing value_counts() by label: this works for
    # both bool and 0/1 series and when no case is present at all
    cases = int((s == 1).sum())
    incidence = cases / total if total else float('nan')
    return cases, incidence


 ### 3.2 import data

 process the data using the steps described above

 * the reason we print incidences of outcomes of interest in training, validation, and testing sets in this step is because after this step, cases are oversampled in the training set.

In [0]:
# Load the split study data; the first CSV column becomes the index.
# NOTE(review): data_for_models() reads this same file again into its own
# local variable, so this module-level frame is not what the models use.
df_main = pd.read_csv('study_data/study_data_split.csv', low_memory=False, index_col=0)



In [0]:
def data_for_models(categorical_vars, categorical_order, continuous_vars, outcome_var, outcome_set,
                    data_path='study_data/study_data_split.csv', oversamp_ratio=0.5):
    """
    read the study data and run the full preprocessing pipeline for one outcome.

    steps: cast types (change_type), index by analysis population (re_index),
    standardize continuous features (standardize), print the outcome incidence
    of each population, then oversample the training set only (balance).

    Parameters
    ----------
    categorical_vars: list of str
      categorical feature names.
    categorical_order: dict
      maps each categorical feature to its ordered category list.
    continuous_vars: list of str
      continuous feature names.
    outcome_var: str
      name of the outcome column.
    outcome_set: str
      name of the column assigning each row to an analysis population.
    data_path: str
      csv file to read; defaults to the study data file.
    oversamp_ratio: float
      value handed to RandomOverSampler as `sampling_strategy`.

    Returns
    -------
    (oversampled training, validation, testing, original training) DataFrames
    """
    study_data = pd.read_csv(data_path, low_memory=False, index_col=0)
    scaler = StandardScaler()

    # `ratio` is the deprecated spelling of this argument in imbalanced-learn
    # (removed in later releases); `sampling_strategy` is its replacement.
    oversampler = RandomOverSampler(sampling_strategy=oversamp_ratio)

    study_data = change_type(study_data, categorical_vars, continuous_vars,
                             categorical_order, outcome_var, outcome_set)
    study_data = re_index(study_data, outcome_set)
    df_train, df_validate, df_test = standardize(study_data, scaler, continuous_vars, outcome_set)

    # incidences are printed here because the training set is oversampled next
    print('incidence in training set -', get_incidence_of_outcome(df_train[outcome_var]))
    print('incidence in validation set -', get_incidence_of_outcome(df_validate[outcome_var]))
    print('incidence in testing set -', get_incidence_of_outcome(df_test[outcome_var]))

    df_train_rebalanced = balance(df_train, oversampler, categorical_vars, continuous_vars, outcome_var)

    return df_train_rebalanced, df_validate, df_test, df_train


 reformat the data so it can be used in all analyses below:

In [0]:
def import_data(outcome_var, outcome_set):
    """
    build every dataset the models need for one outcome and export them to csv.

    the four processed frames are also written to 'study_data/' (prefixed with
    the outcome name) so that logistic regression can be re-fit outside this
    notebook, e.g. with statsmodels.

    Parameters
    ----------
    outcome_var: str
      name of the outcome column.
    outcome_set: str
      name of the column assigning each row to an analysis population.

    Returns
    -------
    dict with the full frames ('training', 'validation', 'testing',
    'training_original') and their label / feature splits. the
    'train_original_*' entries hold the non-oversampled training cohort, used
    to check model performance on the original training data.
    """
    training, validation, testing, training_original = data_for_models(
        cat_vars, cat_ord, con_vars, outcome_var, outcome_set)

    print('export scaled & resampled data for analysis with statsmodels')
    exports = [(training, '_training.csv'),
               (validation, '_validation.csv'),
               (testing, '_testing.csv'),
               (training_original, '_training_original.csv')]
    for frame, suffix in exports:
        frame.to_csv('study_data/' + outcome_var + suffix)

    def _split(frame):
        # (labels, features) for one frame
        return frame[outcome_var], frame.drop(outcome_var, axis=1)

    train_labels, train_features = _split(training)
    valid_labels, valid_features = _split(validation)
    testing_labels, testing_features = _split(testing)
    training_original_labels, training_original_features = _split(training_original)

    return {
        'training': training,
        'validation': validation,
        'testing': testing,
        'training_original': training_original,
        'train_labels': train_labels,
        'train_features': train_features,
        'valid_labels': valid_labels,
        'valid_features': valid_features,
        'testing_labels': testing_labels,
        'testing_features': testing_features,
        'train_original_labels': training_original_labels,
        'train_original_features': training_original_features,
    }


 ## 4. configure models

 ### 4.1 artificial neural network

In [0]:
def prep_train_data(data, features_cat, features_con, outcome):
    """
    wrap the training frame in a shuffling DataLoader with batches of 640 rows.

    only the columns features_cat + features_con + [outcome] are passed on.
    """
    model_columns = features_cat + features_con + [outcome]
    dataset = TabularDataset(data=data[model_columns], cat_cols=features_cat, output_col=outcome)
    return DataLoader(dataset, 640, shuffle=True, num_workers=1)

def prep_test_val_data(data, features_cat, features_con, outcome):
    """
    wrap an evaluation frame in a DataLoader that yields all rows as one
    unshuffled batch (batch size = number of rows in `data`).

    only the columns features_cat + features_con + [outcome] are passed on.
    """
    n_rows = len(data)
    model_columns = features_cat + features_con + [outcome]
    dataset = TabularDataset(data=data[model_columns], cat_cols=features_cat, output_col=outcome)
    return DataLoader(dataset, n_rows, shuffle=False, num_workers=1)

def get_net(data, features_cat, features_con):
    """
    construct the feed-forward network and move it to `device`.

    each categorical feature contributes an embedding spec
    (number of distinct values in `data` + 1, 5); the net has a single linear
    layer of size 550 with dropout 0.4, embedding dropout 0.2 and one output.
    """
    emb_dims = [(int(data[col].nunique()) + 1, 5) for col in features_cat]
    hidden_sizes = [550]
    hidden_dropouts = [0.4] * len(hidden_sizes)
    model = FeedForwardNN(emb_dims,
                          no_of_cont=len(features_con),
                          lin_layer_sizes=hidden_sizes,
                          output_size=1,
                          emb_dropout=0.2,
                          lin_layer_dropouts=hidden_dropouts)
    return model.to(device)

def ann_train(net, crit, opt, training_data, losses_train):
    """
    run one training epoch and record its mean batch loss.

    Parameters
    ----------
    net: torch module called as net(cont_x, cat_x)
    crit: loss function called as crit(preds, y)
    opt: torch optimizer over the parameters of `net`
    training_data: iterable of (y, cont_x, cat_x) batches
    losses_train: list
      the mean of this epoch's batch losses is appended to it.
    """
    net.train()
    epoch_loss = []

    for y, cont_x, cat_x in training_data:
        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y = y.to(device)

        # Forward Pass
        preds = net(cont_x, cat_x)
        loss = crit(preds, y)

        # Backward Pass and Optimization
        opt.zero_grad()
        loss.backward()
        opt.step()

        # .item() copies the scalar to the host, so this also works when the
        # loss lives on a GPU (where .numpy() raises)
        epoch_loss.append(loss.item())

    losses_train.append(np.mean(epoch_loss))

def evaluate(net, crit, val_data, losses_val, aucs_val):
    """
    score the network on validation data and record its loss and AUC.

    predictions and targets are collected over every batch, so the AUC covers
    the whole validation set even when it is split into several batches.

    Parameters
    ----------
    net: torch module called as net(cont_x, cat_x)
    crit: loss function called as crit(preds, y)
    val_data: iterable of (y, cont_x, cat_x) batches
    losses_val: list
      the mean of the batch losses is appended to it.
    aucs_val: list
      the area under the ROC curve is appended to it.

    Raises
    ------
    ValueError if `val_data` yields no batches.
    """
    net.eval()
    epoch_loss, batch_preds, batch_targets = [], [], []

    # no gradients are needed for scoring
    with torch.no_grad():
        for y, cont_x, cat_x in val_data:
            cat_x = cat_x.to(device)
            cont_x = cont_x.to(device)
            y = y.to(device)
            preds = net(cont_x, cat_x)
            epoch_loss.append(crit(preds, y).item())
            # .cpu() before .numpy() so this also works for GPU tensors
            batch_preds.append(preds.cpu().numpy().reshape(-1))
            batch_targets.append(y.cpu().numpy().reshape(-1))

    if not batch_preds:
        raise ValueError('val_data yielded no batches')

    pred_y = np.concatenate(batch_preds)
    target_y = np.concatenate(batch_targets)
    fpr, tpr, _ = roc_curve(target_y, pred_y)
    aucs_val.append(auc(fpr, tpr))
    losses_val.append(np.mean(epoch_loss))
    
def ann_test(net, crit, test_data):
    """
    run the network over a dataset and return its predictions and mean loss.

    predictions from every batch are concatenated (in batch order), so the
    result covers the whole dataset even when it is split into several batches.

    Parameters
    ----------
    net: torch module called as net(cont_x, cat_x)
    crit: loss function called as crit(preds, y)
    test_data: iterable of (y, cont_x, cat_x) batches

    Returns
    -------
    dict with 'preds' (numpy array of network outputs) and 'loss' (mean of the
    batch losses).

    Raises
    ------
    ValueError if `test_data` yields no batches.
    """
    net.eval()
    epoch_loss, batch_preds = [], []

    # no gradients are needed for scoring
    with torch.no_grad():
        for y, cont_x, cat_x in test_data:
            cat_x = cat_x.to(device)
            cont_x = cont_x.to(device)
            y = y.to(device)
            preds = net(cont_x, cat_x)
            epoch_loss.append(crit(preds, y).item())
            # .cpu() before .numpy() so this also works for GPU tensors
            batch_preds.append(preds.cpu().numpy())

    if not batch_preds:
        raise ValueError('test_data yielded no batches')

    results = {'preds': np.concatenate(batch_preds),
               'loss': np.mean(epoch_loss)}

    return results
        
def train_and_val(net, dl_train, dl_valid, path_to_model, patience = 4, max_its = 20, min_its = 0):
    """
    train with early stopping on validation AUC, saving the best weights.

    each epoch trains on `dl_train` and scores on `dl_valid`. whenever the
    current epoch holds the best validation AUC so far, the state dict is
    written to `path_to_model`. once more than `min_its` epochs have run,
    training stops when the best epoch lies `patience` or more epochs back;
    otherwise it stops after `max_its` epochs.

    Returns
    -------
    (training losses, validation losses, validation AUCs), one entry per epoch
    """
    optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
    bce = nn.BCELoss()
    losses_train, losses_val, aucs_val = [], [], []

    it = 0
    while it < max_its:
        ann_train(net, bce, optimizer, dl_train, losses_train)
        evaluate(net, bce, dl_valid, losses_val, aucs_val)

        # epoch with the best validation AUC so far (first one on ties)
        best_epoch = np.argmax(aucs_val)
        if it == best_epoch:
            update = ', validation auc: ' + str(aucs_val[-1]) + '***'
            torch.save(net.state_dict(), path_to_model)
        else:
            update = ''

        # early stopping, only considered after the warm-up epochs
        if it > min_its and len(aucs_val) - best_epoch > patience:
            print('END epoch ' + str(it) + update)
            print('the best model saved after epoch ' + str(best_epoch))
            break

        print('epoch ' + str(it) + update)
        it += 1

    return losses_train, losses_val, aucs_val
        


 ### 4.2 gradient boosting machine

In [0]:
def xgboost_train(train_features, train_labels, xgb_seed):
    """
    fit a gradient boosting classifier on one-hot encoded features.

    the categorical features (module-level `cat_vars`) are converted to
    strings and dummy-encoded with the first level of each dropped.

    Returns
    -------
    (fitted XGBClassifier, DataFrame with columns 'var' and 'imp' holding the
    encoded column names and the model's feature importances)
    """
    as_strings = train_features.astype({v: 'str' for v in cat_vars})
    encoded = pd.get_dummies(as_strings, columns=cat_vars, drop_first=True)

    model = XGBClassifier(seed=xgb_seed)
    model.fit(encoded, train_labels)

    df_vi = pd.DataFrame({'var': list(encoded), 'imp': model.feature_importances_})
    return model, df_vi



In [0]:
def xgboost_test(model, test_features):
    """
    predict the probability of the positive class with a fitted xgboost model.

    the features are one-hot encoded and then aligned to the columns the model
    was trained on. encoding each data set independently with drop_first=True
    is only safe when every category level occurs in every set: a level
    missing from `test_features` would otherwise drop or shift columns. levels
    never seen in training are ignored; training levels absent here are filled
    with 0.

    Parameters
    ----------
    model: fitted XGBClassifier
    test_features: pandas DataFrame with the same feature columns as training

    Returns
    -------
    1-d array, predicted probability of class 1 for each row
    """
    vf = test_features.copy()

    for v in cat_vars:
        vf[v] = vf[v].astype('str')

    trained_cols = model.get_booster().feature_names
    if trained_cols is None:
        # model was not fit on named columns; nothing to align against
        vf = pd.get_dummies(vf, columns=cat_vars, drop_first=True)
    else:
        # encode every level, then keep exactly the training columns in order
        vf = pd.get_dummies(vf, columns=cat_vars).reindex(columns=list(trained_cols), fill_value=0)

    guess = model.predict_proba(vf) #compare it to test_target_cat

    return guess[:,1]


 ### 4.3 logistic regression

 standard encoding

In [0]:
def lr_fit(train_features, train_labels):
    """
    fit a logistic regression (saga solver, up to 9000 iterations) on the
    features as given, i.e. with categorical features as integer codes.

    Returns
    -------
    (fitted model, dict with 'names': feature column names and
    'vals': the model's coef_ array)
    """
    clf = LogisticRegression(solver='saga', max_iter=9000)
    clf.fit(train_features, train_labels)
    coeffs = {'names': list(train_features), 'vals': clf.coef_}
    return clf, coeffs

def lr_test(model, test_features):
    """
    return the second column of model.predict_proba(test_features), i.e. the
    predicted probability of the positive class for each row.
    """
    return model.predict_proba(test_features)[:, 1]


 one-hot encoding

In [0]:
def lr_fit_onehot(train_features, train_labels):
    """
    fit a logistic regression on one-hot encoded features.

    the categorical features (module-level `cat_vars`) are converted to
    strings and dummy-encoded with the first level of each dropped.

    Returns
    -------
    (fitted model, dict with 'names': the encoded column names and
    'vals': the model's coef_ array). the names are taken from the encoded
    frame so that they line up one-to-one with the coefficients.
    """
    tf = train_features.copy()

    for v in cat_vars:
        tf[v] = tf[v].astype('str')

    tf = pd.get_dummies(tf, columns = cat_vars, drop_first=True)

    coeffs = {}

    clf = LogisticRegression(solver='saga', max_iter=9000)
    clf.fit(tf, train_labels)
    # use the columns the model was actually fit on, not the pre-encoding ones
    coeffs['names'] = list(tf)
    coeffs['vals'] = clf.coef_

    return clf, coeffs

def lr_test_onehot(model, test_features):
    """
    predict the probability of the positive class with the one-hot logistic
    regression.

    when the model exposes the column names it was fit on
    (`feature_names_in_`), the encoded features are aligned to them: levels
    never seen in training are ignored and training levels absent here are
    filled with 0. without that attribute the features are encoded
    independently with drop_first=True, which is only correct when every
    category level occurs in `test_features`.

    Returns
    -------
    1-d array, predicted probability of class 1 for each row
    """
    tf = test_features.copy()

    for v in cat_vars:
        tf[v] = tf[v].astype('str')

    trained_cols = getattr(model, 'feature_names_in_', None)
    if trained_cols is None:
        tf = pd.get_dummies(tf, columns = cat_vars, drop_first=True)
    else:
        # encode every level, then keep exactly the training columns in order
        tf = pd.get_dummies(tf, columns = cat_vars).reindex(columns=list(trained_cols), fill_value=0)

    guess_lr = model.predict_proba(tf) #compare it to test_target_cat
    return guess_lr[:,1]


 ### 4.4 utilities for running models and processing results

In [0]:
def run_models(outcome_var, outcome_set, output_path_modifier, xgb_seed):
    """
    train and evaluate every model for one outcome and save the predictions.

    fits the neural net (with early stopping), the gradient boosting machine
    and both logistic regressions, scores each on the original training,
    validation and testing populations, and writes one csv of predictions per
    population so they can be evaluated in R.

    output directories are created if missing; files from an earlier run for
    the same outcome and `output_path_modifier` are overwritten.

    Parameters
    ----------
    outcome_var: str
      name of the outcome column; also used in output paths.
    outcome_set: str
      name of the column assigning each row to an analysis population.
    output_path_modifier: str
      suffix for the result file names.
    xgb_seed: int
      seed for the gradient boosting machine.

    Returns
    -------
    (validation results frame, testing results frame, ann validation loss,
    ann testing loss, xgb variable importances, lr coefficients)
    """
    # use separate subdirectories for leak and clot results
    # path to best ANN in training and allows us to save and recover it once training loop ends
    PATH_MODEL = f'results/best_ann_{outcome_var}/'
    # makedirs with exist_ok: also creates 'results/' and tolerates a re-run
    os.makedirs(PATH_MODEL, exist_ok=True)
    BEST_MODEL = f'{PATH_MODEL}ANN_{outcome_var}.pt'

    # path to  results
    PATH_RESULTS = f'results/study_models_{outcome_var}/'
    os.makedirs(PATH_RESULTS, exist_ok=True)

    #import data
    print('IMPORT DATA')
    data = import_data(outcome_var, outcome_set)

    #ann - train
    print('\n')
    print('TRAIN NEURAL NET')
    ann_training = prep_train_data(data['training'], cat_vars, con_vars, outcome_var)
    ann_training_performance = prep_test_val_data(data['training_original'], cat_vars, con_vars, outcome_var)
    ann_validate = prep_test_val_data(data['validation'], cat_vars, con_vars, outcome_var)
    ann_testing = prep_test_val_data(data['testing'], cat_vars, con_vars, outcome_var)

    ann = get_net(data['training'], cat_vars, con_vars)
    ann_losses_training, ann_losses_validate, ann_aucs_validate = train_and_val(ann, ann_training, ann_validate, BEST_MODEL, max_its = 900, patience = 100, min_its = 30)

    # recover best ann: a fresh net loaded with the weights saved at the best epoch
    ann_best = get_net(data['training'], cat_vars, con_vars)
    ann_best.load_state_dict(torch.load(BEST_MODEL))

    # plot ann training and validation losses
    print('\n')
    print('NEURAL NET - TRAINING LOSS')
    plt.plot(ann_losses_training)
    plt.title('training loss', fontdict=None, loc='center', pad=None)
    plt.show()

    print('\n')
    print('NEURAL NET - VALIDATION LOSS')
    plt.plot(ann_losses_validate)
    plt.title('validation loss', fontdict=None, loc='center', pad=None)
    plt.show()

    print('\n')
    print('NEURAL NET - VALIDATION AUCS')
    plt.plot(ann_aucs_validate)
    plt.title('validation AUCs', fontdict=None, loc='center', pad=None)
    plt.show()

    # evaluate ann against original training, validation and test data
    ann_train_results = ann_test(ann_best, nn.BCELoss(), ann_training_performance)
    ann_valid_results = ann_test(ann_best, nn.BCELoss(), ann_validate)
    ann_test_results = ann_test(ann_best, nn.BCELoss(), ann_testing)
    print('\n')
    print('NEURAL NET - done')

    # xgb: fit on the oversampled training data, score on the original populations
    xgb_model, xgb_var_imp = xgboost_train(data['train_features'], data['train_labels'], xgb_seed)
    xgb_train_results = xgboost_test(xgb_model, data['train_original_features'])
    xgb_valid_results = xgboost_test(xgb_model, data['valid_features'])
    xgb_testing_results = xgboost_test(xgb_model, data['testing_features'])
    print('\n')
    print('GRADIENT BOOSTING MACHINE - done')

    # lr
    lr_model, lr_var_coeffs = lr_fit(data['train_features'], data['train_labels'])
    lr_train_results = lr_test(lr_model, data['train_original_features'])
    lr_valid_results = lr_test(lr_model, data['valid_features'])
    lr_testing_results = lr_test(lr_model, data['testing_features'])
    print('\n')
    print('LOGISTIC REGRESSION - done')

    # lr - onehot version
    lr_onehot_model, lr_onehot_var_coeffs = lr_fit_onehot(data['train_features'], data['train_labels'])
    lr_onehot_train_results = lr_test_onehot(lr_onehot_model, data['train_original_features'])
    lr_onehot_valid_results = lr_test_onehot(lr_onehot_model, data['valid_features'])
    lr_onehot_testing_results = lr_test_onehot(lr_onehot_model, data['testing_features'])
    print('\n')
    print('ONE HOT LOGISTIC REGRESSION - done')

    # dfs to hold results
    df_results_train = pd.DataFrame()
    df_results_valid = pd.DataFrame()
    df_results_test = pd.DataFrame()

    # labels
    df_results_train['targs'] = data['train_original_labels']
    df_results_valid['targs'] = data['valid_labels']
    df_results_test['targs'] = data['testing_labels']

    # store results in dataframes
    df_results_train['ann'] = ann_train_results['preds']
    df_results_valid['ann'] = ann_valid_results['preds']
    df_results_test['ann'] = ann_test_results['preds']
    df_results_train['xgb'] = xgb_train_results
    df_results_valid['xgb'] = xgb_valid_results
    df_results_test['xgb'] = xgb_testing_results
    df_results_train['lr'] = lr_train_results
    df_results_valid['lr'] = lr_valid_results
    df_results_test['lr'] = lr_testing_results
    df_results_train['lr_onehot'] = lr_onehot_train_results
    df_results_valid['lr_onehot'] = lr_onehot_valid_results
    df_results_test['lr_onehot'] = lr_onehot_testing_results

    # save results - so that they can be evaluated in R
    df_results_train.to_csv(f'{PATH_RESULTS}{outcome_var}_train_{output_path_modifier}.csv')
    df_results_valid.to_csv(f'{PATH_RESULTS}{outcome_var}_valid_{output_path_modifier}.csv')
    df_results_test.to_csv(f'{PATH_RESULTS}{outcome_var}_test_{output_path_modifier}.csv')

    print('\n')
    print('SAVE RESULTS - done')
    return df_results_valid, df_results_test, ann_valid_results['loss'], ann_test_results['loss'], xgb_var_imp, lr_var_coeffs



In [0]:
def generate_results_roc(y_test, y_score):
    """
    plot the ROC curve for one set of predictions and print its AUC.

    the axes are zoomed to false positive rates up to 0.2 and true positive
    rates up to 0.6.

    Parameters
    ----------
    y_test: array-like of true binary labels
    y_score: array-like of predicted scores
    """
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    # format the AUC into the label (the format string was previously
    # concatenated with the number instead of applied to it)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, .2])
    plt.ylim([0.0, .6])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    # the label is only shown once a legend is drawn
    plt.legend(loc='lower right')
    plt.show()
    print('AUC: ' , roc_auc)

def generate_results_pr(y_test, y_score):
    """
    plot the precision-recall curve for one set of predictions.

    the x axis (recall) is zoomed to values up to 0.2.

    Parameters
    ----------
    y_test: array-like of true binary labels
    y_score: array-like of predicted scores
    """
    # neither name is imported at the top of the notebook; import here so the
    # function does not fail with a NameError
    from inspect import signature
    from sklearn.metrics import precision_recall_curve

    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.figure()
    # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
    step_kwargs = ({'step': 'post'}
                   if 'step' in signature(plt.fill_between).parameters
                   else {})
    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

    plt.xlabel('True Positive Rate')
    plt.ylabel('Positive Predictive Value')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 0.2])
    plt.title('Precision-recall curve')
    plt.show()

def compare_rocs(results_frame, list_y_preds, y_targets):
    """
    overlay the ROC curves of several models and print each AUC.

    Parameters
    ----------
    results_frame: pandas DataFrame
      holds the true labels and one column of predictions per model.
    list_y_preds: list of str
      names of the prediction columns to compare.
    y_targets: str
      name of the column holding the true labels.
    """
    plt.figure(figsize=(20,10))
    for r in list_y_preds:
        fpr, tpr, _ = roc_curve(results_frame[y_targets], results_frame[r])
        roc_auc = auc(fpr, tpr)
        print(r + ' AUC: ' , roc_auc)
        # label each curve so the models can be told apart in the figure
        plt.plot(fpr, tpr, label='%s (AUC = %0.3f)' % (r, roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    if list_y_preds:
        plt.legend(loc='lower right')
    plt.show()


 ## 5. train and test models

 ### 5.1 leak

In [0]:
# column names for the leak analysis: the outcome itself and the column that
# assigns each row to an analysis population for this outcome
leak_outcome = 'LEAK'
leak_set = 'LEAK_SET'



In [0]:
# train and evaluate all models for the leak outcome; result files are tagged 'FINAL'.
# keeps the validation/testing prediction frames, the neural net losses, the
# xgb variable importances and the (standard-encoding) lr coefficients.
df_leak_res_val, df_leak_res_test, ann_leak_valid_loss, ann_leak_test_loss, xgb_var_imp_leak, lr_var_imp_leak = run_models(leak_outcome, leak_set, 
                                                                                        'FINAL', 
                                                                                        xgb_seed_leak)


 compare validation and testing results for neural net

In [0]:
# sanity check: the neural net's validation and testing losses should be similar
print('compare artificial neural net validation and testing loss to ensure they are reasonably close')
print('* validation loss = ' + str(ann_leak_valid_loss))
print('* testing loss = ' + str(ann_leak_test_loss))


 preliminary comparison of results - final results processed in R for easier statistical comparison of AUCs.

In [0]:
# leak - ROC curves and AUCs of all four models on the validation population
compare_rocs(df_leak_res_val, ['ann', 'xgb', 'lr', 'lr_onehot'], 'targs')



In [0]:
# leak - ROC curves and AUCs of all four models on the testing population
compare_rocs(df_leak_res_test, ['ann', 'xgb', 'lr', 'lr_onehot'], 'targs')


 variable importance

 ### 5.2 clot

In [0]:
# column names for the clot analysis: the outcome itself and the column that
# assigns each row to an analysis population for this outcome
clot_outcome = 'CLOT'
clot_set = 'CLOT_SET'



In [0]:
# train and evaluate all models for the clot outcome; result files are tagged 'FINAL'.
# keeps the validation/testing prediction frames, the neural net losses, the
# xgb variable importances and the (standard-encoding) lr coefficients.
df_clot_res_val, df_clot_res_test, ann_clot_valid_loss, ann_clot_test_loss, xgb_var_imp_clot, lr_var_imp_clot = run_models(clot_outcome, 
                                                                                        clot_set, 'FINAL', 
                                                                                        xgb_seed_clot)


 same as above, some preliminary analysis. first, compare neural net validation and testing results. then, compare various models. again, we process final results in R which has very nice tools for statistical comparison of AUCs.

In [0]:
# sanity check: the neural net's validation and testing losses should be similar
print('compare artificial neural net validation and testing loss to ensure they are reasonably close')
print('* validation loss = ' + str(ann_clot_valid_loss))
print('* testing loss = ' + str(ann_clot_test_loss))



In [0]:
# clot - ROC curves and AUCs of all four models on the validation population
compare_rocs(df_clot_res_val, ['ann', 'xgb', 'lr', 'lr_onehot'], 'targs')



In [0]:
# clot - ROC curves and AUCs of all four models on the testing population
compare_rocs(df_clot_res_test, ['ann', 'xgb', 'lr',  'lr_onehot'], 'targs')


 ### variable importance

In [0]:
#this is just a dict that helps format variable names for figures
# keys are '<feature> <category>' for categorical features (as built in
# get_variable_importance) or the bare feature name for continuous ones.
# corrected labels: 'RENAL_INSUFFICIENCY Yes' (was labelled 'No') and
# 'HTN_MEDS 1' (was labelled '0'); a duplicated race_PUF key was removed.
var_imp_name_mapper = {'VENOUS_STASIS Yes': 'Venous stasis: Yes',
                       'FUNSTATPRESURG Partially Dependent': 'Functional status: Partially Dependent',
                       'HIP Yes': 'HTN: Yes',
                       'COPD Yes': 'COPD: Yes',
                       'SMOKER Yes': 'Smoker: Yes',
                       'race_PUF White': 'Race: White',
                       'DIABETES Insulin': 'Diabetes: Insulin-dependent',
                       'BMI_DELTA': 'Change in BMI',
                       'race_PUF Black or African American': 'Race: Black or African American',
                       'race_PUF American Indian or Alaska Native': 'Race: American Indian or Alaska Native',
                       'BMI_CONSOL': 'Preop BMI',
                       'ASSISTANT_TRAINING_LEVEL Minimally Invasive Surgery Fellow': '1st asst. training: MIS Fellow',
                       'CHRONIC_STEROIDS Yes': 'Chronic steroids: Yes',
                       'MOBILITY_DEVICE Yes': 'Limited ambulation: Yes',
                       'hispanic Yes': 'Hispanic ethnicity: Yes',
                       'PCARD Yes': 'Previous cardiac surgery: Yes',
                       'ASACLASS 2-Mild Disturb': 'ASA class: 2',
                       'DIABETES Non-Insulin': 'Diabetes: No insulin',
                       'ASSISTANT_TRAINING_LEVEL Resident (PGY 1-5+)': '1st asst. training: Resident',
                       'IVCF No': 'IVCF: No',
                       'IVCF IVC filter placed in anticipation of the metabolic or bariatric procedure': 'IVCF: Placed for surgery',
                       'hispanic No': 'Hispanic ethnicity: No',
                       'THERAPEUTIC_ANTICOAGULATION Yes': 'Anticoagulation: Yes',
                       'ASSISTANT_TRAINING_LEVEL Attending - Other': '1st asst. training: attending',
                       'SLEEP_APNEA Yes': 'Sleep apnea: Yes',
                       'HTN_MEDS 2': 'No. anti-HTN: 2',
                       'HGTCM': 'Height (cm)',
                       'AGE_CONSOL': 'Age',
                       'FUNSTATPRESURG Totally Dependent':'Functional status: Dependent',
                       'ALBUMIN': 'Albumin',
                       'SEX Male': 'Sex: Male',
                       'HCT': 'Hematocrit',
                       'race_PUF Native Hawaiian or Other Pacific Islander': 'Race: Native Hawaiian or Other Pacific Islander',
                       'HISTORY_PE Yes': 'History of PE: Yes',
                       'HISTORY_DVT Yes' :'History of DVT: Yes',
                       'OXYGEN_DEPENDENT Yes' : 'Oxygen dependent: Yes',
                       'MI_ALL_HISTORY Yes':'History of MI: Yes',
                       'GERD Yes': 'GERD: Yes',
                       'ASSISTANT_TRAINING_LEVEL Attending - Weight Loss Surgeon': '1st asst. training: weight loss surgeon',
                       'CPT 43644': 'Procedure: Bypass',
                       'WGTKG': 'Weight (kg)',
                       'OPYEAR 2016': 'Year: 2016',
                       'IVCF IVC filter was pre-existing': 'IVCF: Pre-existing',
                       'PTC Yes' : 'Previous PCI or angioplasty',
                       'RENAL_INSUFFICIENCY Yes': 'Renal insufficiency: Yes',
                       'OPYEAR 2017':'Year: 2017',
                       'ASACLASS 1-No Disturb': 'ASA class: 1',
                       'ASACLASS 4-Life Threat': 'ASA class: 4',
                       'ASACLASS 3-Severe Disturb': 'ASA class: 3',
                       'HTN_MEDS 3+': 'No. anti-HTN: 3+',
                       'ASSISTANT_TRAINING_LEVEL Physician Assistant/Nurse Practitioner/Registered Nurse First Assist': '1st asst. training: PA/NP/RN ',
                       'HYPERLIPIDEMIA Yes': 'HLD: Yes',
                       'HTN_MEDS 1': 'No. anti-HTN: 1',
                       'DIALYSIS Yes': 'Dialysis: Yes'}



In [0]:
def get_variable_importance(vardata, outcome_path):
    """
    keep the features with non-zero importance, add relative importance and
    display names, and save the table to csv.

    Parameters
    ----------
    vardata: pandas DataFrame
      columns 'var' (encoded feature name) and 'imp' (importance), as
      returned by xgboost_train.
    outcome_path: str
      outcome name used in the output path
      'results/study_models_<outcome_path>/<outcome_path>_xgb_var_imp.csv'.

    Returns
    -------
    the filtered frame with added columns 'relative_imp' (importance divided
    by the largest importance) and 'names' (display name; the encoded name
    itself when no display name is configured).
    """
    def _display_name(v):
        # one-hot columns are '<feature>_<code>'. split on the last
        # underscore so codes with more than one character (e.g. '10', '-1')
        # are parsed correctly.
        base, sep, code = v.rpartition('_')
        if sep and base in cat_vars:
            try:
                rank = int(code)
            except ValueError:
                rank = None
            # a negative code would silently index cat_ord from the end
            if rank is not None and 0 <= rank < len(cat_ord[base]):
                key = base + ' ' + str(cat_ord[base][rank])
                return var_imp_name_mapper.get(key, key)
        # continuous feature, or a name that cannot be decoded: fall back to
        # the raw name rather than fail after the models have been trained
        return var_imp_name_mapper.get(v, v)

    vardata_filtered = vardata[vardata['imp'] != 0].copy()
    vardata_filtered['relative_imp'] = vardata_filtered['imp'] / vardata_filtered['imp'].max()
    vardata_filtered['names'] = [_display_name(v) for v in vardata_filtered['var']]
    vardata_filtered.to_csv(f'results/study_models_{outcome_path}/{outcome_path}_xgb_var_imp.csv')
    return vardata_filtered



In [0]:
# format and save the xgb variable importances for each outcome
# (written into the results/study_models_<outcome>/ directory)
var_imp_leak = get_variable_importance(xgb_var_imp_leak, 'LEAK')
var_imp_clot = get_variable_importance(xgb_var_imp_clot, 'CLOT')

