In [None]:
import pickle
import pandas as pd
import numpy as np
import shap
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report, auc, roc_curve, precision_recall_curve
from statsmodels.stats.diagnostic import linear_reset
import statsmodels.api as sm
from scipy import stats

In [None]:
# Load the model and get the data for the app.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open bundles that come from a trusted source.
fpath = "./models/xgb_classifier_train_test_without_specialty_5y.pkl"
with open(fpath, "rb") as open_file:
    # Unpack the 5-tuple directly instead of binding it to a name called
    # `vars`, which would shadow the Python builtin for the rest of the session.
    classify_xgb, X_train, y_train, X_train_ids, train_ids = pickle.load(open_file)
# Author's note: x_train and y_train should just be x and y.

# Hard class predictions and class probabilities from the loaded classifier on X_train.
y_train_pred = classify_xgb.predict(X_train)
y_train_pred_prob = classify_xgb.predict_proba(X_train)

In [None]:
# Maps raw model column names to the human-readable labels used on plots.
# Columns that are absent from this mapping keep their raw name downstream.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overwritten by its last occurrence.
feature_name_dict = {
    "tenure": "Tenure",
    "age_group": "Age group",
    "EWA_avg_risk_avg": "Exp. weighted panel complexity",
    "EWA_avg_note_quality_manual_value": "Exp. weighted note quality",
    "EWA_avg_note_quality_contribution_value": "Exp. weighted note quality contribution",
    "note_quality_manual_value": "Note quality",
    "panel_cnt": "Panel count",
    "risk_avg": "Panel complexity",
    "EWA_avg_teamwork_on_inbox_value": "Exp. weighted teamwork on inbox - value",
    "r_slope_panel_cnt": "Roll. slope panel count",
    "teamwork_on_inbox_value": "Teamwork on inbox - value",
    "gender": "Gender",
    "calendar_month": "Calendar month",
    "covid_wave": "Covid wave",
    "patient_volume": "Patient volume",
    "physician_demand": "Physician demand",
    "EWA_avg_order_time_8": "Exp. weighted order time",
    "EWA_avg_wow_time_8": "Exp. weighted work outside of work time",
    "EWA_avg_physician_demand": "Exp. weighted physician demand",
    "EWA_avg_ib_time_8": "Exp. weighted inbox time",
    "EWA_avg_note_time_8": "Exp. weighted note time",
    "EWA_avg_ehr_time_8": "Exp. weighted EHR time",
    "r_slope_wow_time_8": "Roll. slope work outside of work time",
    "EWA_avg_patient_volume": "Exp. weighted patient volume",
}

In [None]:
# Build a tree explainer for the loaded classifier.
# Alternative configuration kept for reference:
# explainer = shap.TreeExplainer(classify_xgb, feature_perturbation='tree_path_dependent', model_output="raw")
explainer = shap.TreeExplainer(classify_xgb)

# Per-sample SHAP values and pairwise SHAP interaction values on X_train.
shap_values = explainer(X_train)
shap_ival = explainer.shap_interaction_values(X_train)

# all features summarized - figure 2
# Work on copies so that relabelling features does not touch the originals.
dc_shap_obj = deepcopy(shap_values)
dc_shap_obj.feature_names = [
    feature_name_dict.get(raw_name, raw_name)
    for raw_name in dc_shap_obj.feature_names
]
dc_shap_ival_obj = deepcopy(shap_ival)

### Tenure Tests
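Here we fit an OLS model relating the values of one feature (e.g. `tenure`) to that feature's SHAP values and test the fit for linearity. We then check how the SHAP values correlate with a second, interaction feature (e.g. `EWA_avg_ehr_time_8`) at each value of the first feature.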

In [None]:
class shap_quant:
    """Quantify how one feature's SHAP values relate to the feature and to a second feature.

    On construction, an OLS model of the main feature's SHAP values on the
    main feature's values is fitted (see ``fit_ols``). The other methods run
    a linearity test on that fit and print per-level Spearman correlations.

    Parameters
    ----------
    shap_obj
        Object indexed as ``shap_obj[:, display_name].values`` to obtain the
        SHAP values of one feature (presumably a ``shap.Explanation`` whose
        feature names were already mapped to display names - confirm with caller).
    shap_ival_obj
        Array indexed as ``[:, i, j]`` to obtain the SHAP interaction values
        between the features at column positions ``i`` and ``j`` of ``df``.
    df
        DataFrame holding the raw feature values, one column per feature.
    feature_name_dict
        Mapping from raw column name to display name; names missing from the
        mapping are used unchanged.
    main_feat
        Raw column name of the feature under study.
    interaction_feat
        Raw column name of the feature it is compared against.
    """

    def __init__(self, shap_obj, shap_ival_obj, df, feature_name_dict, main_feat, interaction_feat):
        self.shap_obj = shap_obj
        self.df = df
        self.feature_name_dict = feature_name_dict
        self.main_feat = main_feat
        self.interaction_feat = interaction_feat
        # Display names are what shap_obj is indexed by; fall back to the raw name.
        self.SHO_main_feat = feature_name_dict.get(main_feat, main_feat)
        self.SHO_interaction_feat = feature_name_dict.get(interaction_feat, interaction_feat)
        self.shap_ival_obj = shap_ival_obj
        self.fit_ols()

    def fit_ols(self):
        """Fit ``shap_value ~ const + feature_value`` by OLS and cache the arrays used.

        Sets ``shap_value``, ``feature_value``, ``interaction_value`` and
        ``model_ols_fit`` on the instance and returns the fitted results.
        Rows with missing values are dropped by statsmodels (``missing="drop"``).
        """
        self.shap_value = self.shap_obj[:, self.SHO_main_feat].values
        self.feature_value = self.df[self.main_feat].to_numpy()
        self.interaction_value = self.df[self.interaction_feat].to_numpy()
        # statsmodels' signature is OLS(endog, exog): the response (SHAP value)
        # comes first and the regressor (feature value) second. statsmodels does
        # not add an intercept on its own, so one is added explicitly; without
        # it the linearity test would be run on a regression through the origin.
        design = sm.add_constant(self.feature_value)
        model_ols = sm.OLS(self.shap_value, design, missing="drop")
        self.model_ols_fit = model_ols.fit()
        return self.model_ols_fit

    def is_linear(self):
        """Run statsmodels' ``linear_reset`` on the cached OLS fit, print and return the result."""
        res = linear_reset(self.model_ols_fit)
        print('Linear test: ', res)
        return res

    def _numeric_feature_levels(self):
        """Return the distinct, non-NaN numeric values taken by the main feature."""
        return [
            level
            for level in np.unique(self.feature_value).tolist()
            if isinstance(level, (int, float)) and not np.isnan(level)
        ]

    def sv_iv_correlation(self):
        """Print, per main-feature value, the Spearman correlation of SHAP value vs. interaction feature value.

        With sv = SHAP value of the main feature, fv = main feature value and
        iv = interaction feature value: for every distinct fv, the rows with
        that fv are selected and ``spearmanr(sv, iv)`` is printed.

        Spearman's coefficient is computed on ranks (lowest value = 1, highest
        = n), so it measures how well the relationship can be described by a
        monotonic function, not only by a straight line: a positive coefficient
        means that as one variable goes up the other tends to go up, a negative
        one that the other tends to go down.
        """
        for level in self._numeric_feature_levels():
            at_level = self.feature_value == level
            print(f'for value {level}', stats.spearmanr(self.shap_value[at_level], self.interaction_value[at_level]))

    def isv_ifv_correlation(self):
        """Print, per main-feature value, the Spearman correlation of interaction SHAP value vs. interaction feature value.

        The interaction SHAP value (the y coordinate of the scatter plot drawn
        at the end) is compared with the interaction feature value for a given
        main-feature value (e.g. a given tenure). Column positions are looked
        up in ``self.df`` so the method does not depend on notebook globals.
        """
        main_pos = self.df.columns.get_loc(self.main_feat)
        inter_pos = self.df.columns.get_loc(self.interaction_feat)
        isv = self.shap_ival_obj[:, main_pos, inter_pos]  # isv: interaction SHAP value
        for level in self._numeric_feature_levels():
            at_level = self.feature_value == level
            # interaction_value holds the interaction feature value (ifv)
            print(f'for value {level}', stats.spearmanr(isv[at_level], self.interaction_value[at_level]))
        plt.scatter(self.feature_value, isv)
        plt.xlabel(self.SHO_main_feat)
        plt.ylabel(f'SHAP interaction value ({self.SHO_main_feat} x {self.SHO_interaction_feat})')

In [None]:
# Main feature: tenure; compared against exp. weighted EHR time.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='tenure',
    interaction_feat='EWA_avg_ehr_time_8',
)
sq.is_linear()
sq.sv_iv_correlation()

In [None]:
# Main feature: exp. weighted panel complexity; compared against exp. weighted physician demand.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='EWA_avg_risk_avg',
    interaction_feat='EWA_avg_physician_demand',
)
sq.is_linear()
sq.sv_iv_correlation()

In [None]:
# Main feature: age group; compared against exp. weighted EHR time.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='age_group',
    interaction_feat='EWA_avg_ehr_time_8',
)
sq.is_linear()
sq.sv_iv_correlation()

In [None]:
# Main feature: exp. weighted physician demand; compared against panel count.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='EWA_avg_physician_demand',
    interaction_feat='panel_cnt',
)
sq.is_linear()
sq.sv_iv_correlation()

In [None]:
# NOTE(review): this cell uses the same (main, interaction) pair as the cell
# directly before it - 'EWA_avg_physician_demand' vs 'panel_cnt'. It looks like
# a copy-paste leftover; confirm whether a different pair was intended.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='EWA_avg_physician_demand',
    interaction_feat='panel_cnt',
)
sq.is_linear()
sq.sv_iv_correlation()

### Let's look at the interaction SHAP values and the interaction feature values

In [None]:
# Interaction SHAP values of tenure x exp. weighted EHR time.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='tenure',
    interaction_feat='EWA_avg_ehr_time_8',
)
sq.is_linear()
sq.isv_ifv_correlation()

In [None]:
# Interaction SHAP values of tenure x exp. weighted inbox time.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='tenure',
    interaction_feat='EWA_avg_ib_time_8',
)
sq.is_linear()
sq.isv_ifv_correlation()

In [None]:
# Interaction SHAP values of tenure x exp. weighted order time.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='tenure',
    interaction_feat='EWA_avg_order_time_8',
)
sq.is_linear()
sq.isv_ifv_correlation()

In [None]:
# Interaction SHAP values of tenure x exp. weighted note time.
sq = shap_quant(
    shap_obj=dc_shap_obj,
    shap_ival_obj=dc_shap_ival_obj,
    df=X_train,
    feature_name_dict=feature_name_dict,
    main_feat='tenure',
    interaction_feat='EWA_avg_note_time_8',
)
sq.is_linear()
sq.isv_ifv_correlation()