## Retention study analysis

In [None]:
 import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from matplotlib import pyplot as plt
import shap
from copy import deepcopy
import xgboost as xgb
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report, auc, roc_curve, precision_recall_curve, f1_score
import matplotlib.pyplot as plt
import plotly.express as px

SEED = 0
np.random.seed(SEED)
pd.set_option('display.max_columns', None)

## Data set preparation and XGBoost fit

The data set is built with `retention_study_data_set_gen.ipynb`.

Create a train/validation/test split of the data. Right now we use a single validation set, but we could expand to k-fold cross-validation to tune hyperparameters such as the scale positive weight or the prediction lead time. Tuning the lead time would require building multiple versions of the data set.



### Variable nomenclature

Summary, where `*` is one of train/test/val, N = number of month observations, F = number of features, and P = number of physicians:

| Variable | Description|
|----------|-------------|
| X_*       | (N x F) DataFrame of features|
| y_*       |(N x 1) Series of binary depart within interval|
| X_*_ids   |(N x 1) Series of physician ids for each month|
| *_ids     |(P x 2) DataFrame with column 0 = physician id and column 1 = binary flag for departure within the study |
| y_*_pred  |(N x 1) Series of binary predictions |
| y_*_pred_prob | (N x 2) array of class probabilities from `predict_proba` (column 1 = probability of departure) |

In [None]:
ehr_data = pd.read_pickle('./data/processed/turbo_7_29_22_deid_processed_3_ROUND_5y_TENURE_NO_STUDYDAY.pkl')
ehr_data = ehr_data.drop(['provtype_Physician','reportingperiodstartdate'],axis=1)

In [None]:
ehr_data

In [None]:
mask = (ehr_data['specialty_Family Medicine']==0) & (ehr_data['specialty_Internal Medicine']==0) & (ehr_data['specialty_Pediatrics']==0)

In [None]:
# Apply mask to the data so we can NaN the panel count and panel complexity on specialty that is not
# specialty_Family, specialty_Internal, specialty_Pediatrics
ehr_data.loc[mask, 'panel_cnt'] = np.nan
ehr_data.loc[mask, 'risk_avg'] = np.nan

In [None]:
categorical_cols = [
    'physician_id',
    'age_group',
    'gender',
    'departure_in_interval',
    'calendar_month',
    'covid_wave'
]

In [None]:
# One row per physician: the id and whether any of that physician's months is
# flagged as a departure month. Both columns are read from the SAME grouped
# result so every flag is guaranteed to sit next to its own id (taking the ids
# from `unique()` and the flags from a sorted groupby pairs them up wrongly
# whenever the rows are not already ordered by physician_id).
# sort=False keeps the ids in order of first appearance, as `unique()` did.
depart_by_id = ehr_data.groupby('physician_id', sort=False)['departure_in_interval'].any()
all_ids = pd.DataFrame({
    'id':  depart_by_id.index.to_numpy(),
    'depart': depart_by_id.tolist()
})

In [None]:
# make train, test, validation
#   - 0.33 test set
#   - 0.25 validation
#   - perform on IDs to allow for stratification
'''test_split = 0.33
val_split = 0.25
val_n = 10 # in case we want to tune hyperparas

test_sss = StratifiedShuffleSplit(n_splits = 1, test_size = test_split, random_state = 0)
val_sss = StratifiedShuffleSplit(n_splits = val_n, test_size = val_split, random_state = 0)'''

In [None]:
# train+val / test split
'''tv_idxs, test_idxs = next(test_sss.split(all_ids[['id']],all_ids['depart']))
tv_ids, test_ids = all_ids.iloc[tv_idxs,:], all_ids.iloc[test_idxs,:]

# train / val split
train_idxs, val_idxs = next(val_sss.split(tv_ids[['id']],tv_ids['depart']))
train_ids, val_ids = tv_ids.iloc[train_idxs,:], tv_ids.iloc[val_idxs,:]'''

# train_id_status = id_status[id_status['id'].isin(pd.Series(train_ids))]
# val_id_status = id_status[id_status['id'].isin(pd.Series(val_ids))]

In [None]:
'''
assert ~any(tv_ids.id.isin(test_ids.id)), 'Bad train test split'
assert ~any(train_ids.id.isin(val_ids.id)), 'Bad train val split'
assert ~any(val_ids.id.isin(test_ids.id)), 'Bad val test split'
'''

In [None]:
'''X = ehr_data.drop(['departure_in_interval'], axis=1).copy()
y = ehr_data['departure_in_interval'].copy()'''

In [None]:
# test_flag = X['physician_id'].isin(test_ids['id'])
# val_flag= X['physician_id'].isin(val_ids['id'])
# train_flag= X['physician_id'].isin(train_ids['id'])

# X_test,y_test = X[test_flag], y[test_flag]
# X_val,y_val = X[val_flag], y[val_flag]
# X_train,y_train = X[train_flag], y[train_flag]

# # extract the ids for double checking, drop from X
# X_test_ids = X_test.pop('physician_id')
# X_val_ids = X_val.pop('physician_id')
# X_train_ids = X_train.pop('physician_id')

In [None]:
'''# positive fractions
print(sum(ehr_data.departure_in_interval)/len(ehr_data))
print(sum(y_test)/len(y_test))
print(sum(y_val)/len(y_val))
print(sum(y_train)/len(y_train))'''

In [None]:
'''print(f'num depart months: {sum(y)}')
print(f'total months: {len(y)}')'''

In [None]:
ID_train, ID_test = train_test_split(ehr_data['physician_id'].unique(),
                                                test_size=0.2, random_state=SEED)

In [None]:
assert len(set(ID_train).intersection(set(ID_test))) == 0, 'Bad split'

In [None]:
X_train = ehr_data[ehr_data['physician_id'].isin(ID_train)].drop(['departure_in_interval', 'physician_id'], axis=1)
X_test = ehr_data[ehr_data['physician_id'].isin(ID_test)].drop(['departure_in_interval', 'physician_id'], axis=1)
y_train = ehr_data[ehr_data['physician_id'].isin(ID_train)]['departure_in_interval']
y_test = ehr_data[ehr_data['physician_id'].isin(ID_test)]['departure_in_interval']



In [None]:
X_train.columns

In [None]:
# Assemble the artefacts that are pickled together with the model.
# Work on a copy: `pop` removes the column from its frame in place, so popping
# from `ehr_data` directly would strip the label and id columns from the shared
# frame and break every earlier cell that is re-run afterwards.
X_save = ehr_data.copy()
y_save = X_save.pop('departure_in_interval')
X_save_ids = X_save.pop('physician_id')
save_ids = all_ids

# GridsearchCV -- XGBOOST

In [None]:
from sklearn.metrics import f1_score
import numpy as np

def f1_eval(y_pred, dtrain):
    """Custom evaluation metric: F1 error (1 - F1) of the positive class.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores; they are rounded to the nearest integer to obtain
        hard class labels.
    dtrain : object exposing ``get_label()``
        Container holding the true labels (presumably an xgboost DMatrix --
        confirm against the caller).

    Returns
    -------
    tuple
        ``('f1_err', err)`` where ``err`` is a single float; lower is better.
    """
    y_true = dtrain.get_label()
    # average='binary' yields one scalar for the positive class. With
    # average=None f1_score returns one value PER CLASS (an array), which is
    # not usable as a single metric value.
    err = 1 - f1_score(y_true, np.round(y_pred), average='binary')
    return 'f1_err', float(err)

In [None]:
classify_xgb = xgb.XGBClassifier(
    objective = 'binary:logistic',
    #missing = nan,
    seed = SEED,
    scale_pos_weight = 400, # approx 
    n_estimators=200
)

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {#'nthread':[2], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.3, 0.4, 0.5], #so called `eta` value
              'max_depth': [x for x in range(4, 10)],
              #'scale_pos_weight':[100,400],
              'reg_lambda':[1,10,20,40]
              #'min_child_weight': [11],
              #'silent': [1],
              #'subsample': [0.8],
              #'colsample_bytree': [0.7],
              #'n_estimators': [5], #number of trees, change it to 1000 for better results
              #'missing':[-999],
              #'eval_metric': ['auc', f1_eval],
              #'seed': [SEED],
              #'verbose':[False]
            }


In [None]:
# Exhaustive grid search over `parameters`, scored by ROC AUC with 5-fold
# cross-validation; refit=True retrains the best configuration on all the data
# passed to fit().
# NOTE(review): cv=5 splits individual rows (physician-months), not physicians,
# so months of the same physician can land in both the training and the
# validation fold of a split. That looks inconsistent with the physician-level
# train/test split used elsewhere in this notebook -- consider a group-aware
# splitter keyed on physician_id; TODO confirm.
classify_xgb_GS = GridSearchCV(classify_xgb, parameters, n_jobs=1, 
                                scoring='roc_auc',#make_scorer(f1_score, average='binary'),#'roc_auc',
                                cv=5,
                                verbose=2, refit=True)

In [None]:
classify_xgb_GS

In [None]:
classify_xgb_GS.fit(X_train, y_train)

In [None]:
print(classify_xgb_GS.best_params_)
print(classify_xgb_GS.best_score_)

In [None]:
classify_xgb_GS.get_params

In [None]:
y_test_pred = classify_xgb_GS.predict(X_test)
y_test_pred_prob = classify_xgb_GS.predict_proba(X_test)

In [None]:
classify_xgb_save = classify_xgb_GS.best_estimator_.fit(X_train,y_train)

In [None]:
# get predictions for all data sets

'''y_test_pred = classify_xgb.predict(X_test)
y_train_pred = classify_xgb.predict(X_train)
y_val_pred = classify_xgb.predict(X_val)

y_test_pred_prob = classify_xgb.predict_proba(X_test)
y_train_pred_prob = classify_xgb.predict_proba(X_train)
y_val_pred_prob = classify_xgb.predict_proba(X_val)'''


### Model Performance Evaluation script

In [None]:
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams["figure.figsize"] = (10,7)
def model_perf(classifier, X, y, crosstab=True, stats=True, roc_plot=True, optimal_thresh=True, custom_thresh=None):
    """Print and plot key performance figures of a fitted binary classifier.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X : feature matrix to score.
    y : true binary labels for ``X``.
    crosstab : bool
        Display a crosstab of ``y`` against the classifier's own predictions.
    stats : bool
        Print the classification report and ``get_stats`` for the classifier's
        own predictions.
    roc_plot : bool
        Draw the ROC curve together with the chance diagonal.
    optimal_thresh : bool
        Locate the Youden's J cutoff, mark it on the plot and report the
        metrics obtained at that cutoff.
    custom_thresh : iterable of float, optional
        Extra cutoffs to report; a sample is labelled positive when its score
        is strictly greater than the cutoff.

    Returns
    -------
    None
        Everything is printed, displayed or plotted. Relies on ``display``
        being available (as it is inside a notebook session).
    """
    y_pred = classifier.predict(X)
    # Column 1 of predict_proba is used as the positive-class score.
    scores = classifier.predict_proba(X)[:, 1]
    fpr, tpr, threshold = roc_curve(y, scores)
    roc_auc = auc(fpr, tpr)

    if crosstab:
        display(pd.crosstab(y, y_pred))

    if stats:
        print(classification_report(y, y_pred))
        print(get_stats(y, y_pred))

    if roc_plot:
        plt.title('Main Results ROC Curve and AUC')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')

    if optimal_thresh:
        opt_cutoff, ix = cutoff_youdens_j(fpr, tpr, threshold)
        print('Optimal Threshold cutoff')
        plt.plot(fpr[ix], tpr[ix], marker='o', color='black', label='Best Threshold (Youdens J Stat) =%f' % (opt_cutoff))
        plt.vlines(fpr[ix], 0, 1, linestyles='dashed', color='black')
        # roc_curve computes each (fpr, tpr) point with "score >= threshold".
        # Using the same inclusive comparison keeps the reported metrics
        # consistent with the point marked on the curve; a strict ">" drops
        # the sample(s) sitting exactly on the chosen cutoff.
        y_at_opt = scores >= opt_cutoff
        print(classification_report(y, y_at_opt))
        display(pd.crosstab(y, y_at_opt))
        print(get_stats(y, y_at_opt))

    if custom_thresh is not None:
        for ct in custom_thresh:
            print('Custom Thresh')
            print(ct)
            print(classification_report(y, scores > ct))
            display(pd.crosstab(y,scores > ct))
            print(get_stats(y, scores > ct))

    # Only finish the figure when something was actually drawn; otherwise
    # legend() would create (and show) an empty axes with a warning.
    if roc_plot or optimal_thresh:
        plt.legend(loc = 'lower right')
        plt.show()
def cutoff_youdens_j(fpr,tpr,thresholds):
    """Pick the ROC operating point that maximises Youden's J (TPR - FPR).

    Parameters
    ----------
    fpr, tpr : array-like
        False/true positive rates, one entry per candidate threshold.
    thresholds : sequence
        Candidate thresholds, positionally matched with ``fpr`` / ``tpr``.

    Returns
    -------
    tuple
        ``(best_thresh, ix)``: the selected threshold and its position. On
        ties the first maximum is used, and the threshold is always the one
        stored at ``ix`` so that the pair is self-consistent.
    """
    j_scores = np.asarray(tpr) - np.asarray(fpr)
    ix = np.argmax(j_scores)
    best_thresh = thresholds[ix]
    print('Best Threshold (Youdens J Stat)=%f' % (best_thresh))
    # Return the threshold read at ix rather than re-deriving it from a sort:
    # a separate sort can pick a different tied entry than argmax did.
    return best_thresh, ix

def gmean(fpr,tpr,thresholds):
    """Find the ROC point with the highest geometric mean of TPR and (1 - FPR).

    Prints the winning threshold together with its G-mean and returns the
    position of that point within the supplied arrays.
    """
    # G-mean per candidate threshold: sqrt(sensitivity * specificity)
    specificity = 1 - fpr
    scores = np.sqrt(tpr * specificity)
    # position of the best-scoring candidate
    best = np.argmax(scores)
    report = (thresholds[best], scores[best])
    print('Best GMeans Threshold=%f, G-Mean=%.3f' % report)
    return best

def perf_measure(y_actual, y_hat):
    """Count the confusion-matrix cells for binary (0/1 or bool) labels.

    Parameters
    ----------
    y_actual : array-like
        True labels.
    y_hat : array-like
        Predicted labels, same length as ``y_actual``.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)`` -- note this order differs from the
        ``TN, FP, FN, TP`` order unpacked in ``get_stats``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Compare positionally on plain arrays: indexing a pandas Series with
    # y[i] is a label lookup and breaks on any non-default index.
    actual = np.asarray(y_actual)
    predicted = np.asarray(y_hat)
    if actual.shape != predicted.shape:
        raise ValueError('y_actual and y_hat must have the same shape')

    agree = actual == predicted
    pred_pos = predicted == 1
    pred_neg = predicted == 0

    TP = int(np.count_nonzero(agree & pred_pos))
    FP = int(np.count_nonzero(~agree & pred_pos))
    TN = int(np.count_nonzero(agree & pred_neg))
    FN = int(np.count_nonzero(~agree & pred_neg))
    return(TP, FP, TN, FN)
def get_stats(true_value, classifier_output):
    """Compute PPV, NPV, specificity and sensitivity for binary labels.

    Parameters
    ----------
    true_value : array-like
        True labels, encoded as 0/1 or bool.
    classifier_output : array-like
        Predicted labels, encoded as 0/1 or bool.

    Returns
    -------
    dict
        Keys ``'ppv'``, ``'npv'``, ``'specificity'``, ``'sensitivity'``. A
        ratio whose denominator is zero (e.g. PPV when nothing is predicted
        positive) is reported as NaN.
    """
    # Note that in binary classification, recall of the positive class
    # is also known as "sensitivity"; recall of the negative class is "specificity".
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # the inputs; without it the matrix collapses to 1x1 and the four-way
    # unpacking below raises.
    TN, FP, FN, TP = confusion_matrix(true_value, classifier_output, labels=[0, 1]).ravel()
    # 0/0 is an expected outcome here, so silence the numpy warning and let
    # the result be NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        ppv = TP/(TP+FP)
        npv = TN/(TN+FN)
        specificity = TN/(TN+FP)
        sensitivity = TP/(TP+FN)
    return {'ppv': ppv, 'npv': npv, 'specificity': specificity, 'sensitivity': sensitivity}
    
    

In [None]:
#model_perf(classify_xgb,X_train,y_train,stats=False,roc_plot=False, optimal_thresh=False, crosstab=False)
#model_perf(classify_xgb,X_val,y_val,stats=False,roc_plot=False, optimal_thresh=False, crosstab=False)
model_perf(classify_xgb_GS,X_test,y_test,stats=True,roc_plot=True, optimal_thresh=True, crosstab=True)# custom_thresh=[0.8, 0.001]

In [None]:
# Lets store results
import pickle
# with open('./models/xgb_classifier_train_test_without_specialty.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump([classify_xgb_GS,X_train,y_train,X_test,y_test, y_test_pred, y_test_pred_prob], f)

train_list = [classify_xgb_save,X_save,y_save,X_save_ids,save_ids]
fpath = './models/xgb_classifier_train_test_without_specialty_5y.pkl'

with open(fpath,"wb") as open_file:
    pickle.dump(train_list,open_file)

In [None]:
X_train

# ROC curve

In [None]:
# Roc Curve
fpr, tpr, threshold = roc_curve(y_test, y_test_pred_prob[:,1])
roc_auc = auc(fpr, tpr)
# precision recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred_prob[:,1])
# calculate precision-recall AUC
pr_auc = auc(recall, precision)

In [None]:
# try bootstrap method:
#found here: https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/
# and: https://stackoverflow.com/questions/52373318/how-to-compare-roc-auc-scores-of-different-binary-classifiers-and-assess-statist
from copy import deepcopy
from sklearn.utils import resample
import numpy as np
from collections import defaultdict
params = deepcopy(classify_xgb_GS.best_params_)
# configure bootstrap
def bootstrap_auc(X_train, y_train, X_test, y_test, nsamples=1000):
    """Refit the classifier on bootstrap resamples of the training rows.

    For each of ``nsamples`` replicates the training rows are drawn with
    replacement, a fresh classifier configured with the module-level
    ``params`` is fitted, and its ROC / precision-recall curves on the fixed
    test set are recorded.

    Returns a dict keyed by replicate number; each value holds 'roc_auc',
    'fpr', 'tpr', 'precision', 'recall' and 'pr_auc'.
    """
    n_rows = X_train.shape[0]
    replicates = {}
    for draw in range(nsamples):
        # sample row positions with replacement, same size as the training set
        rows = np.random.randint(n_rows, size=n_rows)

        model = xgb.XGBClassifier(
            objective = 'binary:logistic',
            scale_pos_weight = 400, # approx
            n_estimators=200
            )
        # overlay the tuned hyperparameters on the defaults above
        model.set_params(**params)
        model.fit(X_train.iloc[rows], y_train.iloc[rows], eval_metric='logloss')

        scores = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test.ravel(), scores.ravel())
        area_roc = auc(fpr, tpr)
        precision, recall, _ = precision_recall_curve(y_test.ravel(), scores.ravel())
        area_pr = auc(recall, precision)

        replicates[draw] = {
            'roc_auc': area_roc,
            'fpr': fpr,
            'tpr': tpr,
            'precision': precision,
            'recall': recall,
            'pr_auc': area_pr,
        }

    return replicates

In [None]:
bootstrap_stats = bootstrap_auc(X_train, y_train, X_test, y_test, nsamples=200)

In [None]:
# Gather the per-replicate AUCs (replicates are keyed 0..n-1) and take the
# 2.5th / 97.5th percentiles of each as the interval bounds.
n_replicates = len(bootstrap_stats)
roc_aucs = [bootstrap_stats[k]['roc_auc'] for k in range(n_replicates)]
pr_aucs = [bootstrap_stats[k]['pr_auc'] for k in range(n_replicates)]
roc_CI = np.percentile(roc_aucs, (2.5, 97.5))
pr_CI = np.percentile(pr_aucs, (2.5, 97.5))

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(15,5))
ax1.set_title('Receiver Operating Characteristic')
ax1.plot(fpr, tpr, 'b', label = 'AUC = {:.2f} [CI={:.3f},{:.3f}]'.format(roc_auc, roc_CI[0], roc_CI[1]))
#plt.plot(bootstrap_stats[maxIDX]['fpr'], bootstrap_stats[maxIDX]['tpr'], 'r')
#plt.plot(bootstrap_stats[minIDX]['fpr'], bootstrap_stats[minIDX]['tpr'], 'r')
ax1.fill_between(fpr, (tpr-(roc_CI[1]- roc_CI[0])), (tpr+(roc_CI[1]- roc_CI[0])), alpha=0.2)
ax1.legend(loc = 'lower right')
ax1.plot([0, 1], [0, 1],'r--')
ax1.set_xlim([0, 1])
ax1.set_ylim([0, 1])
ax1.set_ylabel('True Positive Rate')
ax1.set_xlabel('False Positive Rate')
ax2.set_title('Precision Recall Curve')
ax2.plot(recall, precision, 'b', label = 'AUC = {:.2f} [CI={:.3f},{:.3f}]'.format(pr_auc, pr_CI[0], pr_CI[1]))
# fill between
ax2.fill_between(recall, (precision-(pr_CI[1]- pr_CI[0])), (precision+(pr_CI[1]- pr_CI[0])), alpha=0.2)
# calculate the no skill line as the proportion of the positive class
no_skill = len(y_test[y_test==1]) / len(y_test)# Essentially the fraction of positive classes/total number of examples
# plot the no skill precision-recall curve
ax2.plot([0, 1], [no_skill, no_skill], 'r--', label='No Skill = %0.2f' % no_skill)
ax2.legend(loc = 'best')
ax2.set_xlim([0, 1])
ax2.set_ylim([0, 1])
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
plt.show()

In [None]:
ehr_data_ = pd.read_pickle('./data/processed/turbo_7_29_22_deid_processed_3_ROUND_5y_TENURE_NO_STUDYDAY.pkl')

In [None]:
len(ehr_data_[ehr_data_['departure_in_interval'] == True]['physician_id'].unique())

In [None]:
44/len(ehr_data_['physician_id'].unique())

# END

In [None]:
def expanded_crosstab(y,y_pred,id,id_status):
    """Break the month-level confusion counts down by physician departure status.

    Parameters
    ----------
    y : Series of bool
        True labels per month (e.g. y_test).
    y_pred : Series of bool
        Predicted labels per month.
    id : Series
        Physician id of each month (e.g. X_test_ids).
    id_status : DataFrame
        One row per physician with columns 'id' and boolean 'depart'
        (e.g. test_ids).

    Returns
    -------
    DataFrame
        Rows TN/FP/FN/TP with the columns 'class', 'total', 'never dep', 'dep'.

    Example
    -------
    y         = [F, F, F, F, F, F, F, F, T, T, F, T]
    y_pred    = [F, F, F, T, F, F, F, F, T, T, F, T]
    id        = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
    id_status = {'id': [1, 2], 'depart': [False, True]}

        class  total  never dep  dep
    0      TN      8          5    3
    1      FP      1          1    0
    2      FN      0          0    0
    3      TP      3          0    3
    """
    both_classes = [False, True]

    def _cells(truth, guess):
        # Force the full 2x2 layout so a class that never occurs still
        # contributes a zero count, then flatten row-wise: TN, FP, FN, TP.
        table = pd.crosstab(truth, guess)
        table = table.reindex(columns=both_classes, index=both_classes, fill_value=0)
        return table.values.flatten()

    # month-level masks: does the month belong to a physician who stays / leaves?
    departs = id_status['depart']
    stayer_months = id.isin(id_status[~departs]['id'])
    leaver_months = id.isin(id_status[departs]['id'])

    breakdown = pd.DataFrame(
        {
            'class': ['TN','FP','FN','TP'],
            'total': _cells(y, y_pred),
            'never dep': _cells(y[stayer_months], y_pred[stayer_months]),
            'dep': _cells(y[leaver_months], y_pred[leaver_months])
        }
    )
    return breakdown




A false positive month just before the physician enters the "n-month to departure" interval probably has a different meaning than a false positive month far away from the departure date. There is a much higher rate of false positive months for physicians who go on to quit. This is likely because some features predictive of quitting are present even before our departure interval.


In [None]:
ex_crosstab = expanded_crosstab(y_test,y_test_pred,X_test_ids,test_ids)
display(ex_crosstab)

In [None]:
print(ex_crosstab.iloc[1,3]/(ex_crosstab.iloc[0,3]+ex_crosstab.iloc[1,3]))
print(ex_crosstab.iloc[1,2]/(ex_crosstab.iloc[0,2]+ex_crosstab.iloc[1,2]))
print(ex_crosstab.iloc[1,1]/(ex_crosstab.iloc[0,1]+ex_crosstab.iloc[1,1]))

### Physician level model Performance

In [None]:
def compile_physician_data(X,y,y_pred,y_prob,X_ids,ids):
    """Assemble a per-month frame of predictions for physician-level plots.

    Parameters
    ----------
    X : DataFrame
        Feature frame; only its 'study_day' column is used.
    y : true label per month.
    y_pred : predicted label per month.
    y_prob : 2-D array
        Class probabilities; column 1 is stored as 'prob'.
    X_ids : Series
        Physician id per month.
    ids : DataFrame
        One row per physician with columns 'id' and boolean 'depart'.

    Returns
    -------
    DataFrame
        Columns 'study_day', 'id', 'prob', 'pred', 'depart', 'phys_depart'
        (physician departs at some point), 'month_sync' (months relative to
        the physician's last study day, so 0 is their final month) and
        'prob_rm' (3-month rolling mean of 'prob' within each physician).
    """
    P = X[['study_day']].copy()
    P['id'] = X_ids
    P['prob'] = y_prob[:,1]
    P['pred'] = y_pred
    P['depart'] = y
    P['phys_depart'] = P.id.isin(ids[ids['depart']]['id'])
    P['month_sync'] = P.groupby('id')['study_day'].transform(lambda x: round((x-max(x))/30))
    # transform() hands the result back aligned with P's own rows.
    # groupby(...).rolling(...).mean() instead returns the values grouped by
    # id, so assigning them positionally scrambles the column whenever the
    # rows are not already ordered by physician.
    P['prob_rm'] = P.groupby('id')['prob'].transform(lambda s: s.rolling(3).mean())
    return(P)

In [None]:
P_test = compile_physician_data(X_test,y_test,y_test_pred,y_test_pred_prob,X_test_ids,test_ids)
P_train = compile_physician_data(X_train,y_train,y_train_pred,y_train_pred_prob,X_train_ids,train_ids)
P_val = compile_physician_data(X_val,y_val,y_val_pred,y_val_pred_prob,X_val_ids,val_ids)

In [None]:
fig = px.line(P_test, x = 'month_sync', y='prob', line_group='id',color='phys_depart')
fig.show()

# Save the model, and training data

In [None]:
import pickle
with open('./models/xgb_classifier_train_without_specialty.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([classify_xgb,X_train,y_train,X_train_ids,train_ids], f)

In [None]:
# Shap and stuff will go elsewhere

In [None]:
explainer = shap.TreeExplainer(classify_xgb, X_train, feature_perturbation='interventional', model_output="probability")
shap_values = explainer.shap_values(X_train)
shap_obj = explainer(X_train)
shap_interaction_values = shap.TreeExplainer(classify_xgb).shap_interaction_values(X_train)
shap_expected = explainer.expected_value

In [None]:
# Additivity check: the per-row sum of SHAP values plus the expected value
# should reproduce the model's predicted probability.
shap_train_sum = np.sum(shap_obj.values,axis=1)+explainer.expected_value

comp_prob_shap = pd.DataFrame({
    'xgb': y_train_pred_prob[:,1],
    'shap': shap_train_sum
})

# Compare the ABSOLUTE difference: without abs() a large negative deviation
# (shap far above xgb) would pass the check unnoticed.
assert (comp_prob_shap.xgb-comp_prob_shap.shap).abs().max() <0.00001, 'shap score mismatch'

In [None]:
# shap.initjs()
# query_id = 298
# id_locs = X_train_ids==query_id
# shap.plots.force(explainer.expected_value, shap_values.values[id_locs,:],X_train.loc[id_locs],link='logit')

In [None]:
shap.plots.beeswarm(deepcopy(shap_obj), max_display=40, plot_size=1)

In [None]:
# We use this function to get a true positive sample and a true negative sample for a specific condition
rows = []
for select_id in set(X_train_ids):
    # we use study day as we want to use the last study day to figure out if a person who matches the criteria
    idx = X_train[X_train_ids == select_id]['study_day'].sort_values(ascending=False).head(1).index
    rows.append(X_train.loc[idx])
select_df = pd.concat(rows)
# this is our condition
select_df = select_df[select_df['patient_volume'] < X_train['patient_volume'].median()]
select_df['y'] = y_train[select_df.index]

tp_idx = X_train.index.tolist().index(select_df[select_df['y'] == True].sample(n=1).index)
tn_idx = X_train.index.tolist().index(select_df[select_df['y'] == False].sample(n=1).index)
print(tp_idx)
print(tn_idx)

In [None]:
# This is for the figure
median_feat = ['{}: ({})'.format(col, str( np.round(med, 1))) for col, med in zip(X_train.columns.tolist(), X_train.median())]

In [None]:
# A true positive example
print(y_train.iloc[tp_idx])# confirm that its a true positive by looking at the y...
plot = shap.decision_plot(explainer.expected_value,shap_values[tp_idx,:],X_train.iloc[tp_idx,:], feature_names=median_feat, link='identity', return_objects=True, feature_display_range=slice(-1, -26, -1))

In [None]:
# A true negative example
print(y_train.iloc[tn_idx])
shap.decision_plot(explainer.expected_value,shap_values[tn_idx,:],X_train.iloc[tn_idx,:],feature_names=median_feat, feature_display_range=slice(-1, -26, -1))# in parenthesis is the actual value

In [None]:
# tn_idx is a row POSITION (it was obtained from X_train.index.tolist().index(...)),
# so it must be resolved with .iloc; .loc would treat it as an index label.
X_train.iloc[tn_idx]['age_group']

In [None]:
shap.dependence_plot("physician_demand", shap_values, X_train)
'''
This is a plot for risk_avg and its interaction with age_group. Here we see that with low risk_avg our model tends to push
towards a 0 and as risk_avg goes up it tends to not push the model towards a 1 even if the physician is older.
We can also see that at age_group high and risk_avg==2 (complexity) there is a lot of interaction and if the age group is high then they are more likely to depart (1)
This makes sense because we can see that in the bswarm plot if there is low complexity (ris_avg == 0) its more protective, 
while middle risk average is more indicative of a 1 (depart) and high risk_avg doesnt really push the model towards a 1 or 0
so a physician with low patient complexity and high age may be indicative of someone retiring?
'''

In [None]:
X_train.columns.tolist()

In [None]:
shap.dependence_plot("risk_avg", shap_values, X_train)
'''
This is a plot for risk_avg and its interaction with age_group. Here we see that with low risk_avg our model tends to push
towards a 0 and as risk_avg goes up it tends to not push the model towards a 1 even if the physician is older.
We can also see that at age_group high and risk_avg==2 (complexity) there is a lot of interaction and if the age group is high then they are more likely to depart (1)
This makes sense because we can see that in the bswarm plot if there is low complexity (ris_avg == 0) its more protective, 
while middle risk average is more indicative of a 1 (depart) and high risk_avg doesnt really push the model towards a 1 or 0
so a physician with low patient complexity and high age may be indicative of someone retiring?
'''

In [None]:
shap.dependence_plot("panel_cnt", shap_values, X_train)
'''
Here panel count is interesting because as we move up in panel count the probability starts off as high for
departing (target variable 1) and as it increases to about 1000 it switches to a protective feature 
where as the panel count increases the probability of departure decreases

This plot shows that the variable it interacts the most with is tenure. For tenure we see that with mid to large panel count
we have some high tenure folk showing up. This may imply that the folks who have high panel count and high tenure interaction tend to be 
protective (retained) we can also see that high tenure does not really impact the model too much
Positive SHAP value means positive impact on prediction, leading the model to predict 1
'''

In [None]:
shap.plots.scatter(shap_values[:,'age_group'], color = shap_values[:,'risk_avg']) # male is 0


In [None]:
shap_interaction_values = shap.TreeExplainer(classify_xgb).shap_interaction_values(X_train)

In [None]:
shap.summary_plot(shap_interaction_values, X_train)

In [None]:
shap.dependence_plot(('age_group','gender'), shap_interaction_values, X_train, display_features=X_train)

In [None]:
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import MaxNLocator

cdict1 = {
    'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
            (1.0, 0.9607843137254902, 0.9607843137254902)),

    'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
              (1.0, 0.15294117647058825, 0.15294117647058825)),

    'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
             (1.0, 0.3411764705882353, 0.3411764705882353)),

    'alpha': ((0.0, 1, 1),
              (0.5, 1, 1),
              (1.0, 1, 1))
}  # #1E88E5 -> #ff0052
red_blue_solid = LinearSegmentedColormap('RedBlue', cdict1)

In [None]:
shap_embedded = TSNE(n_components=2, perplexity=75).fit_transform(shap_values.values)

In [None]:
shap_embedded.shape

In [None]:


f = plt.figure(figsize=(10,10))
plt.scatter(shap_embedded[:,0],
           shap_embedded[:,1],
           c=explainer.expected_value+shap_values.values.sum(1).astype(np.float64),
           linewidth=0, alpha=1., cmap=red_blue_solid)
for id in train_ids[train_ids.depart]['id']:
    cur_ids = X_train_ids==id
    plt.plot(shap_embedded[cur_ids,0],
           shap_embedded[cur_ids,1],
           c='#555555',
           linewidth = 2,
           alpha = 0.5
    )
# only log odds when you set shap to have raw (its an xgboost thing)
cb = plt.colorbar(label="Probability departure_in_interval (6 months)", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
plt.gca().axis("off")
plt.show()

In [None]:
test_id_status[test_id_status.depart]['id']


In [None]:
colors = np.array(X_test.study_day/X_test.study_day.max())

In [None]:
lst = ['age_group',
 'gender',
 'calendar_month',
 'covid_wave',
 'patient_volume',
 'physician_demand',
 'physician_work_intensity',
 'panel_cnt',
 'risk_avg',
 'teamwork_on_inbox_value',
 'note_quality_manual_value',
 'note_quality_contribution_value',
 'number_of_rx_errors',
 'ehr_time_8',
 'wow_time_8',
 'note_time_8',
 'order_time_8',
 'ib_time_8',
 'review_time_8',
 'tenure',
 'study_day',
 'specialty_Cardiovascular Disease',
 'specialty_Endocrinology, Diabetes & Metabolism',
 'specialty_Family Medicine',
 'specialty_Gastroenterology',
 'specialty_Internal Medicine',
 'specialty_Obstetrics and Gynecology',
 'specialty_Other Medical Subspecialty',
 'specialty_Pediatrics',
 'specialty_Pulmonary Disease',
 'specialty_Rheumatology',
 'specialty_Surgery',
 'specialty_Surgical Subspecialty',
 'EWA_avg_patient_volume',
 'EWA_avg_physician_demand',
 'EWA_avg_physician_work_intensity',
 'EWA_avg_panel_cnt',
 'EWA_avg_risk_avg',
 'EWA_avg_teamwork_on_inbox_value',
 'EWA_avg_note_quality_manual_value',
 'EWA_avg_note_quality_contribution_value',
 'EWA_avg_number_of_rx_errors',
 'EWA_avg_ehr_time_8',
 'EWA_avg_wow_time_8',
 'EWA_avg_note_time_8',
 'EWA_avg_order_time_8',
 'EWA_avg_ib_time_8',
 'EWA_avg_review_time_8',
 'r_slope_patient_volume',
 'r_slope_physician_demand',
 'r_slope_physician_work_intensity',
 'r_slope_panel_cnt',
 'r_slope_risk_avg',
 'r_slope_teamwork_on_inbox_value',
 'r_slope_note_quality_manual_value',
 'r_slope_note_quality_contribution_value',
 'r_slope_number_of_rx_errors',
 'r_slope_ehr_time_8',
 'r_slope_wow_time_8',
 'r_slope_note_time_8',
 'r_slope_order_time_8',
 'r_slope_ib_time_8',
 'r_slope_review_time_8']

In [None]:
lst2 = ['prov_id', 'date_yyyymm_x', 'rv_us', 'mean', 'merge_id', 'date_yyyymm_y', 'reporting_period_start_date', 'reporting_period_end_date', 'specialty', 'prov_type', 'age_as_of_06_30_2021', 'gender', 'hire_date', 'term_date', 'sched_time_avail_in_hrs', 'sched_time_appts_in_hrs', 'sched_time_compl_in_hrs', 'actual_prov_visit_time_in_hrs', 'time_on_unscheduled_days_num', 'time_on_unscheduled_days_denom', 'time_on_unscheduled_days_value', 'time_in_notes_per_day_num', 'time_in_notes_per_day_denom', 'time_in_notes_per_day_value', 'time_outside_scheduled_hours_num', 'time_outside_scheduled_hours_denom', 'time_outside_scheduled_hours_value', 'time_in_orders_per_day_num', 'time_in_orders_per_day_denom', 'time_in_orders_per_day_value', 'time_in_clinical_review_per_day_num', 'time_in_clinical_review_per_day_denom', 'time_in_clinical_review_per_day_value', 'time_in_in_basket_per_day_num', 'time_in_in_basket_per_day_denom', 'time_in_in_basket_per_day_value', 'orders_with_team_contributions_num', 'orders_with_team_contributions_denom', 'orders_with_team_contributions_value', 'time_in_system_per_day_num', 'time_in_system_per_day_denom', 'time_in_system_per_day_value', 'patient_volume', 'number_of_clinical_hours_scheduled', 'physician_demand', 'physician_work_intensity', 'panel_cnt', 'risk_avg', 'teamwork_on_inbox_num', 'teamwork_on_inbox_denom', 'teamwork_on_inbox_value', 'note_quality_manual_num', 'note_quality_manual_denom', 'note_quality_manual_value', 'note_quality_contribution_num', 'note_quality_contribution_denom', 'note_quality_contribution_value', 'total_time_on_ehr_outside_of_scheduled_num', 'total_time_on_ehr_outside_of_scheduled_denom', 'total_time_on_ehr_outside_of_scheduled_value', 'inbox_volume_total_messages_num', 'inbox_volume_total_messages_denom', 'inbox_volume_total_messages_value', 'number_of_rx_errors', 'physician_id', 'ehr_time_8', 'wow_time_8', 'note_time_8', 'order_time_8', 'ib_time_8', 'review_time_8', 'time_to_departure', 'departure_in_interval', 
'tenure', 'study_day', 'calendar_month', 'age_group', 'covid_wave', 'EWA_avg_patient_volume', 'EWA_avg_physician_demand', 'EWA_avg_physician_work_intensity', 'EWA_avg_panel_cnt', 'EWA_avg_risk_avg', 'EWA_avg_teamwork_on_inbox_value', 'EWA_avg_note_quality_manual_value', 'EWA_avg_note_quality_contribution_value', 'EWA_avg_number_of_rx_errors', 'EWA_avg_ehr_time_8', 'EWA_avg_wow_time_8', 'EWA_avg_note_time_8', 'EWA_avg_order_time_8', 'EWA_avg_ib_time_8', 'EWA_avg_review_time_8']

In [None]:
lst.sort()

In [None]:
lst2.sort()

In [None]:
len(lst2)

In [None]:
set(lst) - (set(lst2))