In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import time
import pickle
import math

import importlib
import ipynb.fs.full.postprocessing3_collect
import scipy.stats as st

from scipy import stats, optimize

from statsmodels.regression.linear_model import WLS
from statsmodels.genmod.generalized_linear_model import GLM

from statsmodels.stats.meta_analysis import (
    effectsize_smd,
    effectsize_2proportions,
    combine_effects,
    _fit_tau_iterative,
    _fit_tau_mm,
    _fit_tau_iter_mm,
)

In [None]:
def plot_importance(df, ax=None, height=0.2,
                    xlim=None, ylim=None,
                    xlabel='score', ylabel='Feature', fmap='',
                    importance_type='auc', max_num_features=None,
                    grid=True, show_values=True, 
                    error=False, importance_type_down = '', importance_type_up = '', **kwargs):
    """Draw a horizontal bar chart of one score column, one bar per label.

    Parameters
    ----------
    df : DataFrame holding the label column ``ylabel`` and the score column
        ``importance_type`` (plus the two error columns when ``error`` is set).
    ax : axes to draw on; a new figure/axes pair is created when None.
    height : bar thickness passed to ``barh``.
    xlim, ylim : optional 2-tuples; defaults are derived from the data.
    xlabel : x-axis label.
    ylabel : name of the label column, also used as the y-axis label.
    fmap : unused, kept so existing calls keep working.
    importance_type : name of the score column, also used as the title.
    max_num_features : keep only the N largest scores (None keeps all).
    grid : forwarded to ``ax.grid``.
    show_values : write the rounded score next to each bar when True.
    error : draw asymmetric error bars from ``importance_type_down`` /
        ``importance_type_up`` (distances below / above the score).
    **kwargs : forwarded to ``ax.barh``.

    Returns
    -------
    The axes that were drawn on.

    Raises
    ------
    ValueError
        If there is nothing to plot, or xlim/ylim is not a 2-tuple.
    """
    title = importance_type

    if error:
        ranked = df.sort_values(by=importance_type, ascending=True)
        if max_num_features is not None:
            # Same "largest N" rule as the branch without error bars.
            ranked = ranked.iloc[-max_num_features:]
        labels = ranked[ylabel].to_numpy()
        values = ranked[importance_type].to_numpy()
        # Shape (2, N): first row is the lower distance, second the upper.
        xerr = ranked[[importance_type_down, importance_type_up]].to_numpy().T
        bar_kwargs = dict(xerr=xerr, capsize=10)
    else:
        # Index by the caller's label column (not a fixed 'Feature'); a label
        # that occurs several times keeps its last score.
        scores = df.set_index(ylabel)[importance_type].to_dict()
        pairs = sorted(scores.items(), key=lambda item: item[1])
        if max_num_features is not None:
            pairs = pairs[-max_num_features:]
        labels = [label for label, _ in pairs]
        values = [score for _, score in pairs]
        bar_kwargs = {}

    if len(values) == 0:
        raise ValueError('plot_importance received no rows to plot')

    if ax is None:
        _, ax = plt.subplots(1, 1)

    ylocs = np.arange(len(values))
    ax.barh(ylocs, values, align='center', height=height, **bar_kwargs, **kwargs)

    if show_values is True:
        for x, y in zip(values, ylocs):
            # Offset the text slightly to the right of the bar end.
            ax.text(x + x/25, y, round(x,2), va='center')

    ax.set_yticks(ylocs)
    ax.set_yticklabels(labels)

    if xlim is not None:
        if not isinstance(xlim, tuple) or len(xlim) != 2:
            raise ValueError('xlim must be a tuple of 2 elements')
    else:
        xlim = (0, max(values) * 1.2)
    ax.set_xlim(xlim)

    if ylim is not None:
        if not isinstance(ylim, tuple) or len(ylim) != 2:
            raise ValueError('ylim must be a tuple of 2 elements')
    else:
        ylim = (-1, len(values))
    ax.set_ylim(ylim)

    if title is not None:
        ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.grid(grid)
    return ax

In [None]:
def top_n_SHAP(result, site, year, importance_type = 'Importances', max_num_features = 10, numgraphcol=2):
    """Plot SHAP value against feature value for the top features of one site/year.

    Parameters
    ----------
    result : DataFrame with columns site, year, Feature, fval, mean_val,
        se_val and the column named by ``importance_type``.
    site, year : values used to filter ``result``.
    importance_type : column used both to rank features and in panel titles.
    max_num_features : number of top-ranked features to plot.
    numgraphcol : number of subplot columns.

    Returns
    -------
    (fig, topf_n) : the figure and the list of plotted feature names,
        highest score first.
    """
    shap_data = result >> mask(X.site==site) >> mask(X.year==year)
    # One score per feature; a feature listed several times keeps its last score.
    scores = (shap_data >> select('Feature', importance_type)).set_index('Feature').to_dict()[importance_type]
    topf_n = sorted(scores, key=scores.get, reverse=True)[:max_num_features]

    # Size the grid from the features actually found so no empty rows are reserved.
    fltrow = math.ceil(len(topf_n)/numgraphcol)
    # No plt.clf() here: with no open figure it would create an extra empty one.
    fig = plt.figure(figsize=(22.5,9))

    for plotindex, f in enumerate(topf_n, start=1):
        feature_rows = shap_data >> mask(X.Feature == f)
        plot_data = feature_rows >> select(X.fval, X.mean_val, X.se_val)
        plt.subplot(fltrow, numgraphcol, plotindex)
        plt.scatter(x=plot_data['fval'],y=plot_data['mean_val'])
        plt.errorbar(plot_data['fval'],plot_data['mean_val'], yerr=plot_data['se_val'], fmt="o")
        # Title shows the same metric the ranking used (was fixed to 'Importances').
        myimp = feature_rows[importance_type].iloc[0]
        plt.title(f+'(' + str(round(myimp,2)) + ')')
        plt.grid()
    plt.show()
    return fig, topf_n
    #    plt.savefig('data/'+site+'/model_'+site+'_'+str(year)+'_'+f+'.png')    

In [None]:
def one_feature_SHAP_allyear_allsite(shap_data, feature, sites=None, numgraphcol=5, ylim_range=None):
    """Plot one feature's SHAP curve for every site, overlaying all years per panel.

    Parameters
    ----------
    shap_data : DataFrame with columns Feature, site, year, fval, mean_val,
        se_val and Importances.
    feature : value of the Feature column to plot.
    sites : sites to draw, one panel each; defaults to every site that has
        rows for the feature.
    numgraphcol : number of subplot columns.
    ylim_range : optional y-limits applied to every panel.

    Returns
    -------
    The figure holding all panels.
    """
    feature_rows = shap_data[shap_data['Feature'] == feature]
    if sites is None:
        sites = feature_rows['site'].unique()
    years = feature_rows['year'].unique()
    years.sort()

    plt.clf()
    fltrow = math.ceil(len(sites)/numgraphcol)
    fig = plt.figure(figsize=(22.5,9))

    for panel, site in enumerate(sites, start=1):
        plt.subplot(fltrow, numgraphcol, panel)
        site_rows = feature_rows[feature_rows['site'] == site]
        for yr in years:
            points = site_rows.loc[site_rows['year'] == yr, ['fval', 'mean_val', 'se_val']]
            plt.scatter(x=points['fval'], y=points['mean_val'])
            plt.errorbar(points['fval'], points['mean_val'], yerr=points['se_val'], fmt="o")
        # The title carries the importance found on the site's first row.
        importance = site_rows['Importances'].iloc[0]
        plt.title(site+"_"+feature+'(' + str(round(importance,2)) + ')')
        plt.grid()
        if ylim_range is not None:
            plt.ylim(ylim_range)
    plt.show()
    return fig
#    plt.savefig('allsite'+f+'.png')

In [None]:
def one_feature_SHAP(shap_data, feature, site, numgraphcol=2):
    """Plot one feature's SHAP curve for one site, one panel per year.

    Parameters
    ----------
    shap_data : DataFrame with columns site, Feature, year, fval, mean_val,
        se_val and Importances.
    feature, site : values used to filter ``shap_data``.
    numgraphcol : number of subplot columns.

    Returns
    -------
    The figure holding all panels.
    """
    site_rows = shap_data[shap_data['site'] == site]
    shap_dataX = site_rows[site_rows['Feature'] == feature]
    years = shap_dataX['year'].unique()
    years.sort()

    plt.clf()
    fltrow = math.ceil(len(years)/numgraphcol)
    fig = plt.figure(figsize=(22.5,9))

    for panel, yr in enumerate(years, start=1):
        points = shap_dataX.loc[shap_dataX['year'] == yr, ['fval', 'mean_val', 'se_val']]
        plt.subplot(fltrow, numgraphcol, panel)
        plt.scatter(x=points['fval'], y=points['mean_val'])
        plt.errorbar(points['fval'], points['mean_val'], yerr=points['se_val'], fmt="o")
        # Every panel shows the importance found on the first filtered row.
        importance = shap_dataX['Importances'].iloc[0]
        plt.title(site+"_"+feature+'(' + str(round(importance,2)) + ')')
        plt.grid()
    plt.show()
    return fig
    #    plt.savefig('data/'+site+'/model_'+site+'_'+str(year)+'_'+f+'.png')

In [None]:
def zero_feature_SHAP(shap_data, feature, site, yr, vline=None, vlinelabel=None):
    """Plot one feature's SHAP curve for one site and year, with optional vertical markers.

    Parameters
    ----------
    shap_data : DataFrame with columns site, Feature, year, fval, mean_val, se_val.
    feature, site, yr : values used to filter ``shap_data``.
    vline : x positions of vertical lines to draw (None or empty draws none).
    vlinelabel : labels for the lines, matched by position; lines beyond the
        end of this list are drawn without a label.

    Returns
    -------
    The figure that was drawn.
    """
    # None defaults avoid sharing one mutable list between calls.
    vline = [] if vline is None else vline
    vlinelabel = [] if vlinelabel is None else vlinelabel

    fig = plt.figure()
    shap_dataX = shap_data >> mask(X.site == site) >> mask(X.Feature == feature)
    plot_data = shap_dataX >> mask(X.year == yr) >> select(X.fval, X.mean_val, X.se_val)
    plt.scatter(x=plot_data['fval'],y=plot_data['mean_val'])
    plt.errorbar(plot_data['fval'],plot_data['mean_val'], yerr=plot_data['se_val'], fmt="o")

    # The markers span the plotted SHAP range; compute it once.
    ymin = plot_data['mean_val'].min()
    ymax = plot_data['mean_val'].max()
    for i, xpos in enumerate(vline):
        label = vlinelabel[i] if i < len(vlinelabel) else None
        plt.vlines(xpos, ymin=ymin, ymax=ymax, label=label, colors='r')

    plt.title(site+"_"+feature+"_"+str(yr))
    plt.grid()
    plt.show()
    return fig
    #    plt.savefig('data/'+site+'/model_'+site+'_'+str(year)+'_'+f+'.png')

In [None]:
def model_comparison(model1, model2, stg='stg01', site = '', year='2016', oversample='raw', fs='rmscrbun', rmcol='005',
                     ax=None, height=0.4, xlim=None, ylim=None, title=None,
                     xlabel='auc', ylabel='site', grid=True, show_values=True, **kwargs):
    """Draw per-site AUC of two models as grouped horizontal bars.

    The plotting options used in the body were previously not defined
    anywhere, so the function could not run; they are now keyword parameters
    with defaults, leaving the original positional signature intact.

    Parameters
    ----------
    model1, model2 : model identifiers passed to ``result_split``.
    stg, site, year, oversample, fs, rmcol : forwarded to ``result_split``.
    ax : axes to draw on; a new figure/axes pair is created when None.
    height : thickness of each bar (two bars share one site slot).
    xlim, ylim : optional 2-tuples; defaults are derived from the data.
    title, xlabel, ylabel : axes texts; None skips the corresponding call.
    grid : forwarded to ``ax.grid``.
    show_values : write the rounded AUC next to each bar when True.
    **kwargs : forwarded to both ``ax.barh`` calls.

    Returns
    -------
    The axes that were drawn on.

    Raises
    ------
    ValueError
        If neither model has any site to plot, or xlim/ylim is not a 2-tuple.
    """
    import ipynb.fs.full.postprocessing3_collect
    import importlib
    # Reload so edits to the collector notebook are picked up on each call.
    importlib.reload(ipynb.fs.full.postprocessing3_collect)
    collect = ipynb.fs.full.postprocessing3_collect

    def _site_auc(model_name):
        # One AUC per site. NOTE(review): assumes result_split returns a frame
        # with 'site' and 'auc' columns, possibly several rows per site - confirm.
        frame = collect.result_split(model_name, stg=stg, site=site, year=year,
                                     oversample=oversample, fs=fs, rmcol=rmcol, return_result=True)
        return frame.loc[:, ['site', 'auc']].drop_duplicates().groupby('site')['auc'].mean()

    auc1 = _site_auc(model1)
    auc2 = _site_auc(model2)

    # Align both models on the union of sites; a site missing for one model is NaN.
    labels = auc1.index.union(auc2.index)
    if len(labels) == 0:
        raise ValueError('model_comparison found no sites to plot')
    values1 = auc1.reindex(labels).to_numpy(dtype=float)
    values2 = auc2.reindex(labels).to_numpy(dtype=float)

    if ax is None:
        _, ax = plt.subplots(1, 1)

    ylocs = np.arange(len(labels))
    # Shift the two series apart so the bars sit side by side instead of overlapping.
    ylocs1 = ylocs + height / 2
    ylocs2 = ylocs - height / 2
    ax.barh(ylocs1, values1, align='center', height=height, label=str(model1), **kwargs)
    ax.barh(ylocs2, values2, align='center', height=height, label=str(model2), **kwargs)

    if show_values is True:
        for series, positions in ((values1, ylocs1), (values2, ylocs2)):
            for x, y in zip(series, positions):
                if not np.isnan(x):
                    ax.text(x + x/100, y, round(x,2), va='center')

    ax.set_yticks(ylocs)
    ax.set_yticklabels(labels)

    if xlim is not None:
        if not isinstance(xlim, tuple) or len(xlim) != 2:
            raise ValueError('xlim must be a tuple of 2 elements')
    else:
        xlim = (0, np.nanmax(np.concatenate([values1, values2])) * 1.1)
    ax.set_xlim(xlim)

    if ylim is not None:
        if not isinstance(ylim, tuple) or len(ylim) != 2:
            raise ValueError('ylim must be a tuple of 2 elements')
    else:
        ylim = (-1, len(labels))
    ax.set_ylim(ylim)

    if title is not None:
        ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    # The bar labels only become visible through a legend.
    ax.legend()
    ax.grid(grid)
    return ax

In [None]:
def top_features(shap_data, importance_type = 'Importances', max_num_features = 10):
    """Tabulate the top-ranked features of every (site, year) pair.

    Parameters
    ----------
    shap_data : DataFrame with columns site, year, Feature and the column
        named by ``importance_type``.
    importance_type : column used for ranking (largest first).
    max_num_features : number of ranks to keep per (site, year).

    Returns
    -------
    DataFrame indexed by (site, year) with one column per rank (1 = best)
    holding the feature name. A pair with fewer distinct features than
    ``max_num_features`` has NaN in the ranks it cannot fill.
    """
    ranked = (
        shap_data
        .sort_values(['site', 'year', importance_type], ascending=False)
        .loc[:, ['site', 'year', 'Feature']]
        # A feature occurs on several rows; keep its first (highest-ranked) one.
        .drop_duplicates()
        .groupby(['site', 'year'])
        .head(max_num_features)
        .reset_index(drop=True)
    )
    # Number rows within each group instead of tiling 1..N over the whole table,
    # which raised whenever a group had fewer than max_num_features rows.
    ranked = ranked.assign(rank=ranked.groupby(['site', 'year']).cumcount() + 1)
    return ranked.pivot(index=['site', 'year'], columns='rank', values='Feature')

In [None]:
# Reload the collector notebook so edits made to it are picked up.
importlib.reload(ipynb.fs.full.postprocessing3_collect)

#Load statistics
stg = 'stg23'
fs = 'nofs'
#stgs = ["stg01", "stg123"]
#fss =  ['nofs', 'rmscrbun']
oversample='raw'
model = 'catd'    
rmcol = '005'
year = '3000'

#ipynb.fs.full.postprocessing3_collect.result_split(model, stg=stg, site = '', year='', oversample=oversample, fs=fs, rmcol=rmcol, return_result=False)
#ipynb.fs.full.postprocessing3_collect.DEID(model, stg=stg, site = '', year=year, oversample=oversample, fs=fs, rmcol=rmcol, return_result=False)
# The file-name suffix follows rmcol instead of a second hard-coded '005'.
# NOTE(review): assumes the trailing '_005' in the file name is the rmcol value - confirm.
result = pd.read_pickle(f'DEID_resultsplit_{model}_{stg}_{year}_{fs}_{oversample}_{rmcol}.pkl')

In [None]:
# Show the distinct values of the 'ckd_group' column in the loaded results.
result['ckd_group'].drop_duplicates()

In [None]:
def featuredecode(s):
    """Return the bare code of an encoded feature name.

    Keeps the text after the last ':' and cuts it at the first '(' that
    follows, e.g. 'LAB::33037-3(mmol/L)' -> '33037-3'.
    """
    after_prefix = s.rsplit(':', 1)[-1]
    code, _, _ = after_prefix.partition('(')
    return code
def featuredecodetable(result):
    """Build a lookup table from each distinct Feature value to its decoded code.

    Returns a DataFrame with the columns 'Feature' and 'featuredecode', one
    row per distinct value of ``result['Feature']``.
    """
    table = pd.DataFrame({'Feature': result['Feature'].unique()})
    table['featuredecode'] = table['Feature'].map(featuredecode)
    return table
decodetable = featuredecodetable(result)
# Drop a 'featuredecode' column left by an earlier run first, so executing the
# cell again does not produce featuredecode_x / featuredecode_y columns.
result = pd.merge(result.drop(columns=['featuredecode'], errors='ignore'), decodetable, on='Feature', how='left')

In [None]:
# Run the bootstrap collector for the current configuration, then read its pickle.
# NOTE(review): presumably result_bt writes 'result_boosttrap.pkl' - confirm in the collector notebook.
ipynb.fs.full.postprocessing3_collect.result_bt(stg, fs, oversample, model, numberbt=10, suffix='', return_result=False)
result_boosttrap = pd.read_pickle('result_boosttrap.pkl')

In [None]:
def CI95(data):
    """Return the 95% Student-t confidence interval of the mean of ``data``.

    Parameters
    ----------
    data : sequence of numbers.

    Returns
    -------
    (low, high) tuple; (nan, nan) when fewer than two observations are given,
    because the standard error is undefined in that case.
    """
    if len(data) < 2:
        return (np.nan, np.nan)
    # The confidence level is passed positionally: SciPy renamed this keyword
    # from `alpha` to `confidence`, so neither keyword works on every version.
    return st.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data)) #95% confidence interval

# Summarise the bootstrap 'roc' values per site: mean, variance, std, median and the CI95 tuple.
plot_data = result_boosttrap[['site', 'roc']].groupby("site").agg([np.mean, np.var, np.std, np.median, CI95]).reset_index()
# Flatten the two-level column labels by concatenation; the lines below rely on 'rocmean' and 'rocCI95'.
plot_data.columns = [''.join(x) for x in plot_data.columns]
# Split the (low, high) tuple returned by CI95 into two columns.
plot_data[['rocCI95down', 'rocCI95up']] = pd.DataFrame(plot_data['rocCI95'].tolist(), index=plot_data.index)
# Turn the bounds into distances from the mean, the form plot_importance passes on as xerr.
plot_data['rocCI95down'] = plot_data['rocmean'] - plot_data['rocCI95down']
plot_data['rocCI95up'] = plot_data['rocCI95up'] - plot_data['rocmean']
plot_data = plot_data.drop(['rocCI95'],axis=1)
# Rename to the column names plot_importance is called with below.
plot_data = (plot_data>>mutate(Feature=X.site)>>mutate(auc=X.rocmean)>>mutate(aucdown=X.rocCI95down)>>mutate(aucup=X.rocCI95up)>>select('Feature','auc','aucdown','aucup')).drop_duplicates().groupby('Feature').mean().reset_index()
ax = plot_importance(plot_data, importance_type='auc', max_num_features = 10, error=True, importance_type_down = 'aucdown', importance_type_up = 'aucup')
# Display the raw bootstrap table as the cell output.
result_boosttrap

In [None]:
# Show the top features for each site and year.
top_features(result, max_num_features=10)

In [None]:
ttN = top_features(result, max_num_features=10)
# One-hot encode every rank column; get_dummies names the new columns '<rank>_<feature>'.
x = pd.get_dummies(ttN)
# Split only at the first '_' so feature names that contain '_' stay whole.
feature_of_column = x.columns.str.split('_', n=1).str[1]
# Sum the indicator columns of the same feature across ranks. The transposed
# groupby replaces groupby(..., axis=1), which pandas has deprecated.
y = x.T.groupby(feature_of_column).sum().T
# Entry (a, b) counts the features shared by the top lists of rows a and b.
y.dot(y.T)

In [None]:
#Get top of top features
maxnum_features=10
maxmax_feature=10

topnfeature = top_features(result, max_num_features=maxnum_features, importance_type='Importances')
numsiteyr = topnfeature.shape[0]
# Count in how many top lists each feature appears; value_counts puts the most frequent first.
toptopfeatureN = topnfeature.melt()['value'].value_counts()
# head() selects by position and tolerates fewer than maxmax_feature entries;
# indexing a label-indexed Series with range(...) relied on a deprecated fallback.
toptopfeatureN = toptopfeatureN.head(maxmax_feature)
toptopfeature = pd.DataFrame(toptopfeatureN.keys())
toptopfeature.columns = ['featuredecode']
ttN = pd.DataFrame(toptopfeatureN).reset_index()

# NOTE(review): the merge and the sqrt below assume 'spdf1.pkl' holds an 'index'
# column and a column labelled 0, and that ttN has an 'index' column - confirm.
var_list = pd.read_pickle('spdf1.pkl')
opdf = pd.merge(ttN, var_list, left_on=['index'], right_on=['index'], how='left')
opdf[0] = np.sqrt(opdf[0])
opdf
#plt.scatter(x=opdf['value'],y=opdf[0])
#plt.show()

In [None]:
# Display the table of most frequently top-ranked features.
toptopfeature

In [None]:
#sites = ['MCRI', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW']
sites = ['MCRI', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'UTSW']
# Site and year of the model that is scored on the other sites' test sets.
train_site = 'KUMC'
train_year = 3000

stg = 'stg01'
fs = 'rmscrbun'
oversample='raw'
model = 'catd'    
rmcol = '005'
# Unpickling runs arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle that was previously left open.
with open(f'data/{train_site}/model_{model}_{train_site}_{train_year}_{stg}_{fs}_{oversample}.pkl', 'rb') as model_fh:
    model_file = pickle.load(model_fh)

X_df = pd.read_pickle(f'data/{train_site}/X_train_{train_site}_{train_year}_{stg}_{fs}_{oversample}.pkl')
# Keep only the training column layout (zero rows) as a template.
X_df = X_df[0:0]

# NOTE(review): `year` is not set in this cell; it comes from an earlier cell - confirm it matches train_year.
for site in sites:
    X_test =  pd.read_pickle(f'data/{site}/X_test_{site}_{year}_{stg}_{fs}_{oversample}.pkl')
    common_col = [x for x in X_df.columns if x in X_test.columns]
    # Put the test data into the training layout; only shared columns are copied.
    X_test1 = X_df.copy()
    X_test1[common_col] = X_test[common_col]
    # Columns that end up with object dtype are set to False.
    # NOTE(review): presumably these are training-only flag columns - confirm.
    X_testbool = X_test1.select_dtypes('O').columns
    X_test1[X_testbool] = False
    y_test =  pd.read_pickle(f'data/{site}/y_test_{site}_{year}_{stg}_{fs}_{oversample}.pkl')
    pred = model_file.predict_proba(X_test1)
    roc = roc_auc_score(y_test, pred[:,1])    
    print(site, roc)

In [None]:
#sites = ['MCRI', 'MCW', 'UIOWA', 'UMHC', 'UNMC', 'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW']
sites = ['MCRI', 'UIOWA', 'UNMC', 'UofU', 'UTHSCSA', 'KUMC', 'UTSW']
# Site and year of the model that is scored on the other sites' test sets.
train_site = 'UMHC'
train_year = 3000

stg = 'stg01'
fs = 'rmscrbun'
oversample='raw'
model = 'catd'    
rmcol = '005'
# Unpickling runs arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle that was previously left open.
with open(f'data/{train_site}/model_{model}_{train_site}_{train_year}_{stg}_{fs}_{oversample}.pkl', 'rb') as model_fh:
    model_file = pickle.load(model_fh)

X_df = pd.read_pickle(f'data/{train_site}/X_train_{train_site}_{train_year}_{stg}_{fs}_{oversample}.pkl')
# Keep only the training column layout (zero rows) as a template.
X_df = X_df[0:0]

# NOTE(review): `year` is not set in this cell; it comes from an earlier cell - confirm it matches train_year.
for site in sites:
    X_test =  pd.read_pickle(f'data/{site}/X_test_{site}_{year}_{stg}_{fs}_{oversample}.pkl')
    common_col = [x for x in X_df.columns if x in X_test.columns]
    # Put the test data into the training layout; only shared columns are copied.
    X_test1 = X_df.copy()
    X_test1[common_col] = X_test[common_col]
    # Columns that end up with object dtype are set to False.
    # NOTE(review): presumably these are training-only flag columns - confirm.
    X_testbool = X_test1.select_dtypes('O').columns
    X_test1[X_testbool] = False
    y_test =  pd.read_pickle(f'data/{site}/y_test_{site}_{year}_{stg}_{fs}_{oversample}.pkl')
    pred = model_file.predict_proba(X_test1)
    roc = roc_auc_score(y_test, pred[:,1])    
    print(site, roc)

In [None]:
# Average AUC per site: the site name takes the place of the 'Feature' label
# so the generic bar-chart helper can be reused.
plotdata = (
    result
    .astype({'year': 'str'})
    .assign(Feature=lambda frame: frame['site'])
    .loc[:, ['Feature', 'auc']]
    .drop_duplicates()
    .groupby('Feature')
    .mean()
    .reset_index()
)
ax = plot_importance(plotdata, importance_type='auc', max_num_features = 10)

In [None]:
site = 'KUMC'
year = 3000
# Keep the rows of the chosen site, then of the chosen year.
shap_data = result[result['site'] == site]
shap_data = shap_data[shap_data['year'] == year]
# Column used to rank the features in the bar chart.
importance_type = 'Importances'
#importance_type = 'minmax_SHAP'
#importance_type = 'varSHAP'
ax = plot_importance(shap_data, importance_type=importance_type, max_num_features = 10)
#ax.figure.savefig('data/'+site+'/model_'+site+'_'+str(year)+"_feature_"+importance_type+".png")

In [None]:
# SHAP curves of the ten top-ranked features for site 'UMHC', year 3000, in five columns.
# `importance_type` is the variable set in an earlier cell.
myfig, topf_n = top_n_SHAP(result, 'UMHC', 3000, importance_type=importance_type, max_num_features = 10, numgraphcol=5)

In [None]:
# SHAP curve of 'AGE' for every site, with a common y-range so the panels are comparable.
myfig = one_feature_SHAP_allyear_allsite(result, 'AGE', numgraphcol=5, ylim_range=[-0.8, 0.3])

In [None]:
# SHAP curve of the lab feature 'LAB::33037-3(mmol/L)' for every site, with a common y-range.
myfig = one_feature_SHAP_allyear_allsite(result, 'LAB::33037-3(mmol/L)', numgraphcol=5, ylim_range=[-1.5, 1.5])

In [None]:
# SHAP curve of 'PX:CH:84300' for every site; y-limits are left to matplotlib.
myfig = one_feature_SHAP_allyear_allsite(result, 'PX:CH:84300', numgraphcol=5)

In [None]:
# Rows of the 'AGE' feature only.
result2 = result.loc[result['Feature'] == 'AGE']
# Mean importance of AGE per site, smallest first.
(
    result2.loc[:, ['site', 'Importances']]
    .drop_duplicates()
    .groupby('site')
    .mean()
    .sort_values('Importances')
)

In [None]:
# Display the 'AGE' rows selected in the previous cell.
result2

In [None]:
# SHAP curve of the feature '2823-3' for every site, using the default layout.
one_feature_SHAP_allyear_allsite(result, '2823-3')

In [None]:
# SHAP curve of 'SYSTOLIC' at site 'MCRI', one panel per year.
one_feature_SHAP(result, 'SYSTOLIC', 'MCRI')

In [None]:
from catboost import CatBoost, Pool
stg = 'stg01'
fs = 'rmscrbun'
oversample='raw'
model_type = 'catd'    
rmcol = '005'
site = 'MCRI'
suffix=''
# A preceding assignment year = '2011' was overwritten at once and has been removed.
year=3000
# Shared tail of the model / X_train / y_train file names.
artifact_tag = site+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample
# Unpickling runs arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle that was previously left open.
with open('data/'+site+'/model_'+model_type+'_'+artifact_tag+'.pkl', 'rb') as model_fh:
    model = pickle.load(model_fh)
X_train = pd.read_pickle('data/'+site+'/X_train_'+artifact_tag+suffix+'.pkl')
y_train = pd.read_pickle('data/'+site+'/y_train_'+artifact_tag+suffix+'.pkl')

In [None]:
# Flag the bool-typed columns and collect their positions; the positions are
# handed to the Pool as categorical features in the next cell.
is_cat = X_train.dtypes.eq(bool)
cat_features_index = np.flatnonzero(is_cat.to_numpy())

In [None]:
# Wrap the training data in a catboost Pool, marking the bool columns as categorical.
pool = Pool(X_train, y_train, cat_features=cat_features_index, feature_names=list(X_train.columns))

In [None]:
# Export the model as JSON (despite the .txt extension) so the tree splits can be read back below.
model.save_model('testtree.txt', format="json", export_parameters=None)

In [None]:
# Feature whose tree splits are inspected in the following cells.
tfea = 'SYSTOLIC'

# Position of that feature in the model's feature list (first match).
ageidx = np.flatnonzero(np.asarray(model.feature_names_) == tfea)[0]

In [None]:
import json
# Read the exported model back as a plain dict; the context manager closes the
# file even when parsing fails.
with open('testtree.txt') as f:
    tree = json.load(f)

In [None]:
# Find the entry of the float-feature list whose flat index is the target
# feature; its position is what the tree splits refer to. The last match wins.
for i, float_feature in enumerate(tree['features_info']['float_features']):
    if float_feature['flat_feature_index'] == ageidx:
        print(i)
        ageidx2 = i

In [None]:
#model.plot_tree(tree_idx=38,pool=pool)

In [None]:
#tree['oblivious_trees'][5]
# Share of positive labels below, between and above the two SYSTOLIC cut
# points, plus the share over all rows with a recorded value.
systolic = X_train['SYSTOLIC']
sp0 = systolic < 93.33
sp1 = systolic.between(93.33, 109.25)
sp2 = systolic > 109.25
spt = ~np.isnan(systolic)
p0, p1, p2, pt = (y_train[rows].sum()/rows.sum() for rows in (sp0, sp1, sp2, spt))
print(p0, p1, p2, pt)

In [None]:
#tree['oblivious_trees'][5]
# Same comparison as the previous cell with the upper cut point moved to 108.25.
systolic = X_train['SYSTOLIC']
sp0 = systolic < 93.33
sp1 = systolic.between(93.33, 108.25)
sp2 = systolic > 108.25
spt = ~np.isnan(systolic)
p0, p1, p2, pt = (y_train[rows].sum()/rows.sum() for rows in (sp0, sp1, sp2, spt))
print(p0, p1, p2, pt)

In [None]:
#tree['oblivious_trees'][5]
# Positive-label share in the narrow 108.25-110.25 band against the other groups.
# NOTE(review): sp1 and sp2 overlap here (both include values above 108.25) - confirm this is intended.
systolic = X_train['SYSTOLIC']
sp0 = systolic < 93.33
sp1 = systolic.between(108.25, 110.25)
sp2 = systolic > 108.25
spt = ~np.isnan(systolic)
p0, p1, p2, pt = (y_train[rows].sum()/rows.sum() for rows in (sp0, sp1, sp2, spt))
print(p0, p1, p2, pt)


In [None]:
# SHAP curve for site 'MCRI' with the first seven tree split borders overlaid, saved as SVG.
# NOTE(review): vline and vlinelabel are only assigned in a later cell, so this
# cell fails on a fresh top-to-bottom run - run that cell first or move it up.
#myfig = zero_feature_SHAP(result, tfea, 'MCRI', int(year), vline=vline[:7], vlinelabel=vlinelabel[:7])
myfig = zero_feature_SHAP(result, tfea, 'MCRI', 3000, vline=vline[:7], vlinelabel=vlinelabel[:7])
print(list(zip(vline,vlinelabel))[:7])
myfig.savefig("SHAP_MCRI_2011_overelay2013.svg")

In [None]:
# SHAP curve for site 'UIOWA' with the same seven split borders overlaid.
# NOTE(review): vline and vlinelabel are only assigned in a later cell - see the ordering issue above.
zero_feature_SHAP(result, tfea, 'UIOWA', int(year), vline=vline[:7], vlinelabel=vlinelabel[:7])
print(list(zip(vline,vlinelabel))[:7])

In [None]:
# Collect the border of every split on the target float feature, together
# with the index of the tree the split belongs to.
vline = []
vlinelabel = []
rank=0
for i, oblivious_tree in enumerate(tree['oblivious_trees']):
    for j, split in enumerate(oblivious_tree['splits']):
        # Only splits on float features carry this key.
        if 'float_feature_index' not in split:
            continue
        if split['float_feature_index'] != ageidx2:
            continue
        print(i, j, split)
        vline.append(split['border'])
        vlinelabel.append(i)

In [None]:
# Display the results table.
result

In [None]:
def featuredecode(s):
    """Return the bare code of an encoded feature name.

    Keeps the text after the last ':' and cuts it at the first '(' that
    follows, e.g. 'LAB::33037-3(mmol/L)' -> '33037-3'.
    """
    after_prefix = s.rsplit(':', 1)[-1]
    code, _, _ = after_prefix.partition('(')
    return code
def featuredecodetable(result):
    """Build a lookup table from each distinct Feature value to its decoded code.

    Returns a DataFrame with the columns 'Feature' and 'featuredecode', one
    row per distinct value of ``result['Feature']``.
    """
    table = pd.DataFrame({'Feature': result['Feature'].unique()})
    table['featuredecode'] = table['Feature'].map(featuredecode)
    return table
decodetable = featuredecodetable(result)
# `result` may already carry 'featuredecode' from an earlier merge; drop it
# first so this merge does not create featuredecode_x / featuredecode_y columns.
result = pd.merge(result.drop(columns=['featuredecode'], errors='ignore'), decodetable, on='Feature', how='left')

In [None]:
# Rows of categorical features at fval == 1, showing the mean value and its CI columns; incomplete rows are dropped.
result[np.logical_and(result['isCategorical'], result['fval']==1)][['site','Feature','mean_val','valCI95down', 'valCI95up']].dropna()

In [None]:
def generate_forest_plot(result):
    """Fit one random-effects meta-analysis per categorical feature across sites.

    Parameters
    ----------
    result : DataFrame with columns isCategorical, site, Feature, fval and
        valmean_0, valstd_0, valsize_0, valmean_1, valstd_1, valsize_1.

    Returns
    -------
    dict mapping the feature name to the object returned by
    ``combine_effects``. Features present at a single site are skipped, and
    so are features whose computation raises or warns; those are printed.
    """
    import warnings

    stat_cols = ['valmean_0', 'valstd_0', 'valsize_0', 'valmean_1', 'valstd_1', 'valsize_1']
    dframe1 = result[result['isCategorical']][['site', 'Feature', 'fval'] + stat_cols]
    # Rows with a zero std, a zero group size or fval == 0 are excluded, and
    # rows with missing values are dropped.
    nonzero = (dframe1[['valstd_0', 'valstd_1', 'valsize_0', 'valsize_1', 'fval']] != 0).all(axis=1)
    dframe1 = dframe1[nonzero].dropna()

    res3_list = dict()
    # Promote warnings to errors only inside this block; previously the filter
    # was installed before catch_warnings() and stayed active for the whole session.
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        # Group by the plain column name so the keys are feature strings
        # (a one-element list yields 1-tuples on recent pandas).
        for name, group in dframe1.groupby('Feature'):
            if group.shape[0] == 1:
                continue
            try:
                mean_0, sd_0, nobs_0, mean_1, sd_1, nobs_1 = np.asarray(group[stat_cols]).T
                rownames = group['site'].tolist()
                # The "_0" statistics go in as the first sample, "_1" as the second.
                # NOTE(review): the effect sign therefore depends on this order - confirm it is intended.
                eff, var_eff = effectsize_smd(mean_0, sd_0, nobs_0, mean_1, sd_1, nobs_1)
                res3 = combine_effects(eff, var_eff, method_re="chi2", use_t=True, row_names=rownames)
                # Return value unused. NOTE(review): presumably called so the per-site
                # intervals are computed with these sample sizes - confirm against statsmodels.
                res3.conf_int_samples(nobs=np.array(nobs_0 + nobs_1))
                res3_list[name] = res3
            except Exception as exc:
                # Report the skipped feature and the reason instead of only its name.
                print(name, repr(exc))
    return res3_list

In [None]:
# Fit the per-feature meta-analysis models for all categorical features.
res3_list = generate_forest_plot(result)

In [None]:
def forestplot_all_feature(res3_list, top_eff=10, reverse=False):    
    """Draw and save forest plots for the features with the most extreme pooled effect.

    Parameters
    ----------
    res3_list : dict mapping a feature key to a meta-analysis result that
        offers ``summary_frame()`` and ``plot_forest()``.
    top_eff : number of features to plot.
    reverse : False plots the smallest random-effect estimates, True the largest.

    Each plot is saved as 'allsite<key>.svg' in the working directory; the
    characters ':', '/' and '\\' in the key are replaced by '_'.
    """
    # Random-effects estimate of every feature, used only for ranking.
    pooled_eff = {key: res.summary_frame().loc['random effect']['eff'] for key, res in res3_list.items()}
    max_key = sorted(pooled_eff, key=pooled_eff.get, reverse=reverse)[:top_eff]

    for key in max_key:
        print(key)
        res3_list[key].plot_forest()
        # Keys can be tuples when they come from a list-based groupby.
        label = '_'.join(str(part) for part in key) if isinstance(key, tuple) else str(key)
        # Path separators in a feature name would point savefig at a missing directory.
        for unsafe in (':', '/', '\\'):
            label = label.replace(unsafe, '_')
        plt.savefig('allsite'+label+'.svg',bbox_inches='tight')        
        plt.show()


In [None]:
# Forest plots for the ten features with the smallest pooled effect.
forestplot_all_feature(res3_list, top_eff=10, reverse=False)

In [None]:
# Forest plots for the ten features with the largest pooled effect.
forestplot_all_feature(res3_list, top_eff=10, reverse=True)