In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import time
import pickle
from glob import glob

In [None]:
def collect(stg='', model='',site = '', year='', oversample='', fs='', rmcol='005', return_result=False):
    """Gather every matching ``shapdata_*`` pickle below the working directory into one table.

    The current working directory is walked recursively and every path whose
    name matches ``shapdata_*`` is considered. A path is kept when each of
    ``model``, ``stg``, ``site``, ``str(year)``, ``oversample``, ``fs`` and
    ``rmcol`` occurs as a substring of the *full path* (directories included)
    and neither ``'drop'`` nor ``'BACKUP'`` occurs in it. An empty filter
    string matches everything.

    NOTE(review): because matching is done on the full path, a working
    directory whose own path contains ``drop``/``BACKUP`` excludes every file.

    Parameters
    ----------
    stg, model, site, oversample, fs, rmcol : str
        Substrings that must appear in the path.
    year : str or int
        Converted with ``str()`` before the substring test.
    return_result : bool
        When True the concatenated frame is returned and nothing is written.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated table when ``return_result`` is True; otherwise None,
        after writing ``result.pkl`` and ``result.csv`` to the working directory.

    Raises
    ------
    ValueError
        If no file matches the filters (previously surfaced as pandas'
        generic "No objects to concatenate").
    """
    print('Running collect on all site', flush = True)

    # Get all shap result files anywhere below the current working directory.
    start_dir = os.getcwd()
    pattern = "shapdata_*"
    candidates = []
    for dirpath, _, _ in os.walk(start_dir):
        candidates.extend(glob(os.path.join(dirpath, pattern)))

    required = (model, stg, site, str(year), oversample, fs, rmcol)
    excluded = ('drop', 'BACKUP')
    # Sorted so the row order of the result does not depend on the order in
    # which the filesystem happens to enumerate directory entries.
    files = sorted(
        path for path in candidates
        if all(token in path for token in required)
        and not any(token in path for token in excluded)
    )

    if not files:
        raise ValueError(
            f"collect: no {pattern} files under {start_dir!r} match the filters "
            f"model={model!r}, stg={stg!r}, site={site!r}, year={year!r}, "
            f"oversample={oversample!r}, fs={fs!r}, rmcol={rmcol!r}"
        )

    # Read all into dataframes.
    # SECURITY: unpickling executes code embedded in the file; only run this
    # against directories whose contents are trusted.
    shap_tables = []
    for file in files:
        print(file)
        shap_tables.append(pd.read_pickle(file))

    # Concatenate into a single table with a fresh 0..n-1 index.
    result = pd.concat(shap_tables, ignore_index=True)

    if return_result:
        return result

    # Save to file
    result.to_pickle('result.pkl')
    result.to_csv('result.csv', index=False)

    print('Finished collect on all site', flush = True)
    

In [None]:
def result_strp(filename, cat=True):
    """Write two slimmed-down copies of a pickled result table.

    The frame stored in ``filename`` is loaded. When ``cat`` is true it is
    first restricted to a fixed list of fifteen columns; when false it is used
    unchanged. That frame is written to ``result_strp.pkl``. The columns
    ``rank``, ``stg``, ``rmcol`` and ``model`` are then removed and the
    remainder is written to ``result_strp3.pkl``. Both files go to the current
    working directory. Nothing is returned.
    """
    keep_columns = [
        'fval', 'mean_val', 'se_val', 'Feature', 'Importances', 'rank',
        'site', 'year', 'stg', 'fs', 'oversample', 'model', 'rmcol', 'auc',
        'isCategorical',
    ]

    table = pd.read_pickle(filename)
    if cat:
        table = table.loc[:, keep_columns]
    table.to_pickle('result_strp.pkl')

    # Second export: same rows without the bookkeeping columns.
    slim = table.drop(columns=['rank', 'stg', 'rmcol', 'model'])
    slim.to_pickle('result_strp3.pkl')

In [None]:
def result_split(model, stg='stg01', site = '', year='', oversample='raw', fs='rmscrbun', rmcol='005', return_result=False):
    """Collect the SHAP tables for one configuration and return or pickle them.

    Delegates the file discovery and concatenation to ``collect`` using the
    given filters. With ``return_result=True`` the combined frame is returned
    and nothing is written. Otherwise the frame is pickled in the working
    directory under a name built from the filters,
    ``resultsplit_<model>_<stg>_<site>_<year>_<fs>_<oversample>_<rmcol>.pkl``,
    with doubled underscores (left by empty filters) collapsed; the name is
    printed before writing.

    Parameters
    ----------
    model, stg, site, year, oversample, fs, rmcol : str
        Filter substrings forwarded to ``collect`` and embedded in the file name.
    return_result : bool
        Return the frame instead of writing it.

    Returns
    -------
    pandas.DataFrame or None
    """
    # rmcol is forwarded as given. It used to be hard-coded to '005' here, so
    # any other value selected the '005' files while the output file name still
    # carried the caller's rmcol, i.e. the data was mislabeled.
    t = collect(stg=stg, model=model, site=site, year=year, oversample=oversample,
                fs=fs, rmcol=rmcol, return_result=True)
    if return_result:
        return t

    filestring = 'resultsplit_'+model+"_"+stg+"_"+site+"_"+year+"_"+fs+"_"+oversample+"_"+rmcol+'.pkl'
    # Keep this clean-up exactly as is: DEID rebuilds the same name to find the file.
    filestring = filestring.replace("_.pkl", ".pkl")
    for _ in range(7):
        filestring = filestring.replace("__", "_")
    print(filestring)
    t.to_pickle(filestring)

In [None]:
def DEID(model, stg='stg01', site = '', year='', oversample='raw', fs='rmscrbun', rmcol='005', return_result=False):
    """Write a copy of the split result with high-valued ``AGE`` rows removed.

    ``result_split`` is run first so that the ``resultsplit_*.pkl`` file for
    this configuration exists. That file is read back and every row whose
    ``Feature`` is ``'AGE'`` and whose ``fval`` is not below 90 is discarded
    (presumably to de-identify ages of 90 and above -- confirm). The remaining
    rows are pickled as ``DEID_<resultsplit file name>`` in the working
    directory.

    ``return_result`` is accepted but not used; the function always writes the
    file and returns None.
    """
    # Produce the intermediate pickle for this configuration.
    result_split(model, stg, site, year, oversample, fs, rmcol, return_result=False)

    # Rebuild the exact file name result_split wrote to.
    name_parts = ['resultsplit_' + model, stg, site, year, fs, oversample, rmcol + '.pkl']
    filestring = "_".join(name_parts).replace("_.pkl", ".pkl")
    for _ in range(7):
        filestring = filestring.replace("__", "_")

    split_frame = pd.read_pickle(filestring)

    # Keep a row when it is not an AGE row, or when its value is under 90.
    not_age = split_frame['Feature'] != 'AGE'
    under_90 = split_frame['fval'] < 90
    deidentified = split_frame.loc[not_age | under_90]

    # Save to file
    deidentified.to_pickle('DEID_' + filestring)

In [None]:
def result_bt(stg='', fs='', oversample='', model_type='', numberbt=10, suffix='', return_result=False, site = '', year = '', rmcol=''):
    """Assemble every matching ``boosttrap_*`` pickle into one summary table.

    The current working directory is walked recursively for paths matching
    ``boosttrap_*``. A path is kept when each of ``model_type``, ``stg``,
    ``site``, ``str(year)``, ``oversample``, ``fs`` and ``rmcol`` occurs as a
    substring of the full path and neither ``'drop'`` nor ``'BACKUP'`` does.
    Each kept file is unpickled and becomes one row of the result, whose
    columns are ``site, year, stg, fs, oversample, model, numberbt, modelobj,
    roc, cm`` (so each pickle is assumed to hold a 10-item sequence in that
    order -- confirm against the code that writes them).

    Parameters
    ----------
    stg, fs, oversample, model_type, site, rmcol : str
        Substrings that must appear in the path; empty matches everything.
    year : str or int
        Converted with ``str()`` before the substring test.
    numberbt, suffix
        Accepted but not used by this function.
    return_result : bool
        Return the frame instead of writing ``result_boosttrap.pkl``.

    Returns
    -------
    pandas.DataFrame or None
        An empty frame with the ten columns is produced when nothing matches.
    """
    start_dir = os.getcwd()
    pattern = "boosttrap_*"
    candidates = []
    for dirpath, _, _ in os.walk(start_dir):
        candidates.extend(glob(os.path.join(dirpath, pattern)))

    required = (model_type, stg, site, str(year), oversample, fs, rmcol)
    excluded = ('drop', 'BACKUP')
    # Sorted so row order does not depend on filesystem enumeration order.
    files = sorted(
        path for path in candidates
        if all(token in path for token in required)
        and not any(token in path for token in excluded)
    )

    # Each file is loaded once. It used to be unpickled twice: a pickle.load
    # whose result was discarded, then pd.read_pickle.
    # SECURITY: unpickling executes code embedded in the file; only run this
    # against directories whose contents are trusted.
    records = []
    for file in files:
        print(file)
        records.append(pd.read_pickle(file))

    # One row per bootstrap file.
    result = pd.DataFrame(records, columns =['site', 'year', 'stg', 'fs', 'oversample', 'model', 'numberbt', 'modelobj', 'roc', 'cm'])

    if return_result:
        return result

    result.to_pickle('result_boosttrap.pkl')