In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import time
import pickle
from glob import glob

import plot_utils
import utils_function

In [None]:
def collect(stg='', model='', site='', year='', oversample='', fs='', rmcol='005', return_result=False,
            start_dir="/home/hoyinchan/blue/Data/data2021/data2021/"):
    """Gather every matching ``shapdata_*`` pickle below ``start_dir`` into one DataFrame.

    Parameters
    ----------
    stg, model, site, year, oversample, fs, rmcol : str (``year`` may be any value; it is passed through ``str``)
        Substring filters. A file is kept only when *every* one of these tokens
        occurs somewhere in its full path. An empty string matches everything.
    return_result : bool
        When True the concatenated frame is returned and nothing is written.
        When False the frame is written to ``result.pkl`` and ``result.csv`` in
        the current working directory and the function returns None.
    start_dir : str
        Root directory that is walked recursively. Defaults to the location
        that used to be hard-coded, so existing callers are unaffected.

    Raises
    ------
    ValueError
        If no file survives the filters (previously this surfaced as the less
        helpful "No objects to concatenate" from ``pd.concat``).

    Notes
    -----
    NOTE(review): the filters are matched against the whole path, directories
    included. With the default ``start_dir`` a token such as ``'2021'`` is
    therefore satisfied by the directory name alone - confirm that is intended.
    """
    print('Running collect on all site', flush = True)

    # Get all shap result files (any depth below start_dir).
    candidates = []
    for folder, _, _ in os.walk(start_dir):
        candidates.extend(glob(os.path.join(folder, "shapdata_*")))

    required = (model, stg, site, str(year), oversample, fs, rmcol)
    excluded = ('drop', 'BACKUP')
    files = [
        path for path in candidates
        if all(token in path for token in required)
        and not any(token in path for token in excluded)
    ]

    if not files:
        raise ValueError(
            'collect: no shapdata_* file under ' + repr(start_dir)
            + ' matches filters ' + repr(required)
        )

    # Read all into dataframes.
    # Pickle files are executed on load; only point this at trusted project output.
    shap_tables = []
    for file in files:
        print(file)
        shap_tables.append(pd.read_pickle(file))

    # Concatenate into a single table with a fresh 0..n-1 index.
    result = pd.concat(shap_tables, ignore_index=True)

    if return_result:
        return result

    # Save to file.
    result.to_pickle('result.pkl')
    result.to_csv('result.csv', index=False)

    print('Finished collect on all site', flush = True)
    

In [None]:
def result_strp(filename, cat=True):
    """Write two trimmed copies of a pickled result table to the working directory.

    The table is loaded from ``filename``. When ``cat`` is true it is first
    restricted to a fixed list of 15 columns; otherwise it is used unchanged.
    The (possibly restricted) table is saved as ``result_strp.pkl``; a further
    copy without the ``rank``, ``stg``, ``rmcol`` and ``model`` columns is saved
    as ``result_strp3.pkl``. Nothing is returned.
    """
    keep_columns = [
        'fval', 'mean_val', 'se_val', 'Feature', 'Importances', 'rank',
        'site', 'year', 'stg', 'fs', 'oversample', 'model', 'rmcol', 'auc',
        'isCategorical',
    ]
    bookkeeping_columns = ['rank', 'stg', 'rmcol', 'model']

    table = pd.read_pickle(filename)
    if cat:
        table = table.loc[:, keep_columns]

    table.to_pickle('result_strp.pkl')
    table.drop(columns=bookkeeping_columns).to_pickle('result_strp3.pkl')

In [None]:
def result_split(model, stg='stg01', site = '', year='', oversample='raw', fs='rmscrbun', rmcol='005', return_result=False):
    """Collect the SHAP tables for one configuration and optionally persist them.

    All filter arguments are forwarded to ``collect`` (called with
    ``return_result=True``).

    Parameters
    ----------
    model, stg, site, year, oversample, fs, rmcol
        Filter tokens; also used to build the output file name. ``year`` may
        be a non-string (it is converted with ``str`` for the file name).
    return_result : bool
        True  -> return the collected table, write nothing.
        False -> write the table to
        ``/home/hoyinchan/blue/Data/data2021/data2021/resultsplit_<tokens>.pkl``
        and return None.
    """
    # Forward the caller's rmcol. It used to be hard-coded to '005' here, so a
    # non-default rmcol changed the file name but not the data that was collected.
    t = collect(stg=stg, model=model, site=site, year=year, oversample=oversample,
                fs=fs, rmcol=rmcol, return_result=True)
    if return_result:
        return t

    # Build "resultsplit_<model>_<stg>_<site>_<year>_<fs>_<oversample>_<rmcol>.pkl".
    filestring = "_".join(['resultsplit', model, stg, site, str(year), fs, oversample, rmcol]) + '.pkl'
    # Empty tokens leave stray underscores: trim one before the extension, then
    # collapse doubled underscores (seven halving passes, exactly as before, so
    # existing file names stay reproducible).
    filestring = filestring.replace("_.pkl", ".pkl")
    for _ in range(7):
        filestring = filestring.replace("__", "_")
    print(filestring)
    t.to_pickle("/home/hoyinchan/blue/Data/data2021/data2021/"+filestring)

In [None]:
def DEID(model, stg='stg01', site = '', year='', oversample='raw', fs='rmscrbun', rmcol='005', return_result=False):
    """Write a copy of a ``resultsplit_*`` table without AGE rows whose ``fval`` is not below 90.

    ``result_split`` is first asked to write the table for this configuration;
    that file is then read back, filtered and saved next to it with a ``DEID_``
    prefix. A row is kept when its ``Feature`` is not ``'AGE'`` or its ``fval``
    is below 90. ``return_result`` is accepted but not used; the function
    always returns None.
    """
    data_root = "/home/hoyinchan/blue/Data/data2021/data2021/"

    # Produce the per-configuration split file on disk.
    result_split(model, stg, site, year, oversample, fs, rmcol, return_result=False)

    # Rebuild the file name result_split used: join the tokens, trim an
    # underscore before the extension, then collapse doubled underscores.
    split_name = "_".join(['resultsplit', model, stg, site, year, fs, oversample, rmcol]) + '.pkl'
    split_name = split_name.replace("_.pkl", ".pkl")
    for _ in range(7):
        split_name = split_name.replace("__", "_")

    table = pd.read_pickle(data_root + split_name)

    not_age = table['Feature'] != 'AGE'
    below_90 = table['fval'] < 90
    deidentified = table.loc[not_age | below_90]

    # Save to file.
    deidentified.to_pickle(data_root + 'DEID_' + split_name)

In [None]:
def result_bt(stg='', fs='', oversample='', model_type='', numberbt=10, suffix='', return_result=False, site = '', year = '', rmcol=''):
    """Collect every matching ``boosttrap_*`` pickle below the working directory into one table.

    Parameters
    ----------
    stg, fs, oversample, model_type, site, year, rmcol
        Substring filters; a file is kept only when every token occurs in its
        full path (``year`` is passed through ``str``). Paths containing
        ``'drop'`` or ``'BACKUP'`` are always skipped.
    numberbt, suffix
        Accepted for interface compatibility; not used by this function.
    return_result : bool
        True  -> return the table.
        False -> write it to ``result_boosttrap.pkl`` in the working directory
        and return None.

    Returns
    -------
    pandas.DataFrame or None
        One row per file, with columns ``site, year, stg, fs, oversample,
        model, numberbt, modelobj, roc, cm``. Each pickle is handed to the
        DataFrame constructor as one row.
        # presumably each pickle holds a 10-item record in that order - TODO confirm
    """
    candidates = []
    for folder, _, _ in os.walk(os.getcwd()):
        candidates.extend(glob(os.path.join(folder, "boosttrap_*")))

    required = (model_type, stg, site, str(year), oversample, fs, rmcol)
    excluded = ('drop', 'BACKUP')
    files = [
        path for path in candidates
        if all(token in path for token in required)
        and not any(token in path for token in excluded)
    ]

    # Each file is now loaded once. It used to be unpickled twice, with the
    # first copy thrown away.
    # Pickle files are executed on load; only point this at trusted project output.
    records = []
    for file in files:
        print(file)
        records.append(pd.read_pickle(file))

    # Assemble one row per file.
    result = pd.DataFrame(records, columns =['site', 'year', 'stg', 'fs', 'oversample', 'model', 'numberbt', 'modelobj', 'roc', 'cm'])

    if return_result:
        return result

    result.to_pickle('result_boosttrap.pkl')

In [None]:
def _collect_shapdataraw(configs_variables, kind, year):
    """Concatenate the ``shapdataraw<kind>`` parquet of every (model site, data site) pair and save it."""
    tables = []
    datafolder = None
    for configs_variable_m in configs_variables:
        site_m, _, _ = utils_function.get_commons(configs_variable_m)
        datafolder = configs_variable_m['datafolder']
        stg = configs_variable_m['stg']
        fs = configs_variable_m['fs']
        oversample = configs_variable_m['oversample']
        model_type = configs_variable_m['model_type']
        suffix = 'nc' if configs_variable_m['drop_correlation_catboost'] else ''

        for configs_variable_d in configs_variables:
            site_d, _, _ = utils_function.get_commons(configs_variable_d)
            tmpdf = pd.read_parquet(datafolder+site_m+'/shapdataraw'+kind+'_'+model_type+'_'+site_m+'_'+site_d+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix+'.parquet')
            # Tag rows with the actual site names (the string literals
            # 'site_m' / 'site_d' used to be stored here instead).
            tmpdf['site_m'] = site_m
            tmpdf['site_d'] = site_d
            tables.append(tmpdf)

    combined = pd.concat(tables)
    # As before, the output goes to the datafolder of the last configuration.
    combined.to_parquet(datafolder+'/shapdataraw'+kind+'.parquet')


def collect_collectSHAPraw_cross_sub(configs_variables, year=3000):
    """Merge the per-pair 1d and 2d raw SHAP parquet files into two combined files.

    For every ordered pair of configurations (model site ``m``, data site
    ``d``) the file
    ``<datafolder><site_m>/shapdataraw{1d,2d}_<model>_<site_m>_<site_d>_<year>_<stg>_<fs>_<oversample><suffix>.parquet``
    is read, tagged with ``site_m`` / ``site_d`` columns and appended. The
    results are written to ``<datafolder>/shapdataraw1d.parquet`` and
    ``<datafolder>/shapdataraw2d.parquet``.

    Parameters
    ----------
    configs_variables : sequence of dict
        Each dict must provide ``datafolder``, ``stg``, ``fs``, ``oversample``,
        ``model_type`` and ``drop_correlation_catboost``; the first one must
        also provide ``rerun_flag``. Site names are obtained through
        ``utils_function.get_commons`` (first element of its return value).
    year : int or str, default 3000
        Year token of the file names. The original body read an undefined
        ``year`` name; the default mirrors the value hard-coded in
        ``collect_collectSHAPraw_cross_sub_pre``.
        # NOTE(review): confirm 3000 is the right token for these files.

    Notes
    -----
    Fixes relative to the previous body: ``datafolder`` was read before it was
    assigned (UnboundLocalError whenever ``rerun_flag`` is False), ``site_m`` /
    ``site_d`` were never defined, and the tag columns held literal strings.

    NOTE(review): the early-exit guard still looks for ``shapalltmp.parquet``,
    which this function does not write - confirm whether it should test for
    the two ``shapdataraw*.parquet`` outputs instead.
    """
    first = configs_variables[0]
    if not first['rerun_flag'] and os.path.exists(first['datafolder']+'/shapalltmp.parquet'):
        print('Existed: shapalltmp.parquet')
        return

    for kind in ('1d', '2d'):
        _collect_shapdataraw(configs_variables, kind, year)

In [None]:
def collect_collectSHAPraw_cross_sub_pre(configs_variables, top0=30):
    """Build ``shapalltmp.parquet``: SHAP values and inputs of the selected features for all site pairs.

    Steps
    -----
    1. Ask ``plot_utils.get_importances_features_stat`` for the importance
       table, take the first ``top0`` rows per site after sorting by ``rank``
       descending, and keep the ``top0`` feature ids that occur for the most
       sites.
       # NOTE(review): 'rank' is sorted descending - confirm larger rank means more important.
    2. For every ordered pair (model site ``m``, data site ``d``) read the
       ``shapdatarawX_*`` (inputs) and ``shapdataraw_*`` (SHAP values) parquet
       files, restricted to the selected features that are also columns of the
       model site's ``X_train_*`` pickle.
    3. Suffix the input columns with ``_Names`` and the SHAP columns with
       ``_vals``, put them side by side, tag with ``site_m`` / ``site_d`` and
       append.
    4. Concatenate everything and write ``<datafolder>/shapalltmp.parquet``
       (datafolder of the last configuration).

    Parameters
    ----------
    configs_variables : sequence of dict
        Each dict must provide ``datafolder``, ``stg``, ``fs``, ``oversample``,
        ``model_type``, ``drop_correlation_catboost`` and ``rerun_flag``.
    top0 : int
        Number of features per site, and overall, to keep.

    Raises
    ------
    ValueError
        If no site pair could be loaded.

    Notes
    -----
    * If no configuration sets ``rerun_flag`` and the output file already
      exists, the function now returns early. The old guard printed
      "Existed" but then recomputed everything anyway.
    * A pair whose files cannot be read is still skipped (best effort), but the
      reason is printed instead of being silently swallowed by a bare
      ``except``.
    """
    # Cache guard: skip all work when nobody asked for a rerun and the output exists.
    if configs_variables and not any(cfg['rerun_flag'] for cfg in configs_variables):
        cached_path = configs_variables[-1]['datafolder']+'/shapalltmp.parquet'
        if os.path.exists(cached_path):
            print('Existed: shapalltmp.parquet')
            return

    # Get top features.
    df_importances, _ = plot_utils.get_importances_features_stat(configs_variables)
    top_per_site = df_importances.sort_values('rank', ascending=False).reset_index().groupby('site').head(top0)
    top3030 = (
        top_per_site[['site', 'Feature Id']]
        .groupby('Feature Id')
        .count()
        .sort_values('site', ascending=False)
        .head(top0)
        .index
    )

    year = 3000  # year token used in every file name read below
    shap_finals = []
    datafolder = None

    for configs_variable_m in configs_variables:
        site_m, _, _ = utils_function.get_commons(configs_variable_m)
        datafolder = configs_variable_m['datafolder']
        stg = configs_variable_m['stg']
        fs = configs_variable_m['fs']
        oversample = configs_variable_m['oversample']
        model_type = configs_variable_m['model_type']
        suffix = 'nc' if configs_variable_m['drop_correlation_catboost'] else ''

        for configs_variable_d in configs_variables:
            site_d, _, _ = utils_function.get_commons(configs_variable_d)

            print('Running collectSHAPraw_cross_sub '+model_type+' on site '+site_m+'/'+site_d+":"+str(year)+":"+stg+":"+fs+":"+oversample, flush = True)

            try:
                # Only features the model site actually trained on can be read.
                columc_df = pd.read_pickle(datafolder+site_m+'/X_train_'+site_m+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix+'.pkl')
                feature_exists = list(set(columc_df.columns) & set(top3030))

                shapX = pd.read_parquet(datafolder+site_m+'/shapdatarawX_'+model_type+'_'+site_m+'_'+site_d+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix+'.parquet', columns=feature_exists)
                shap = pd.read_parquet(datafolder+site_m+'/shapdataraw_'+model_type+'_'+site_m+'_'+site_d+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix+'.parquet', columns=feature_exists)

                # Force the same column order on both frames.
                shapX = shapX[feature_exists]
                shap = shap[feature_exists]

                shapX.columns = shapX.columns+'_Names'
                shap.columns = shap.columns+'_vals'
                shap_final = pd.concat([shapX, shap], axis=1)

                shap_final['site_m'] = site_m
                shap_final['site_d'] = site_d
            except Exception as exc:
                # Best effort: a missing or unreadable pair must not abort the run.
                print('Skipped '+site_m+'/'+site_d+': '+repr(exc), flush = True)
                continue

            shap_finals.append(shap_final)

    if not shap_finals:
        raise ValueError('collect_collectSHAPraw_cross_sub_pre: no site pair could be loaded')

    shap_finalX = pd.concat(shap_finals)
    shap_finalX.to_parquet(datafolder+'/shapalltmp.parquet')