In [None]:
import importlib
import ipynb.fs.full.preprocessing0
import ipynb.fs.full.preprocessing1
import ipynb.fs.full.preprocessing2_BT
import ipynb.fs.full.preprocessing3_smote
import ipynb.fs.full.runxgboost
import ipynb.fs.full.postprocessing1_SHAP
import ipynb.fs.full.postprocessing3_collect

import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import time
import pickle
import math
import operator

import inspect

In [None]:
# Option lists for the experiment grid. generate_parameter_file() expands
# them into one parameter record per combination, site and admission year.
models_type = [
    'catd',
    'catcv',
    'catr',
    'xgbshg',
    'xgbb',
]
sites = [
    'MCRI', 'IUR', 'MCW', 'UIOWA', 'UMHC', 'UNMC',
    'UofU', 'UPITT', 'UTHSCSA', 'KUMC', 'UTSW',
]
stgs = ['stg01', 'stg123']
fss = ['nofs', 'rmscrbun']
oversamples = ['raw', 'cp', 'sm']
rmcols = ['raw', '0.05']
# (site, year) pairs excluded from the grid: sample size too small.
ban_list = [
    ('UPITT', 2013),
    ('UPITT', 2012),
    ('MCW', 2011),
]

In [None]:
def generate_parameter_file(models_type, sites, stg, fss, oversamples, rmcols, ban_list):
    """Build one parameter dict per combination of the experiment options.

    For each site, the admission years are taken from the ``ADMIT_DATE``
    column of ``data/<site>/p0_onset_<site>.pkl`` (path relative to the
    current working directory). Every combination of model type, stage,
    feature selection, oversampling, column-removal setting and year becomes
    one record, except (site, year) pairs listed in ``ban_list``.

    Parameters
    ----------
    models_type, stg, fss, oversamples, rmcols : iterable
        Option values to combine.
    sites : iterable of str
        Site names; each needs its onset pickle on disk.
    ban_list : iterable of (site, year)
        Pairs to leave out of the result.

    Returns
    -------
    list of dict
        Records with keys ``model_type``, ``site``, ``year``, ``stg``, ``fs``,
        ``oversample`` and ``rmcol``. ``year`` is a plain ``int``.
    """
    # Set membership instead of scanning the ban list for every combination.
    banned = {tuple(pair) for pair in ban_list}
    print(sites)
    para_dicts = []
    for s in sites:
        onset = pd.read_pickle('data/'+s+'/p0_onset_'+s+'.pkl')
        # A missing ADMIT_DATE turns the year column into floats (NaN, 2013.0),
        # which would leak into records and file names and never match the
        # integer years in ban_list. Drop the gaps and keep plain ints.
        admit_years = pd.to_datetime(onset['ADMIT_DATE']).dt.year.dropna().unique()
        years = [int(y) for y in admit_years]
        para_dicts.extend(
            {'model_type': m, 'site': s, 'year': y, 'stg': st, 'fs': fs, 'oversample': o, 'rmcol': r}
            for m in models_type
            for st in stg
            for fs in fss
            for o in oversamples
            for r in rmcols
            for y in years
            if (s, y) not in banned
        )
    return para_dicts

In [None]:
def generate_parameter_file_additional(para_list):
    """Attach the file paths used by each parameter record.

    Every dict in ``para_list`` must already hold ``model_type``, ``site``,
    ``year``, ``stg``, ``fs``, ``oversample`` and ``rmcol``. The dicts are
    modified in place; the same list object is returned.

    Paths added per record (all relative):
      * ``p0_<table>``  - per-site tables under ``data/<site>/``
      * ``<table>``     - per-site-and-year tables, including ``bt``
      * ``X_train`` / ``X_test`` / ``y_train`` / ``y_test`` - keyed by
        site, year, stg, fs and oversample
      * ``model`` / ``shapdata`` - additionally keyed by model type
      * ``parafile``    - under ``modelparameter/``, the only path that
        also carries ``rmcol``
    """
    per_site_tables = ('onset', 'vital', 'demo', 'dx', 'px', 'lab', 'amed')
    per_year_tables = ('onset', 'vital', 'demo', 'dx', 'px', 'labnum', 'labcat', 'amed', 'bt')
    split_names = ('X_train', 'X_test', 'y_train', 'y_test')
    model_artifacts = ('model', 'shapdata')

    for record in para_list:
        model_type = record['model_type']
        site = record['site']
        year = record['year']
        stg = record['stg']
        fs = record['fs']
        oversample = record['oversample']
        rmcol = record['rmcol']

        site_dir = 'data/' + site + '/'
        site_year = site + '_' + str(year)
        variant = site_year + '_' + stg + '_' + fs + '_' + oversample

        for table in per_site_tables:
            record['p0_' + table] = site_dir + 'p0_' + table + '_' + site + '.pkl'

        for table in per_year_tables:
            record[table] = site_dir + table + '_' + site_year + '.pkl'

        for split in split_names:
            record[split] = site_dir + split + '_' + variant + '.pkl'

        for artifact in model_artifacts:
            record[artifact] = site_dir + artifact + '_' + model_type + '_' + variant + '.pkl'

        record['parafile'] = 'modelparameter/parafile_' + model_type + '_' + variant + '_' + rmcol + '.pkl'

    return para_list

In [None]:
def save_para_list(paralist):
    """Persist the parameter records to disk.

    Writes the whole list as a DataFrame pickle to ``paralist.pkl`` in the
    current working directory, then pickles every record on its own to the
    path stored under its ``'parafile'`` key.

    Parameters
    ----------
    paralist : list of dict
        Records that each contain a ``'parafile'`` path.

    Raises
    ------
    KeyError
        If a record has no ``'parafile'`` entry.
    """
    pd.DataFrame(paralist).to_pickle('paralist.pkl')
    for p in paralist:
        target = p['parafile']
        # Create the output directory on first use so a fresh checkout
        # does not fail with FileNotFoundError.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Context manager closes the handle even if pickling fails.
        with open(target, 'wb') as handle:
            pickle.dump(p, handle)

In [None]:
if __name__ == "__main__":
    # Expand the module-level option lists into the full grid, attach the
    # file paths each record needs, then write everything to disk.
    # The stage list is defined at module level as `stgs`; the previous `stg`
    # was not defined anywhere and raised NameError on a fresh kernel.
    para_list = generate_parameter_file(models_type, sites, stgs, fss, oversamples, rmcols, ban_list)
    para_list = generate_parameter_file_additional(para_list)
    save_para_list(para_list)

In [None]:
def generate_site_year(sites=None):
    """Return the distinct (site, year) pairs stored in ``paralist.pkl``.

    Parameters
    ----------
    sites : str or iterable of str, optional
        Restrict the result to these sites. ``None`` (the default) returns
        the pairs for every site in the file.

    Returns
    -------
    list of dict
        One ``{'site': ..., 'year': ...}`` record per distinct pair, in
        order of first appearance in ``paralist.pkl``.
    """
    pl = pd.read_pickle('paralist.pkl')
    pairs = pl.loc[:, ['site', 'year']].drop_duplicates()
    # The argument used to be overwritten with a hard-coded ['UTSW', 'KUMC'],
    # which ignored the caller's choice and made the "all sites" path
    # unreachable. Pass sites=['UTSW', 'KUMC'] explicitly to get that subset.
    if sites is None:
        return pairs.to_dict(orient='records')
    if isinstance(sites, str):
        # isin() rejects a bare string; treat it as a single site.
        sites = [sites]
    return pairs[pairs['site'].isin(list(sites))].to_dict(orient='records')

In [None]:
def generate_site():
    """Return the distinct site names stored in ``paralist.pkl``.

    Returns
    -------
    list
        Site values in order of first appearance. No filtering is applied;
        the unused hard-coded ``['UTSW', 'KUMC']`` local that suggested
        otherwise has been removed.
    """
    pl = pd.read_pickle('paralist.pkl')
    return pl.loc[:, 'site'].drop_duplicates().tolist()

In [None]:
def generate_site_runner(runners, sites=None):
    """Pair every runner with every site.

    Parameters
    ----------
    runners : iterable
        Values stored under ``'runner_wrapper'``.
    sites : iterable, optional
        Site names; when ``None`` they are taken from ``generate_site()``.

    Returns
    -------
    list of dict
        ``{'runner_wrapper': runner, 'site': site}`` records, grouped by
        runner in input order.
    """
    site_names = generate_site() if sites is None else sites
    jobs = []
    for runner in runners:
        for site_name in site_names:
            jobs.append({'runner_wrapper': runner, 'site': site_name})
    return jobs

In [None]:
def generate_site_year_runner(runners, sites=None):
    """Pair every runner with every (site, year) record.

    Parameters
    ----------
    runners : iterable
        Values stored under ``'runner_wrapper'``.
    sites : optional
        Forwarded unchanged to ``generate_site_year``.

    Returns
    -------
    list of dict
        ``{'runner_wrapper': runner, 'site': ..., 'year': ...}`` records,
        grouped by runner in input order.
    """
    site_year_records = generate_site_year(sites)
    jobs = []
    for runner in runners:
        for record in site_year_records:
            jobs.append({
                'runner_wrapper': runner,
                'site': record['site'],
                'year': record['year'],
            })
    return jobs

In [None]:
def generate_xgbcat_runner(runners, model_type=None, site=None, stg=None, fs=None, oversample=None, rmcol=None):
    """Build runner records for each (site, year) and option combination.

    Parameters
    ----------
    runners : object
        Stored as-is under ``'runner_wrapper'`` in every record (it is not
        iterated).
    model_type : iterable, optional
        Defaults to ``['catd']``. See the note below.
    site : str or iterable of str, optional
        Sites forwarded to ``generate_site_year``; ``None`` is passed through
        unchanged.
    stg : iterable, optional
        Defaults to ``['stg01']``.
    fs : iterable, optional
        Defaults to ``['nofs', 'rmscrbun']``.
    oversample : iterable, optional
        Defaults to ``['raw', 'cp']``.
    rmcol : optional
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    list of dict
        Records with keys ``runner_wrapper``, ``site``, ``year``, ``stg``,
        ``fs`` and ``oversample``.
    """
    if isinstance(site, str):
        site = [site]
    # Use the caller's `site` argument. The previous code passed the
    # module-level `sites` list, so this parameter was silently ignored.
    sites_yr = generate_site_year(site)
    if model_type is None:
        model_type = ['catd']
    if stg is None:
        stg = ['stg01']
    if fs is None:
        fs = ['nofs', 'rmscrbun']
    if oversample is None:
        oversample = ['raw', 'cp']
    # NOTE(review): the model type is iterated but not stored in the record,
    # so more than one model type yields identical duplicate records. Kept
    # as-is to preserve the output schema - confirm whether a 'model_type'
    # key is wanted.
    return [
        {'runner_wrapper': runners, 'site': sy['site'], 'year': sy['year'], 'stg': st, 'fs': f, 'oversample': o}
        for sy in sites_yr
        for _ in model_type
        for st in stg
        for o in oversample
        for f in fs
    ]