In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import time
import pickle
import math
import matplotlib.pyplot as plt
from glob import glob

In [None]:
# Run configuration: string tags that identify which artifacts to process.
# The same kinds of tags are concatenated into the pickle/parquet file names
# used by prenetagam() below.
stg = "stg23"
# fs = "rmscrbun"  # alternative value for `fs`, currently disabled
fs = "nofs"
oversample = "raw"
model = "catd"
rmcol = "005"
year = "3000"  # kept as a string here; prenetagam() applies str() to its own `year` argument

def _decode_feature(feature):
    """Return the bare code of a raw feature label.

    Keeps the text after the last ':' and before the first '(' ;
    e.g. 'LAB::2075-0(OT)' becomes '2075-0'.
    """
    return feature.split(':')[-1].split('(')[0]


def _rank_top_features(shap_data, importance_type='Importances', max_num_features=30):
    """Build a wide table of the top-ranked decoded features per (site, year).

    Parameters
    ----------
    shap_data : pandas.DataFrame
        Needs the columns 'site', 'year', 'featuredecode' and `importance_type`.
    importance_type : str
        Column used to order features (largest first).
    max_num_features : int
        Maximum number of ranks kept per (site, year).

    Returns
    -------
    pandas.DataFrame
        Indexed by (site, year), one column per rank (1-based), values are
        decoded feature names. Site-years with fewer than `max_num_features`
        distinct features get NaN in the missing ranks.
    """
    ranked = (
        shap_data
        .sort_values(['site', 'year', importance_type], ascending=False)
        .loc[:, ['site', 'year', 'featuredecode']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # Rank inside each site-year instead of tiling a fixed 1..N list, so groups
    # with fewer than N distinct features no longer cause a length mismatch.
    ranked['rank'] = ranked.groupby(['site', 'year']).cumcount() + 1
    ranked = ranked.loc[ranked['rank'] <= max_num_features]
    return ranked.pivot(index=['site', 'year'], columns='rank', values='featuredecode')


def _find_raw_files(start_dir, pattern, required, forbidden):
    """Recursively collect files under `start_dir` matching `pattern`.

    A path is kept when it contains every substring in `required` and none of
    the substrings in `forbidden`. The result is sorted so that downstream
    concatenation order does not depend on directory traversal order.
    """
    matches = []
    for dirpath, _, _ in os.walk(start_dir):
        matches.extend(glob(os.path.join(dirpath, pattern)))
    return sorted(
        path for path in matches
        if all(token in path for token in required)
        and not any(token in path for token in forbidden)
    )


def prenetagam(stg, fs, oversample='raw', model='catd', rmcol='005', year=3000, maxnum_features=40, maxmax_feature=40):
    """Select the most frequently top-ranked features and export their data as parquet.

    Steps:
      1. Load the result summary pickle
         '../DEID_resultsplit_<model>_<stg>_<year>_<fs>_<oversample>_<rmcol>.pkl'.
      2. Decode feature labels and, for every (site, year), rank features by the
         'Importances' column, keeping at most `maxnum_features` per site-year.
      3. Keep the `maxmax_feature` features that appear in the most site-year
         top lists.
      4. Write the summary rows of those features to 'metadata_<stg>_<fs>_.parquet'
         and a mean-centred variant to 'metadata_mean_<stg>_<fs>_.parquet'.
      5. Collect every 'shapdataraw*' pickle under '../data' whose path contains
         all of model, stg, oversample, fs, rmcol and str(year) (and neither
         'drop' nor 'BACKUP'), restrict each to the selected features, tag it
         with site/year parsed from the file name, and write the concatenation
         to 'metadata_raw_<stg>_<fs>_.parquet'.

    Parameters
    ----------
    stg, fs, oversample, model, rmcol : str
        Tags used to build/filter file names.
    year : int or str
        Year tag; converted with str() wherever it is used.
    maxnum_features : int
        Per site-year cap on ranked features.
    maxmax_feature : int
        Number of most frequent features to keep overall (fewer are kept if
        fewer distinct features exist).

    Returns
    -------
    None
        Output is written to parquet files in the current working directory.

    Raises
    ------
    ValueError
        If no raw file matches the filters.
    """
    year_tag = str(year)

    # NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
    # The removed-column tag comes from `rmcol` so it agrees with the raw-file filter below.
    result = pd.read_pickle(
        '../DEID_resultsplit_' + model + '_' + stg + '_' + year_tag + '_' + fs
        + '_' + oversample + '_' + rmcol + '.pkl'
    )

    # Map every raw feature label to its decoded form and attach it to the summary.
    decodetable = pd.DataFrame({'Feature': result['Feature'].unique()})
    decodetable['featuredecode'] = decodetable['Feature'].map(_decode_feature)
    result = pd.merge(result, decodetable, on='Feature', how='left')

    # Features that occur most often among the per-site-year top lists.
    topnfeature = _rank_top_features(result, importance_type='Importances', max_num_features=maxnum_features)
    # value_counts() ignores the NaN padding of short site-years; head() is
    # positional and tolerates fewer than maxmax_feature distinct features.
    toptopfeatureN = topnfeature.melt()['value'].value_counts().head(maxmax_feature)
    toptopfeature = pd.DataFrame({'featuredecode': list(toptopfeatureN.index)})

    result = result.drop(['valCI95_0', 'valCI95_1', 'absvalCI95_0', 'absvalCI95_1'], axis=1, errors='ignore')

    # Summary rows of the selected features.
    curves = pd.merge(result, toptopfeature, on='featuredecode', how='right')
    curves_new = curves.drop('Feature', axis=1)
    curves_new.to_parquet('metadata_' + stg + '_' + fs + '_.parquet', compression=None)

    # Centre mean_val per (site, year, featuredecode) using non-categorical rows only.
    curve_key = ['site', 'year', 'featuredecode']
    curves_mean = (
        curves_new.loc[curves_new['isCategorical'] == False, curve_key + ['mean_val']]
        .groupby(curve_key)
        .mean()
        .reset_index()
        .rename(columns={'mean_val': 'mean_mean_val'})
    )
    # NOTE(review): rows whose key has no non-categorical entry get NaN for
    # mean_mean_val, so their mean_val becomes NaN here - confirm this is intended.
    curves_new_mean = (
        pd.merge(curves_new, curves_mean, on=curve_key, how='left')
        .assign(mean_val=lambda x: x.mean_val - x.mean_mean_val)
        .drop('mean_mean_val', axis=1)
    )
    curves_new_mean.to_parquet('metadata_mean_' + stg + '_' + fs + '_.parquet', compression=None)

    # Assemble raw data
    files = _find_raw_files(
        "../data",
        "shapdataraw*",
        required=(model, stg, oversample, fs, rmcol, year_tag),
        forbidden=('drop', 'BACKUP'),
    )
    for file in files:
        print(file)
    if not files:
        raise ValueError(
            "No 'shapdataraw*' file under '../data' matches model=%r, stg=%r, oversample=%r, "
            "fs=%r, rmcol=%r, year=%r" % (model, stg, oversample, fs, rmcol, year_tag)
        )

    # Raw labels ('Feature') of the selected decoded features.
    newfeaturecode = pd.DataFrame({'featuredecode': curves_new_mean['featuredecode'].unique()})
    newdecodetable = pd.merge(newfeaturecode, decodetable, on='featuredecode', how='left')

    frames = []
    for position, path in enumerate(files, start=1):
        print(position, len(files))
        dft = pd.merge(newdecodetable, pd.read_pickle(path), on='Feature', how='left')
        # Parse site/year from the file name only, so underscores in directory
        # names cannot shift the fields: shapdataraw_<model>_<site>_<year>_...
        name_parts = os.path.basename(path).split('_')
        site, file_year = name_parts[2], name_parts[3]
        dft['site'] = site
        dft['year'] = file_year
        dft['siteyear'] = site + file_year
        frames.append(dft)

    dfcollect = pd.concat(frames)
    dfcollect.to_parquet('metadata_raw_' + stg + '_' + fs + '_.parquet', compression=None)

In [None]:
# Run the export once for every (stg, fs) combination listed below.
stgs = ["stg23"]
fss = ['nofs']
for stg, fs in itertools.product(stgs, fss):
    prenetagam(
        stg,
        fs,
        oversample='raw',
        model='catd',
        rmcol='005',
        year=3000,
        maxnum_features=40,
        maxmax_feature=40,
    )

In [None]:
# import multiprocessing

# def worker(f):
#     print(f)
#     dft = pd.read_pickle(f)
#     dft = pd.merge(newdecodetable, dft, right_on='Feature', left_on='Feature', how='left')    
#     dft['site'] = f.split('_')[2]
#     dft['year'] = f.split('_')[3]
#     dft['siteyear'] = f.split('_')[2]+f.split('_')[3]
#     return dft

# dfcollect = list()
# newfeaturecode = pd.DataFrame(curves_new_mean['featuredecode'].unique())
# newfeaturecode.columns = ['featuredecode']
# newdecodetable = pd.merge(newfeaturecode, decodetable, right_on='featuredecode', left_on='featuredecode', how='left')

# pool = multiprocessing.Pool(processes = 16)
# dfcollect = pool.map(worker, files)
# dfcollect = pd.concat(dfcollect)
# if ckd_group != 0:
#     dfcollect = dfcollect[dfcollect['ckd_group'] == ckd_group]
# dfcollect.to_parquet('metadata_raw_'+str(ckd_group)+'.parquet', compression=None)

In [None]:
#dfcollect = pd.read_parquet('metadata_raw.parquet')

In [None]:
#dfcollect[dfcollect['Feature']=='LAB::2075-0(OT)']

In [None]:
# #Check Curve Domain
# domain_check = curves_new_mean.loc[curves_new_mean['isCategorical']==False].loc[:,['site', 'year', 'featuredecode', 'fval']].groupby(['site', 'year', 'featuredecode']).agg(['min','max']).reset_index()
# domain_check.columns = ['site', 'year', 'featuredecode', 'fvalmin', 'fvalmax']
# domain_check = domain_check.astype({'year':'str'}).assign(siteyear=lambda x: x.site + x.year)
# domain_checkX = domain_check.loc[domain_check['featuredecode']=='2823-3']

In [None]:
# plt.bar(domain_checkX['siteyear'], domain_checkX['fvalmin'])

In [None]:
# plt.bar(domain_checkX['siteyear'], domain_checkX['fvalmax'])

# chk1=pd.read_pickle('/home/hchan2/AKI/AKI_Python/data/KUMC/shapdataraw_catd_KUMC_3000_stg01_rmscrbun_raw_005.pkl')
# chk2=pd.read_pickle('/home/hchan2/AKI/AKI_Python/data/KUMC/shapdata_catd_KUMC_3000_stg01_rmscrbun_raw_005.pkl')

In [None]:
#chk2.columns

## 