In [27]:
import pandas as pd
import numpy as np
import os
from tqdm import notebook
from joblib import Parallel, delayed
from collections import Counter
import time
from sklearn.model_selection import StratifiedShuffleSplit, KFold, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

## Version as of 2021-1-1

#csv_dir = '/home/ghhur/data/input' # '../ghhur_data_input/' in old system
#input_dir = '/home/ghhur/data/cohort'
#output_dir = '/home/ghhur/data/1_processed'


csv_dir = '/home/ghhur/data/csv' # '../ghhur_data_input/' in old system
input_dir = '/home/ghhur/data/input'
output_dir = '/home/ghhur/data/output'

In [2]:
def dirmake(path):
    if not os.path.isdir(path):
        os.mkdir(path)
    elif os.path.isdir(path):
        print('path {} already exist!'.format(path))

In [3]:
#csv_dir check
dirmake(csv_dir)
dirmake(os.path.join(csv_dir,'mimic'))
dirmake(os.path.join(csv_dir,'eicu'))
mimic_check = os.path.isfile(os.path.join(csv_dir, 'mimic_cohort_dx_done.pk'))
eicu_check = os.path.isfile(os.path.join(csv_dir, 'eicu_cohort_dx_done.pk'))
if mimic_check and eicu_check:
    print('cohort file is opened!')
    
else:
    print('cohort file location is wrong, please put mimic & eicu cohort file on csv directory')
    raise NotImplementedError

path /home/ghhur/data/csv already exist!
path /home/ghhur/data/csv/mimic already exist!
path /home/ghhur/data/csv/eicu already exist!
cohort file is opened!


In [4]:
print('current working directory is...  "{}"'.format(os.getcwd()))
#output dir generate
task_list = ['readmission', 'mortality', 'los_3days', 'los_7days', 'dx_dpeth1_unique']
model_type_list = ['singleRNN', 'cls_learnable']
source_file_list = ['mimic', 'eicu', 'both']
dirmake(output_dir)
dirmake(os.path.join(output_dir, 'all'))
for model_type in model_type_list:
    tmp_dir_model = os.path.join(output_dir, 'all', model_type)
    dirmake(tmp_dir_model)
    for source_file in source_file_list:
            tmp_dir_source_file = os.path.join(tmp_dir_model,source_file)
            dirmake(tmp_dir_source_file)
            for task in task_list:
                tmp_dir_task = os.path.join(tmp_dir_source_file, task)
                dirmake(tmp_dir_task)
print('Making output directory finish!')
#input dir generate
input_dir = './input'
dirmake(input_dir)
dirmake(os.path.join(input_dir, 'all'))
dirmake(os.path.join(input_dir, 'all', 'embed_vocab_file'))
print('Making input directory finish!')

current working directory is...  "/home/ghhur/preprocess_code"
Making output directory finish!
path ./input already exist!
path ./input/all already exist!
path ./input/all/embed_vocab_file already exist!
Making input directory finish!


In [5]:
sources = ['mimic','eicu']
items= ['lab','med','inf'] # items = ['dx','lab','med','trt','chart','inf']

mimic_csv_files = {'lab':['LABEVENTS'], 'med':['PRESCRIPTIONS'],  # mimic dictionary
                        'inf': ['INPUTEVENTS_CV', 'INPUTEVENTS_MV']} 
eicu_csv_files = {'lab':['lab'], 'med':['medication'],'inf':['infusionDrug']}
                  # eicu dictionary
                 

mimic_dictionary_file = {'LABEVENTS':'D_LABITEMS', 
                         'INPUTEVENTS_CV':'D_ITEMS', 'INPUTEVENTS_MV':'D_ITEMS'}
#eicu chartevent 없음

In [6]:
mimic_columns_map = {'LABEVENTS':
                         {'HADM_ID':'ID','CHARTTIME':'order_time','ITEMID':'code_name',
                          'FLAG':'issue'},
                     'PRESCRIPTIONS':
                         {'HADM_ID':'ID','STARTDATE':'order_time',
                          'DRUG':'code_name', 'ROUTE':'route', 'PROD_STRENGTH':'prod'},                                      
                      'INPUTEVENTS_CV': 
                         {'HADM_ID':'ID','CHARTTIME':'order_time', 
                          'ITEMID':'code_name','AMOUNTUOM':'value_uom', 'RATEUOM':'rateuom',
                          'STOPPED':'issue'},
                      'INPUTEVENTS_MV': 
                         {'HADM_ID':'ID', 'STARTTIME':'order_time','AMOUNTUOM':'value_uom', 'RATEUOM':'rateuom',
                          'ITEMID':'code_name', 'STOPPED':'issue'}
                    }

In [7]:
eicu_columns_map =  {'lab':
                         {'patientunitstayid':'ID', 'labresultoffset':'order_offset','labname':'code_name'},
                     'medication':
                         {'patientunitstayid':'ID','drugstartoffset':'order_offset','drugname':'code_name', 'routeadmin':'route',
                          'ordercancelled':'issue'},      
                      'infusionDrug':
                         {'patientunitstayid':'ID','infusionoffset':'order_offset', 'drugname':'code_name'}
                    }

In [8]:
issue_map = {'LABEVENTS': ['abnormal'],                            
             'INPUTEVENTS_CV':['Restart','NotStopd'] ,
             'INPUTEVENTS_MV': ['Rewritten', 'Changed', 'Paused', 'Flushed', 'Stopped'],
             'medication': ['Yes'],          
            }

# Preprocessing for files

# Preprocess_1st

In [9]:
'''
Input : argument -> output : df, cohort
processing:
column_rename-> cohort_filtering -> issue_delete -> if mimic, name_dict -> df, cohort
'''

'\nInput : argument -> output : df, cohort\nprocessing:\ncolumn_rename-> cohort_filtering -> issue_delete -> if mimic, name_dict -> df, cohort\n'

In [10]:
class data_init():
    def __init__(self, file:str, src:str, item:str, columns_map):
        self.columns_map = columns_map
        self.file = file
        self.src = src
        self.item = item
        self.input_folder = os.path.join(csv_dir,src)
        df_path = os.path.join(self.input_folder,file+'.csv')
        cohort_path = os.path.join(input_dir, src+'_cohort_dx_done.pk')
        # Read in cohort pickle and appropriate csv file
        self.cohort = pd.read_pickle(cohort_path).reset_index(drop=True)
        print('cohort load finish!')
        self.df = pd.read_csv(df_path)
        print('csv file load finish!')
        if self.src == 'mimic':
            self.cohort=self.cohort.rename({'HADM_ID':'ID'},axis='columns') # rename ID columns in pickle as needed
        elif self.src == 'eicu':
             self.cohort=self.cohort.rename({'patientunitstayid':'ID'},axis='columns')
        
    def column_rename(self, df):      # for similar col names
        df = df.rename(self.columns_map[self.file], axis='columns')
        return df                          
        
    def cohort_filtering(self, df):    # take only the observations from the .csv file which are in the cohort
        df_id=df['ID']
        cohort_id=self.cohort['ID']
        df = df[df_id.isin(cohort_id)].reset_index(drop=True)
        
        cohort = self.cohort[cohort_id.isin(df_id)].reset_index(drop=True) # drop cohort obs which !in(csv)
    #    df.replace(' ', '_', regex=True, inplace=True) <-- earlier preprocessing code?
       
        if 'ICUSTAY_ID' in df.columns: # drop all with missing ICUSTAY_ID (mimic)
            df = df.loc[df['ICUSTAY_ID'].isnull()==False].reset_index(drop=True)
        if 'start_time' in df.columns: # if null end_time, input start_time (from e.g. mimic prescriptions)
            indexes = df[df['end_time'].isnull()==True]['start_time'].index
            df['end_time'][indexes] = df[df['end_time'].isnull()==True]['start_time']
        df.fillna('null', inplace=True)
        return df, cohort
    
    
    def issue_delete(self, df): # e.g. drop if 'order_cancelled'==yes in eicu medications
        if 'issue' in df.columns:
            issue_label = issue_map[self.file]
            df.drop(df[df['issue'].isin(issue_label)].index, inplace=True)
        return df
    
    def name_dict(self, df): # for mimic files, create code_name from appropriate dictionary file (key e.g. ICD9)
        if self.file in mimic_dictionary_file:
            dict_name=mimic_dictionary_file[self.file]
            dict_path = os.path.join(self.input_folder, dict_name+'.csv')
            code_dict = pd.read_csv(dict_path)
            if dict_name in ['D_ICD_DIAGNOSES', 'D_ICD_PROCEDURES']:
                key = code_dict['ICD9_CODE']
                value = code_dict['LONG_TITLE']
            else:
                key = code_dict['ITEMID']
                value = code_dict['LABEL']
            code_dict = dict(zip(key,value))
            df['code_name'] = df['code_name'].map(code_dict)
        return df          
                
    def __call__(self):        
        print('column_rename start!')
        df = self.column_rename(self.df)
        print('column_rename finish!')
        print('cohort_filtering start!')
        df, cohort = self.cohort_filtering(df)
        print('cohort_filtering finish!')
        print('issue_delete start!')
        df = self.issue_delete(df)
        print('issue_delete finish!')
        if self.src == 'mimic':
            print('name_dict start!')
            df = self.name_dict(df)
            print('name_dict finish!')
        return df, cohort

In [11]:
class list_preparation():
    def __init__(self, df:pd.DataFrame, cohort:pd.DataFrame,  src:str, code:str, file:str, columns_map):
        self.columns_map = columns_map
        self.file = file
        self.df = df.reset_index(drop=True) # our filtered df, now with med names if needed
        self.cohort = cohort
        self.src = src
        if src =='mimic':
            self.INTIME = pd.to_datetime(self.cohort['INTIME']) # for manipulation later
            self.OUTTIME = pd.to_datetime(self.cohort['OUTTIME']) 
#####################################################################################################   
    def generate_offset(self, item_list:list, df):
        '''
        Input : time -> output : offset
        (order_time - INTIME) and make time as min
        
        time_list : example [order_time, start_time, end_time ]
        offset_list : ex [order_offset, start_offset, end_offset]
        '''
        self.cohort = self.cohort.reset_index(drop=True)
        time_list = ['{}_time'.format(item) for item in item_list] 
        offset_list = ['{}_offset'.format(item) for item in item_list] # names of cols
        offset_dict = {}
        index_dict = {}
        #Empty dict for offset, empty list for index
        for idx, item_time in enumerate(time_list):    # time_list = [start_time, end_time] e.g. for PRESCRIPTIONS
            df[item_time] = pd.to_datetime(df[item_time])
            offset_dict[offset_list[idx]]=[]
            index_list =[]
        # from cohort ID, take INTIME
        for row, ID in enumerate(notebook.tqdm(self.cohort['ID'])): # w/ progress bar
            one_id_rows = df[df['ID']==ID]        
            index_list.extend(list(one_id_rows.index)) # add index of one_id_rows
            for idx, item_time in enumerate(time_list):
                offset_series=one_id_rows[item_time].apply(lambda x: round((x-self.INTIME[row]).total_seconds()/60))               
                offset_dict[offset_list[idx]].extend(list(offset_series))
        
        df= pd.concat([df, pd.DataFrame(offset_dict, index=index_list, columns=offset_list)], axis=1).reset_index(drop=True)
        
        return df        
    
    def time_filtering(self, item, df):
        ''' 
        Input : time -> output : timne
        time filtering by INTIME , OUTTIME
        '''
        series = pd.Series()
        for row, ID in enumerate(notebook.tqdm(self.cohort['ID'])):
            timestamp_in = self.INTIME[row]
            timestamp_out = self.OUTTIME[row]
            times = df.loc[df['ID'] == ID][item]  # from <  for item in ['start_time','end_time'] >
            series = series.append((timestamp_in <= times) & (times <= timestamp_out)) # T/F
        df = df.loc[series].reset_index(drop=True)
        
        return df
############################################################################################################
    
    def charttime_offset(self, df):
        if 'start_time' in df.columns:
            df['order_time'] = pd.to_datetime(df['start_time']) #start_time 을 order_time으로
            df = self.generate_offset(['order', 'start', 'end'], df)
        
        elif 'order_time' in df.columns:    
            df = self.generate_offset(['order'], df)        
        
        return df.reset_index(drop=True)    
       
   
    def time_filter(self, df):   
        if self.src == 'mimic':
            if 'start_time' in df.columns:
                for item in ['start_time','end_time']:
                    df[item] = pd.to_datetime(df[item])
                    df = self.time_filtering(item, df)
            elif 'order_time' in df.columns:
                    df['order_time'] = pd.to_datetime(df['order_time'])
                    df = self.time_filtering('order_time', df)          
    
        elif self.src == 'eicu': 
            if 'order_offset' in df.columns:
                df = df.loc[df['order_offset'] >= 0].reset_index(drop=True)   
            elif 'start_offset' in df.columns:
                #start_offset -> order_offset copy
                df = df.loc[df['start_offset'] >= 0].reset_index(drop=True)
                df = df.loc[df['end_offset'] >= df['start_offset']].reset_index(drop=True)
                df['order_offset'] = df['start_offset']
                  
        
        return df.reset_index(drop=True)    

    def list_make_sort(self, df):
            cohort = self.cohort.reset_index(drop=True)  
            columns = self.columns_map[self.file]
            columns_names = [value for key, value in columns.items() if value not in ['order_offset','ID','order_time','issue']]
            columns_names.append('order_offset') # take .csv file columns + order_offset
            columns_dict = {}
            for column in columns_names:
                columns_dict[column]=[]# dictionary w/ column names as keys
                if df[column].dtype =='O':
                    df[column] = df[column].str.lower()
            for row, ID in enumerate(notebook.tqdm(self.cohort['ID'])):
                one_id_rows = df.loc[df['ID']==ID] # take each ID
                sort_by_offset = one_id_rows.sort_values(by='order_offset', ascending=True) # sort events by time since admission
                for column in columns_names:
                    columns_dict[column].append(list(sort_by_offset[column].values)) # append these as value list to key
                
            for column in columns_names:
                list_column=columns_dict[column] # take the key:value pairs one by one 
                series_list = pd.Series(list_column) # make it a series 
                cohort[column] = series_list # append it to cohort dataframe as a new column w/ same name as in df
            
            return cohort
    
    def __call__(self):        
        print('time_filter start!')
        df = self.time_filter(self.df)
        print('time_filter finished!')
        if self.src == 'mimic':
            print('charttime_offset start!')
            df = self.charttime_offset(df)   
            print('charttime_offset finished!')
        print('list_make_sort start!')  
        cohort = self.list_make_sort(df)
        print('list_make_sort finished!')
        return cohort

# Medication align & INPUTEVENTS Merge

In [12]:
### 1: Concat mimic med
#mimic : code_name, prod, value, value_uom, route
def medication_align(src, med):
    mimic_list = ['code_name', 'prod', 'route']
    eicu_list = ['code_name', 'route']
    if src == 'mimic':
        for column in mimic_list:
            med_list = []
            for column_list in med[column]: 
                revised_column_list = ['[UNK]' if x=='null' else x for x in column_list]
                med_list.append(revised_column_list)
            med[column] = pd.Series(med_list)
        med = med.rename(columns={'code_name':'code_name_old'})
        med['code_name'] = pd.Series([[i+' '+j+' '+k for i,j,k in zip(med['code_name_old'].iloc[z], \
                                                                             med['prod'].iloc[z], \
                                                                             med['route'].iloc[z])] \
                                       for z in range(len(med.index))])
    elif src =='eicu':
        for column in eicu_list:
            med_list = []
        for column_list in med[column]: 
            revised_column_list = ['[UNK]' if x=='null' else x for x in column_list]
            med_list.append(revised_column_list)
        med[column] = pd.Series(med_list)
        med = med.rename(columns={'code_name':'code_name_old'})
        med['code_name'] = pd.Series([[i+' '+j for i,j in zip(med['code_name_old'].iloc[z], \
                                                                         med['route'].iloc[z])] \
                                    for z in range(len(med.index))])

   
    med = med.drop('code_name_old', axis=1)
    print('Finished mimic med code_name/amount/units concatenation.')
    return med

In [None]:
### Merge INPUTEVENTS _CV and _MV
#exist_output = os.listdir(output_dir) # what's in our output directory?
#print('exist_files!',exist_output)
def Merge_INPUTEVENTS(ie_cv, ie_mv):  
    #if file_name_merged not in exist_output:
    print('Beginning INPUTEVENTS CV-MV merge...')
    print('ie_cv length', len(ie_cv))
    print('ie_mv length', len(ie_mv)) 
    concat = pd.concat([ie_cv, ie_mv]).reset_index(drop=True)
    print('Files concatenated.')
    print('merged length', len(concat))
    
    # General practice for concatenating: check columns
    not_intersection_mv = [col for col in concat.columns if col not in ie_mv.columns]
    not_intersection_cv = [col for col in concat.columns if col not in ie_cv.columns]

    if not_intersection_mv != [] or not_intersection_cv != []:
        print('Not all columns in new file shared between original files.')
        print(' Cols not in CV: ', not_intersection_cv, '\n Cols not in MV: ', not_intersection_mv)

    # Uncomment if part (3) is not being run    
    # concat.to_pickle(os.path.join(output_dir, 'mimic_inf_INPUTEVENTS_merged_init.pkl'))
    # print('Output file: < mimic_inf_INPUTEVENTS_merged_init.pkl > to ', output_dir)

    print('Finished INPUTEVENTS CV-MV concatenation.')



    ### 3: Process INPUTEVENTS from MIMIC

    start = time.time()
    print('Beginning INPUTEVENTS_merged code_name processing...')

    concat['code_name_new'] = pd.Series([[(str(i)+' ' \
                                               +('('+str(j)+')').replace('(null)','')+' '\
                                               +('('+str(k)+')').replace('(null)','')).replace('  ',' ').strip() \
                                  for i,j,k in zip(concat['code_name'].iloc[z], \
                                                  concat['value_uom'].iloc[z], \
                                                  concat['rateuom'].iloc[z])] \
                                   for z in range(len(concat.index))])
    concat = concat.drop('code_name', axis=1)
    concat = concat.rename(columns={'code_name_new':'code_name'})
    # for nan 
    inf_code_list = []
    for code_list in concat['code_name']: 
        revised_code_list = ['[UNK]' if x[0:3]=='nan' else x for x in code_list]
        inf_code_list.append(revised_code_list)
    concat['code_name'] = pd.Series(inf_code_list)
    MV_cv_ID = [181115, 195382,   128403,  100764,  139787,  129547,  120396,  193603, 196357, 156527,150835,156883,137829,120994,
                114047,162468,163525,199270,108640,125746,131525,183600,177082,157527,103061,189921,117340,109485,107962,188038,
                169344,117448,191001,144919,161350,106153,116756,148722,170826,159412,180006]
    print('Remove index', concat[concat['ID'].isin(MV_cv_ID)==True].index)
    print('length',len(concat[concat['ID'].isin(MV_cv_ID)==True].index))
    concat = concat.drop(concat[concat['ID'].isin(MV_cv_ID)==True].index,axis=0).reset_index(drop=True)
    print('Finished INPUTEVENTS_merged code_name processing.')
    return concat
    #  An alternative idea would have been to process the eICU data to extract the measurement unit information
    #  and put it in a separate column as another feature. However, while this is easy for the first 100-200 rows, 
    #  (as all measurements are contained in parentheses and can be regex-ed out), soon other drug names include
    #  the same string pattern for non-measurements, making automatic identification of units quite diffifult. 
    #  Naive code below:
    #
    # eicu_inf = pd.read_pickle(os.path.join(input_dir, 'eicu_inf_infusionDrug_init.pkl'))
    # test_eicu = eicu_inf.copy()
    # test_eicu['code_name_unit'] = pd.Series([[i[i.find('(')+1:i.find(')')] for i in test_eicu['code_name'][k]] \
    #                                   for k in range(len(test_eicu.index))])
    # test_eicu['mod_code_name'] = pd.Series([[re.sub(r'\ \([^)]*\)', '', i) for i in test_eicu['code_name'][k]] \
    #                                   for k in range(len(test_eicu.index))])
    # units = [i for k in range(len(test_eicu.index)) for i in test_eicu.code_name_unit[k]]
    # print(pd.Series(units).unique())
    # test_eicu.head()


# Single item to Merge items

In [14]:
def multiple_item_into_one(src, lab, med ,inf):
    print('{} lab item patient numbers .. {} '.format(src,len(lab)))
    print('{} med item patient numbers .. {} '.format(src,len(med)))
    print('{} inf item patient numbers .. {} '.format(src,len(inf)))
    lab.rename(columns = {'code_name':'lab_name','order_offset':'lab_offset'}, inplace=True)
    med.rename(columns = {'code_name':'med_name','order_offset':'med_offset'}, inplace=True)
    inf.rename(columns = {'code_name':'inf_name','order_offset':'inf_offset'}, inplace=True)
    ID_columns = [{'HADM_ID':'ID'}, {'patientunitstayid':'ID'}]
    if src == 'mimic':
        ID_column = ID_columns[0]
    if src == 'eicu':    
        ID_column = ID_columns[1]
    cohort_path = os.path.join(input_dir, src+'_cohort_dx_done.pk')
    cohort = pd.read_pickle(cohort_path).reset_index(drop=True)
    cohort.rename(columns = ID_column, inplace=True)
    print('cohort patient numbers .. {}'.format(len(cohort)))
    cohort= cohort[['ID', 'readmission','mortality','los>3day',
                    'los>7day','dx_depth1_unique']]
    merged_df = pd.merge(cohort, lab[['ID','lab_name','lab_offset']], on='ID', how='left')
    merged_df = pd.merge(merged_df, med[['ID','med_name','med_offset']], on='ID', how='left')
    merged_df = pd.merge(merged_df, inf[['ID','inf_name','inf_offset']], on='ID', how='left')
    merged_df = merged_df.reset_index(drop=True)
    print('{} three items merged patient numbers .. {} '.format(src, len(merged_df)))
    one_list=[]
    offset_list=[]
    item_index_list=[]
    lab = merged_df['lab_name'] ; lab_offset = merged_df['lab_offset']
    med = merged_df['med_name'] ; med_offset = merged_df['med_offset']
    inf= merged_df['inf_name'] ; inf_offset = merged_df['inf_offset']
    for i in range(len(merged_df)):
        ones = []
        offsets = []
        item_index = []
        if type(lab[i]) != type([]):
            labs = []
        else :
            labs =lab[i]
        if type(med[i]) != type([]):
            meds = []
        else :
            meds =med[i]
        if type(inf[i]) != type([]):
            infs = []
        else :
            infs =inf[i]
        if type(lab_offset[i]) != type([]):
            lab_offsets = []
        else :
            lab_offsets =lab_offset[i]
        if type(med_offset[i]) != type([]):
            med_offsets = []
        else :
            med_offsets =med_offset[i]
        if type(inf_offset[i]) != type([]):
            inf_offsets= []
        else :
            inf_offsets =inf_offset[i]            
        ones.extend(labs)
        ones.extend(meds)
        ones.extend(infs)
        offsets.extend(lab_offsets)
        offsets.extend(med_offsets)
        offsets.extend(inf_offsets)
        item_index.extend (list(np.full((len(labs)),1)))
        item_index.extend(list(np.full((len(meds)),2)))
        item_index.extend(list(np.full((len(infs)),3)))
        merge = zip(ones, offsets, item_index)
        sort = sorted(merge, key=lambda merge: merge[1], reverse=False)
        one_lists=[x for x,y,z in sort]
        one_offsets=[y for x,y,z in sort]
        one_item_index = [z for x,y,z in sort]
        one_list.append(one_lists)
        offset_list.append(one_offsets)
        item_index_list.append(one_item_index)
    one_name = pd.Series(one_list)
    one_offset = pd.Series(offset_list)
    one_item_index =pd.Series(item_index_list)
    merged_df['code_name'] = one_name
    merged_df['order_offset'] = one_offset
    merged_df['item_index'] = one_item_index
    return merged_df

# Preprocess 2nd

In [15]:
class Preprocess(): 
    def __init__(self, cohort, src:str, item:str, window, UNK, max_len, min_len, min_freq):
        self.cohort = cohort
        self.src = src
        self.UNK = UNK
        self.window = window
        self.min_len = min_len
        self.max_len = max_len
        self.min_freq = min_freq
        self.offset = 'order_offset'
        self.item = item
            
       
        if self.window=='Total':
            self.name_window = '{}_name'.format(self.item)
            self.offset_window = self.offset
            self.offset_order = 'offset_order'
        else:
            self.name_window = '{}_name_{}hr'.format(self.item, str(self.window))
            self.offset_window = '{}_{}hr'.format(self.offset, str(self.window))
            self.offset_order = '{}_offset_order_{}hr'.format(self.item, str(self.window))
    
    def timeoffset_window(self): 
        #(input: timeoffset -output:timeoffset_window):
        if self.window == 'Total':
                pass
        else:
            offset_window_lst = []
            code_name_window_lst = []
            item_index_window_lst=[]
            for idx, offset_lst in enumerate(self.cohort[self.offset]): # time since order, e.g. [182, 182, 403, 403, 403]
                len_offset_window = len([offset for offset in offset_lst if offset < self.window*60]) # how many < max_time 
                code_name_lst = self.cohort.code_name.iloc[idx] # which medically relevant thing [e.g. Tylenol 500mg, Epinephrine X mL]
                if self.item =='all':
                    item_index_lst = self.cohort.item_index.iloc[idx]
                    item_index_window = item_index_lst[:len_offset_window]
                    item_index_window_lst.append(item_index_window)
                offset_window = offset_lst[:len_offset_window]
                # truncate both at length of offset window
                code_name_window = code_name_lst[:len_offset_window]
                offset_window_lst.append(offset_window) # our new truncated window
                code_name_window_lst.append(code_name_window) # our new truncated codes
                
            self.cohort[self.name_window] = pd.Series(code_name_window_lst) # add as new column at end
            self.cohort[self.offset_window] = pd.Series(offset_window_lst)
            if self.item =='all':
                self.cohort['item_index'] = pd.Series(item_index_window_lst)
        return self.cohort
    
    def timeoffset_timeorder(self, cohort): 
        #(input- timeoffset - timeorder)
        offset_order_lst = []
        for idx, offset in enumerate(cohort[self.offset_window]):
            offset_set = list(set(offset)) # create a set from the interable e.g. {122, 232, 444}
            offset_set.sort() 
            order_value = np.arange(1, len(offset_set)+1)
            dict_offset = dict(zip(offset_set, order_value)) # create dictionary of "order" of events
            offset_order = list(pd.Series(offset).map(dict_offset)) 
            offset_order_lst.append(offset_order)
        cohort[self.offset_order] = pd.Series(offset_order_lst) # offset order is new col indicating ordinality
        return cohort
    
    def code_windowed(self, cohort, max_len, min_len):
        name_lst= []
        offset_lst = []
        offset_order_lst = []
        item_lst= []
        zero_len_idx=[]
        for idx, names in enumerate(self.cohort[self.name_window]): # our truncated code_name column
            len_name_window=len(names) # how many of these codes in our iteratred row?
            if len_name_window > max_len:
                 len_name_window = max_len
            if len_name_window < min_len:
                zero_len_idx.append(idx) 
            name = names[:len_name_window] # truncate to the max number of codes we're allowing
            offset = cohort[self.offset_window].iloc[idx][:len_name_window]
            if self.item =='all':
                item_index = cohort['item_index'].iloc[idx][:len_name_window]
                item_lst.append(item_index)
                # ditto for the [132, 132, 144, etc.]
           # offset_order = cohort[self.offset_order].iloc[idx][:len_name_window] # ditto to the order of events (1,2,3..)
            name_lst.append(name) # build series
            offset_lst.append(offset)
            #offset_order_lst.append(offset_order)    
        cohort[self.name_window] = pd.Series(name_lst) # replace columns from 1st fxn as necessary
        cohort[self.offset_window] = pd.Series(offset_lst)
        if self.item =='all': 
            cohort['item_index']=pd.Series(item_lst)
        #cohort[self.offset_order] = pd.Series(offset_order_lst)
        
        self.cohort = self.cohort.drop(self.cohort.index[[zero_len_idx]]).reset_index(drop=True) # drop if not enough time obs
        return self.cohort
    
                
    def make_vocab(self, cohort, min_freq, PAD_idx=0, UNK_idx=1, MASK_idx=2, SEP_idx=3): 
        #(Input codes output vocab with PAD 0 UNK 1 MASK 2 SEP 3)
        #2 options : delete UNK (min_freq) : False or treat min_freq as  UNK : True
       
        flatten = lambda l: [item for sublist in l for item in sublist]
        word_freq = dict(Counter(flatten(cohort[self.name_window])))
        
        if self.UNK == True:
            word2idx = {'<PAD>': PAD_idx, '<UNK>': UNK_idx, '<MASK>':MASK_idx, '<SEP>':SEP_idx} 
            
        elif self.UNK == False:
            word2idx = {'<PAD>': PAD_idx, '<MASK>':MASK_idx, '<SEP>':SEP_idx}
        

        min_freq_id=[]
        for word_item in word_freq.items():
            if word_item[0] not in word2idx:
                if word_item[1] < min_freq:
                    min_freq_id.append(word_item[0])
                    if self.UNK== True:
                        word2idx[word_item[0]]=1 #UNK 처리 
                else:
                    word2idx[word_item[0]] = max(word2idx.values())+1
                    
        return word2idx, min_freq_id, word_freq
    
    def code_to_index(self, cohort, word2idx, min_freq_id):
        #deleting min_freq word
        if self.UNK == False:
            dict_del={}
            for idx, name_lst in enumerate(cohort[self.name_window]):
                del_index=[i for i in range(len(name_lst)) if name_lst[i] in min_freq_id]              
                if len(del_index)>0:
                    dict_del[idx]=del_index 
            for idx, order in (dict_del.items()):
                item_deleted = [i for j, i in enumerate(cohort[self.name_window][idx]) if j not in order]
                offset_deleted = [i for j, i in enumerate(cohort[self.offset_window][idx]) if j not in order]
                if self.item =='all':
                    item_index_deleted = [i for j,i in enumerate(cohort['item_index'][idx]) if j not in order]
            #  offset_order_deleted= [i for j, i in enumerate(cohort[self.offset_order][idx]) if j not in order]
             #value_deleted
                #measure_deleted
                cohort[self.name_window].iloc[idx] = item_deleted
                cohort[self.offset_window].iloc[idx] = offset_deleted
                if self.item =='all':
                    cohort['item_index'].iloc[idx] = item_index_deleted
               # cohort[self.offset_order].iloc[idx] = offset_order_deleted
        #mapping
        item_id=[]    
        for name_lst in cohort[self.name_window]:
            item_id_lst=list(pd.Series(name_lst).map(word2idx))
            item_id.append(item_id_lst)
        item_id = pd.Series(item_id)
        cohort['{}_id_{}hr'.format(self.item, str(self.window))]=item_id
        
        return cohort
     
    def arguments(self):
        return  [self.src, self.window, self.item]
    
    def __call__(self):
        cohort = self.timeoffset_window()
        print('timeoffset windowing finish!')
        cohort = self.code_windowed(cohort, self.max_len, self.min_len)
        print('Patient filtering by code length finish!')
        word2idx, min_freq_id, word_freq = self.make_vocab(cohort, min_freq=self.min_freq, PAD_idx=0, UNK_idx=1, MASK_idx=2, SEP_idx=3)
        print('Vocab making finish!')
        cohort = self.code_to_index(cohort, word2idx, min_freq_id)
        cohort = self.timeoffset_timeorder(cohort)
        return cohort, word2idx, min_freq_id, word_freq

In [16]:
src = 'mimic'
item='all'
seed = 2020
columns_map = mimic_columns_map
window_time = 12
UNK = False
max_len = 150
min_len = 5
min_freq = 5
item_list =['lab','med','inf']
source_list = ['mimic','eicu']

# Preprocessing run

In [17]:
'''
Preparation csv file into csv_dir : MIMIC-III : 
                                    eICU : 
Preprocess steps :  1. Preprocess_1 (data_init, list_preparation)  
                    2. Intermediate step_1(medication align & INPUTEVENTS MERGE) 
                    3. Intermediate step_2(Three item merged)
                    4. Preprocess_2 (make vocab, pruning patient)
'It will take more than 30mins'
'''

"\nPreparation csv file into csv_dir : MIMIC-III : \n                                    eICU : \nPreprocess steps :  1. Preprocess_1 (data_init, list_preparation)  \n                    2. Intermediate step_1(medication align & INPUTEVENTS MERGE) \n                    3. Intermediate step_2(Three item merged)\n                    4. Preprocess_2 (make vocab, pruning patient)\n'It will take more than 30mins'\n"

In [24]:
for src in source_list:
    for item in item_list:
        if src == 'mimic':
            files = mimic_csv_files[item] # the files from mimic that we want
            columns_map= mimic_columns_map # the columns from mimic we care about, to be arg for pre_processing_1st
        elif src == 'eicu':     
            files = eicu_csv_files[item]
            columns_map= eicu_columns_map 
        for file in files:
            print('data preparation initialization .. {} {}'.format(src, item))
            data = data_init(file, src, item, columns_map)
            df, cohort = data()
            list_prep = list_preparation(df, cohort, src, item, file, columns_map)
            pickle_cohort= list_prep()
            if item == 'lab':
                lab = pickle_cohort.copy()
            elif item =='med':
                med = pickle_cohort.copy()
                med = medication_align(src, med)
            elif item =='inf':
                if src =='mimic':
                    if file =='INPUTEVENTS_CV':
                        ie_cv = pickle_cohort.copy()
                    elif file == 'INPUTEVENTS_MV':
                        ie_mv = pickle_cohort.copy()
                        inf = Merge_INPUTEVENTS(ie_cv, ie_mv)
                elif src=='eicu':
                    inf=pickle_cohort.copy()
    print('data preparation finish for three items \n second preparation start soon..')
    print('preprocessing for first part.. {}_{}_{} finish!'.format(src,item,file))
    merged_df =multiple_item_into_one(src, lab, med ,inf)
    merged_df= merged_df.drop(columns=['lab_name', 'lab_offset', 'med_name', 'med_offset','inf_name','inf_offset'])
    print('lab med inf three categories merged in one!')
    print('2nd Preprocessing start!...') 
    second_preprocess = Preprocess(merged_df, src, item, window_time, UNK, max_len, min_len, min_freq)
    cohort, vocab, min_freq_id, word_freq = second_preprocess()
    print('Preprocessing completed.')
    print('Writing', '{}_{}_{}_{}.pkl'.format(src, window_time, 'all', max_len), 'to', input_dir)
    cohort.to_pickle(os.path.join(input_dir,'{}_{}_{}_{}.pkl'.format(src, window_time, all, max_len)))
    print('Generated vocabulary of length', len(vocab), '\n')

data preparation initialization .. mimic lab
cohort load finish!
csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
name_dict start!
name_dict finish!
time_filter start!


  series = pd.Series()


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18625.0), HTML(value='')))


time_filter finished!
charttime_offset start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18625.0), HTML(value='')))


charttime_offset finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18625.0), HTML(value='')))


list_make_sort finished!
data preparation initialization .. mimic med
cohort load finish!


  if (await self.run_code(code, result,  async_=asy)):


csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
name_dict start!
name_dict finish!
time_filter start!


  series = pd.Series()


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17578.0), HTML(value='')))


time_filter finished!
charttime_offset start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17578.0), HTML(value='')))


charttime_offset finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17578.0), HTML(value='')))


list_make_sort finished!
Finished mimic med code_name/amount/units concatenation.
data preparation initialization .. mimic inf
cohort load finish!


  if (await self.run_code(code, result,  async_=asy)):


csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
name_dict start!
name_dict finish!
time_filter start!


  series = pd.Series()


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9366.0), HTML(value='')))


time_filter finished!
charttime_offset start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9366.0), HTML(value='')))


charttime_offset finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9366.0), HTML(value='')))


list_make_sort finished!
data preparation initialization .. mimic inf
cohort load finish!
csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
name_dict start!
name_dict finish!
time_filter start!


  series = pd.Series()


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9125.0), HTML(value='')))


time_filter finished!
charttime_offset start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9125.0), HTML(value='')))


charttime_offset finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9125.0), HTML(value='')))


list_make_sort finished!
Beginning INPUTEVENTS CV-MV merge...
ie_cv length 9366
ie_mv length 9125
Files concatenated.
merged length 18491
Finished INPUTEVENTS CV-MV concatenation.
Beginning INPUTEVENTS_merged code_name processing...
index Int64Index([   80,   286,   589,   790,   866,   957,  1369,  1613,  1683,
             1692,  1951,  1998,  2459,  2711,  2819,  2998,  3582,  3764,
             4216,  4532,  4763,  5294,  5322,  5387,  5542,  5730,  5833,
             5934,  6476,  6607,  7202,  7474,  7585,  7807,  8252,  8436,
             8538,  8781,  8940,  9024,  9308,  9433,  9634,  9914, 10060,
            10100, 10186, 10595, 10832, 10905, 10914, 11176, 11221, 11644,
            11916, 12009, 12190, 12759, 12940, 13375, 13729, 13926, 14468,
            14504, 14573, 14725, 14904, 15002, 15112, 15667, 15821, 16420,
            16688, 16788, 17016, 17388, 17559, 17664, 17897, 18059, 18163,
            18416],
           dtype='int64')
length 82
Finished INPUTEVENTS_merged c

  result = getitem(key)


Patient filtering by code length finish!
Vocab making finish!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Preprocessing completed.
Writing mimic_12_inf_150.pkl to ./input
Generated vocabulary of length 2858 

data preparation initialization .. eicu lab
cohort load finish!
csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
time_filter start!
time_filter finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13249.0), HTML(value='')))


list_make_sort finished!
data preparation initialization .. eicu med
cohort load finish!


  if (await self.run_code(code, result,  async_=asy)):


csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
time_filter start!
time_filter finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11710.0), HTML(value='')))


list_make_sort finished!
Finished mimic med code_name/amount/units concatenation.
data preparation initialization .. eicu inf
cohort load finish!


  if (await self.run_code(code, result,  async_=asy)):


csv file load finish!
column_rename start!
column_rename finish!
cohort_filtering start!
cohort_filtering finish!
issue_delete start!
issue_delete finish!
time_filter start!
time_filter finished!
list_make_sort start!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6273.0), HTML(value='')))


list_make_sort finished!
data preparation finish for three items 
 second preparation start soon..
preprocessing for first part.. eicu_inf_infusionDrug finish!
eicu lab item patient numbers .. 13249 
eicu med item patient numbers .. 11710 
eicu inf item patient numbers .. 6273 
cohort patient numbers .. 13364
eicu three items merged patient numbers .. 13364 
lab med inf three categories merged in one!
2nd Preprocessing start!...
timeoffset windowing finish!


  result = getitem(key)


Patient filtering by code length finish!
Vocab making finish!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Preprocessing completed.
Writing eicu_12_inf_150.pkl to ./input
Generated vocabulary of length 2021 



# Train & valid & Test split

In [49]:
def train_valid_test_split(target_cols, cohort, random_state, test_ratio, train_valid_fold, columns):
    sss_train_test = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state =random_state)
    mskf = MultilabelStratifiedKFold(n_splits=train_valid_fold, random_state=random_state)
    X = cohort.copy()
    for target in target_cols:
        if target == 'dx':
            continue
        print("____{}____ train and test split start!".format(target))
        fold_column = '{}_fold'.format(target)
        X[fold_column]=1
        print('X fold_column \n',X[fold_column].value_counts())
        y = X[target]

        #train / test split 4:1
        for train_index, test_index in sss_train_test.split(X,y):
            X_test = X.loc[test_index]
            X_test.loc[test_index, fold_column]=0    

        #train = -1 , test = 0
        X_train = X.loc[train_index].reset_index(drop=True)
        y_train = y.loc[train_index].reset_index(drop=True)

        #train / valid slpit 4:1 
        for train_index, valid_index in sss_train_test.split(X_train,y_train):
            X_valid = X_train.loc[valid_index]
            X_valid.loc[valid_index, fold_column]=2


        #test = 0 train = 1 valid = 2
        X_train = X_train.loc[train_index].reset_index(drop=True)
        X_valid = X_valid.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)

        X = pd.concat([X_train, X_valid, X_test], axis=0).reset_index(drop=True)

        print('fold_split_results!!!!! \n', X[fold_column].value_counts())

        '''  
        Multi_label_stratified_Kfold
        ''' 
    #diagnosis multi_label stratified_Kfold
    print('___diagnosis multi_label_stratified_split____') 
    X['dx_fold'] = 1
    y = X[columns]
    for i, (train_index, test_index) in enumerate(mskf.split(X,y)):
        if i != 0 :
            continue
        elif i== 0:
            trn_index = train_index
            X_test=X.loc[test_index]
            X_test.loc[test_index, 'dx_fold'] = 0

    #X_train 
    X_train = X.loc[trn_index].reset_index(drop=True)
    y_train = y.loc[trn_index].reset_index(drop=True)
    for i, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        if i != 0 :
            continue
        elif i== 0:
            trn_index = train_index
            X_valid= X_train.loc[valid_index]
            X_valid.loc[valid_index, 'dx_fold'] = 2

    #test = 0 train = 1 valid = 2
    X_train = X_train.loc[trn_index].reset_index(drop=True)
    X_valid = X_valid.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    X = pd.concat([X_train, X_valid,X_test]).reset_index(drop=True)
    print('fold_split_results!!!!! \n', X['dx_fold'].value_counts())
    return X

In [63]:
def dx_one_hot_encoding(cohort):
    #dx code -> one-hot encoding
    dx_label_criteria = 'dx_depth1_unique'
    mlb = MultiLabelBinarizer()
    encoded_dx = mlb.fit_transform(cohort[dx_label_criteria])
    columns=mlb.classes_
    df_dx = pd.DataFrame(encoded_dx, columns=columns)
    cohort = pd.concat([cohort, df_dx],axis=1)
    return cohort, columns

In [51]:
source_list = ['mimic','eicu']
target_cols = ['readmission', 'mortality', 'los>3day', 'los>7day']
random_state_list = [2020,2021,2022,2023,2024,2025,2026,2027,2028,2029]
# train / test ratio
test_ratio = 0.2
# train/ valid folds
train_valid_fold = 5

In [38]:
for src in source_list:
    cohort = pd.read_pickle(os.path.join(input_dir, '{}_12_all_150.pkl'.format(src))).reset_index(drop=True)
    cohort, columns = dx_one_hot_encoding(cohort)
    for random_state in random_state_list:
        cohort_fold_split =train_valid_test_split(target_cols, cohort, random_state, test_ratio, train_valid_fold, columns)
        cohort_fold_split= cohort_fold_split.drop(columns=columns)
        save_file = '{}_12_all_150_{}.pkl'.format(src, random_state)
        print('save to .. ', os.path.join(input_dir, save_file))
        cohort_fold_split.to_pickle(os.path.join(input_dir, save_file))



____readmission____ train and test split start!
X fold_column 
 1    18536
Name: readmission_fold, dtype: int64
fold_split_results!!!!! 
 1    11862
0     3708
2     2966
Name: readmission_fold, dtype: int64
____mortality____ train and test split start!
X fold_column 
 1    18536
Name: mortality_fold, dtype: int64
fold_split_results!!!!! 
 1    11862
0     3708
2     2966
Name: mortality_fold, dtype: int64
____los>3day____ train and test split start!
X fold_column 
 1    18536
Name: los>3day_fold, dtype: int64
fold_split_results!!!!! 
 1    11862
0     3708
2     2966
Name: los>3day_fold, dtype: int64
____los>7day____ train and test split start!
X fold_column 
 1    18536
Name: los>7day_fold, dtype: int64
fold_split_results!!!!! 
 1    11862
0     3708
2     2966
Name: los>7day_fold, dtype: int64
___diagnosis multi_label_stratified_split____
fold_split_results!!!!! 
 1    11856
0     3708
2     2972
Name: dx_fold, dtype: int64


KeyboardInterrupt: 

# Fewshot sampling

In [52]:
def Fewshot_sampling(target_cols, cohort, random_state, sampled_ratio, sampling_fold,train_valid_fold, columns):
    sss_sampling = StratifiedShuffleSplit(n_splits=1, test_size=sampled_ratio, random_state =random_state)
    sss=StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state =random_state)
    mskf_sampling = MultilabelStratifiedKFold(n_splits=10, random_state=random_state)
    mskf = MultilabelStratifiedKFold(n_splits=5, random_state=random_state)
    X = cohort.copy()
    print('len(cohort)',len(X))
    print('sampling ratio', sampled_ratio)
    for target in target_cols:
        print("____{}____ train and test split start!".format(target))
        fold_column = '{}_fold'.format(target)
        #print('X fold_column \n',X[fold_column].value_counts())
        X_test = X.loc[X[fold_column]==0].reset_index(drop=True)
        X_train = X.loc[X[fold_column].isin([1,2])]
        #print('X_train \n' ,X_train[fold_column].value_counts())
        y_train = X_train[target].reset_index(drop=True)
        X_train[fold_column] = -1
        X_train = X_train.reset_index(drop=True)
        
        #Sampling from X_train/valid
        for unsampled_train_index, sampled_train_index in sss_sampling.split(X_train,y_train):
            X_unsampled=X_train.loc[unsampled_train_index].reset_index(drop=True)
            X_sampled = X_train.loc[sampled_train_index]
            X_sampled.loc[sampled_train_index, fold_column]=1
            X_sampled=X_sampled.reset_index(drop=True)
            y_sampled = y_train.loc[sampled_train_index].reset_index(drop=True)
        
        #train = 1 , valid = 2, test = 0
       # print('X_sampled \n', X_sampled[fold_column].value_counts())
        for train_index, valid_index in sss.split(X_sampled,y_sampled):
            X_valid = X_sampled.loc[valid_index]
            X_valid.loc[valid_index, fold_column]=2
            X_train = X_sampled.loc[train_index]

            
        X_train = X_train.reset_index(drop=True)
        X_valid = X_valid.reset_index(drop=True)

        X = pd.concat([X_train, X_unsampled, X_valid, X_test], axis=0).reset_index(drop=True)

        print('fold_split_results!!!!! \n', X[fold_column].value_counts())
    
    '''  
    Multi_label_stratified_Kfold
    ''' 
    #diagnosis multi_label stratified_Kfold
    print('___diagnosis multi_label_stratified_split____') 
    X_test = X.loc[X['dx_fold']==0].reset_index(drop=True)
    #print('X_test',len(X_test))
    X_train = X.loc[X['dx_fold'].isin([1,2])].reset_index(drop=True)
    X_train['dx_fold'] = -1
    y_train = X_train[columns].reset_index(drop=True)
   # print('X_train',len(X_train))
    for i, (unsampled_train_index, sampled_train_index) in enumerate(mskf_sampling.split(X_train,y_train)):
            X_train.loc[sampled_train_index, 'dx_fold']=i+1

    sampling_i = [i+1 for i in range(sampling_fold)]
    X_unsampled = X_train.loc[~X_train['dx_fold'].isin(sampling_i)].reset_index(drop=True)
    X_unsampled['dx_fold']=-1
    #print('samping_i', sampling_i)
    #print('X_unsampled', X_unsampled['dx_fold'].value_counts())
    X_sampled = X_train.loc[X_train['dx_fold'].isin(sampling_i)]
    X_sampled['dx_fold'] = 1
    #print('index len' ,len(X_sampled.index))
    y_sampled = y_train.loc[X_sampled.index].reset_index(drop=True)
    X_sampled = X_sampled.reset_index(drop=True)
    #print('X_sampled',len(X_sampled))        
    
    #print('X_sampled',X_sampled['dx_fold'].value_counts())
    for i, (train_index, valid_index) in enumerate(mskf.split(X_sampled, y_sampled)):
        if i != 0 :
            continue
        elif i==0:
            X_valid = X_sampled.loc[valid_index]
            X_valid.loc[valid_index, 'dx_fold']=2
            X_train = X_sampled.loc[train_index]
    
    #test = 0 train = 1 valid = 2
    X = pd.concat([X_train, X_unsampled, X_valid,X_test]).reset_index(drop=True)
    print('fold_split_results!!!!! \n', X['dx_fold'].value_counts())
    
    return X



In [53]:
source_list = ['mimic','eicu']
items = ['all']
time_window=12
max_len =150
target_cols = ['readmission', 'mortality', 'los>3day', 'los>7day']
random_state_list = [2020,2021,2022,2023,2024,2025,2026,2027,2028,2029]
# train/ valid folds 4:1
train_valid_fold = 5
sampling_fold_list = [1,3,5,7,9]
sampled_ratio_list = [0.1,0.3,0.5,0.7,0.9]

In [64]:
for src in source_list:
    print('src_{}_start!'.format(src))
    for item in items:
        for random_state in random_state_list:
            print('seed_{}'.format(random_state))
            for i, sampled_ratio in enumerate(sampled_ratio_list):
                sampling_fold = sampling_fold_list[i]
                print('read data from', os.path.join(input_dir, '{}_{}_{}_{}_{}.pkl'.format(src,time_window,item,max_len,random_state)))
                cohort = pd.read_pickle(os.path.join(input_dir, '{}_{}_{}_{}_{}.pkl'.format(src,time_window,item,max_len,random_state))).reset_index(drop=True)
                cohort, columns = dx_one_hot_encoding(cohort)
                cohort_fold_split =Fewshot_sampling(target_cols, cohort, random_state, sampled_ratio, sampling_fold, train_valid_fold, columns)
                cohort_fold_split= cohort_fold_split.drop(columns=columns)
                #save_file = '{}_{}_{}_{}_{}_{}_concat.pkl'.format(src,time_window,item, max_len,random_state, int(sampled_ratio*100))
                save_file = '{}_{}_{}_{}_{}_{}.pkl'.format(src,time_window,item, max_len,random_state, int(sampled_ratio*100))
                cohort_fold_split.to_pickle(os.path.join(input_dir, save_file))

src_mimic_start!
seed_2020
read data from /home/ghhur/data/input/mimic_12_all_150_2020.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

len(cohort) 18536
sampling ratio 0.1
____readmission____ train and test split start!
fold_split_results!!!!! 
 -1    13345
 0     3708
 1     1186
 2      297
Name: readmission_fold, dtype: int64
____mortality____ train and test split start!
fold_split_results!!!!! 
 -1    13345
 0     3708
 1     1186
 2      297
Name: mortality_fold, dtype: int64
____los>3day____ train and test split start!
fold_split_results!!!!! 
 -1    13345
 0     3708
 1     1186
 2      297
Name: los>3day_fold, dtype: int64
____los>7day____ train and test split start!
fold_split_results!!!!! 
 -1    13345
 0     3708
 1     1186
 2      297
Name: los>7day_fold, dtype: int64
___diagnosis multi_label_stratified_split____


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sampled['dx_fold'] = 1


fold_split_results!!!!! 
 -1    13354
 0     3708
 1     1181
 2      293
Name: dx_fold, dtype: int64
read data from /home/ghhur/data/input/mimic_12_all_150_2020.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

len(cohort) 18536
sampling ratio 0.3
____readmission____ train and test split start!
fold_split_results!!!!! 
 -1    10379
 0     3708
 1     3559
 2      890
Name: readmission_fold, dtype: int64
____mortality____ train and test split start!
fold_split_results!!!!! 
 -1    10379
 0     3708
 1     3559
 2      890
Name: mortality_fold, dtype: int64
____los>3day____ train and test split start!
fold_split_results!!!!! 
 -1    10379
 0     3708
 1     3559
 2      890
Name: los>3day_fold, dtype: int64
____los>7day____ train and test split start!
fold_split_results!!!!! 
 -1    10379
 0     3708
 1     3559
 2      890
Name: los>7day_fold, dtype: int64
___diagnosis multi_label_stratified_split____


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sampled['dx_fold'] = 1


fold_split_results!!!!! 
 -1    10360
 0     3708
 1     3565
 2      903
Name: dx_fold, dtype: int64
read data from /home/ghhur/data/input/mimic_12_all_150_2020.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1


len(cohort) 18536
sampling ratio 0.5
____readmission____ train and test split start!
fold_split_results!!!!! 
 -1    7414
 1    5931
 0    3708
 2    1483
Name: readmission_fold, dtype: int64
____mortality____ train and test split start!
fold_split_results!!!!! 
 -1    7414
 1    5931
 0    3708
 2    1483
Name: mortality_fold, dtype: int64
____los>3day____ train and test split start!
fold_split_results!!!!! 
 -1    7414
 1    5931
 0    3708
 2    1483
Name: los>3day_fold, dtype: int64
____los>7day____ train and test split start!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1


fold_split_results!!!!! 
 -1    7414
 1    5931
 0    3708
 2    1483
Name: los>7day_fold, dtype: int64
___diagnosis multi_label_stratified_split____


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sampled['dx_fold'] = 1


fold_split_results!!!!! 
 -1    7427
 1    5915
 0    3708
 2    1486
Name: dx_fold, dtype: int64
read data from /home/ghhur/data/input/mimic_12_all_150_2020.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1


len(cohort) 18536
sampling ratio 0.7
____readmission____ train and test split start!
fold_split_results!!!!! 
  1    8304
-1    4448
 0    3708
 2    2076
Name: readmission_fold, dtype: int64
____mortality____ train and test split start!
fold_split_results!!!!! 
  1    8304
-1    4448
 0    3708
 2    2076
Name: mortality_fold, dtype: int64
____los>3day____ train and test split start!
fold_split_results!!!!! 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1


  1    8304
-1    4448
 0    3708
 2    2076
Name: los>3day_fold, dtype: int64
____los>7day____ train and test split start!
fold_split_results!!!!! 
  1    8304
-1    4448
 0    3708
 2    2076
Name: los>7day_fold, dtype: int64
___diagnosis multi_label_stratified_split____


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sampled['dx_fold'] = 1


fold_split_results!!!!! 
  1    8288
-1    4469
 0    3708
 2    2071
Name: dx_fold, dtype: int64
read data from /home/ghhur/data/input/mimic_12_all_150_2020.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1


len(cohort) 18536
sampling ratio 0.9
____readmission____ train and test split start!
fold_split_results!!!!! 
  1    10676
 0     3708
 2     2670
-1     1482
Name: readmission_fold, dtype: int64
____mortality____ train and test split start!
fold_split_results!!!!! 
  1    10676
 0     3708
 2     2670
-1     1482
Name: mortality_fold, dtype: int64
____los>3day____ train and test split start!
fold_split_results!!!!! 
  1    10676
 0     3708
 2     2670
-1     1482
Name: los>3day_fold, dtype: int64
____los>7day____ train and test split start!
fold_split_results!!!!! 
  1    10676
 0     3708
 2     2670
-1     1482
Name: los>7day_fold, dtype: int64
___diagnosis multi_label_stratified_split____


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[fold_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_sampled['dx_fold'] = 1


fold_split_results!!!!! 
  1    10675
 0     3708
 2     2650
-1     1503
Name: dx_fold, dtype: int64
seed_2021
read data from /home/ghhur/data/input/mimic_12_all_150_2021.pkl


FileNotFoundError: [Errno 2] No such file or directory: '/home/ghhur/data/input/mimic_12_all_150_2021.pkl'

# Multi processing for split and few shot sampling

In [73]:
item = 'all'
random_state_list = [2020,2021,2022,2023,2024,2025,2026,2027,2028,2029]
def multi_for_split(random_state):      
    for src in source_list:
        cohort = pd.read_pickle(os.path.join(input_dir, '{}_12_all_150.pkl'.format(src))).reset_index(drop=True)
        cohort, columns = dx_one_hot_encoding(cohort)
        cohort_fold_split =train_valid_test_split(target_cols, cohort, random_state, test_ratio, train_valid_fold, columns)
        cohort_fold_split= cohort_fold_split.drop(columns=columns)
        save_file = '{}_12_all_150_{}.pkl'.format(src, random_state)
        print('save to .. ', os.path.join(input_dir, save_file))
        cohort_fold_split.to_pickle(os.path.join(input_dir, save_file))
        
def multi_for_fewshot(random_state):
    for src in source_list:
        print('src_{}_start!'.format(src))
        for i, sampled_ratio in enumerate(sampled_ratio_list):
            sampling_fold = sampling_fold_list[i]
            cohort = pd.read_pickle(os.path.join(input_dir, '{}_{}_{}_{}_{}.pkl'.format(src,time_window,item,max_len,random_state))).reset_index(drop=True)
            cohort, columns = dx_one_hot_encoding(cohort)
            cohort_fold_split =Fewshot_sampling(target_cols, cohort, random_state, sampled_ratio, sampling_fold, train_valid_fold, columns)
            cohort_fold_split= cohort_fold_split.drop(columns=columns)
            #save_file = '{}_{}_{}_{}_{}_{}_concat.pkl'.format(src,time_window,item, max_len,random_state, int(sampled_ratio*100))
            save_file = '{}_{}_{}_{}_{}_{}.pkl'.format(src,time_window,item, max_len,random_state, int(sampled_ratio*100))
            cohort_fold_split.to_pickle(os.path.join(input_dir, save_file))


In [74]:
multi_process = Parallel(n_jobs=-1, verbose=100)(delayed(multi_for_split)(random_state) for random_state in random_state_list) 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:  1.3min remaining:  5.0min
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  1.3min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  1.3min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.3min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.3min remaining:   51.0s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.3min remaining:   33.0s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:  1.3min remaining:   19.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [75]:
multi_process = Parallel(n_jobs=-1, verbose=100)(delayed(multi_for_fewshot)(random_state) for random_state in random_state_list)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:  4.4min remaining: 17.6min
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  4.5min remaining: 10.5min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  4.5min remaining:  6.8min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  4.6min remaining:  4.6min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  4.7min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  4.7min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:  4.7min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.8min finished
