In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import notebook
from collections import Counter

input_1_dir = '../../../output/PrePr1_output_Wes/' # This is a bit messy -- let's clean this up later. 
input_1_5_dir = '../../../output/PrePr1-5_output_Wes/' # We don't really need a separate directory for each step
output_dir = '../../../output/PrePr2_output_Wes/'

In [2]:
sources = ['mimic', 'eicu']
items = ['lab','med','inf']
window_time = 12
UNK = False
max_len = 150
min_freq = 5

In [3]:
mimic_files = {'lab':'LABEVENTS', 'med':'PRESCRIPTIONS', 'inf':'INPUTEVENTS_merged'}

eicu_files = {'dx':'diagnosis', 'lab':'lab', 'med':'medication', 
                   'trt':'treatment', 'inf':'infusionDrug'}

In [4]:
class Preprocess(): 
    def __init__(self, cohort, src:str, item:str, window, UNK, max_len, min_freq):
        self.cohort = cohort
        self.src = src
        self.UNK = UNK
        self.window = window
        self.max_len = max_len
        self.min_freq = min_freq
        #이부분 수정 필요
        self.offset = 'order_offset'
        self.item = item
            
       
        if self.window=='Total':
            self.name_window = '{}_name'.format(self.item)
            self.offset_window = self.offset
            self.offset_order = 'offset_order'
        else:
            self.name_window = '{}_name_{}hr'.format(self.item, str(self.window))
            self.offset_window = '{}_{}hr'.format(self.offset, str(self.window))
            self.offset_order = '{}_offset_order_{}hr'.format(self.item, str(self.window))
    
    def timeoffset_window(self): 
        #(input: timeoffset -output:timeoffset_window):
        if self.window == 'Total':
                pass
        else:
            offset_window_lst = []
            code_name_window_lst = []
            for idx, offset_lst in enumerate(self.cohort[self.offset]): # time since order, e.g. [182, 182, 403, 403, 403]
                len_offset_window = len([offset for offset in offset_lst if offset < self.window*60]) # how many < max_time 
                code_name_lst = self.cohort.code_name.iloc[idx] # which medically relevant thing [e.g. Tylenol 500mg, Epinephrine X mL]
                offset_window = offset_lst[:len_offset_window] # truncate both at length of offset window
                code_name_window = code_name_lst[:len_offset_window]
                offset_window_lst.append(offset_window) # our new truncated window
                code_name_window_lst.append(code_name_window) # our new truncated codes
            self.cohort[self.name_window] = pd.Series(code_name_window_lst) # add as new column at end
            self.cohort[self.offset_window] = pd.Series(offset_window_lst)
        return self.cohort
    
    def timeoffset_timeorder(self, cohort): 
        #(input- timeoffset - timeorder)
        offset_order_lst = []
        for idx, offset in enumerate(cohort[self.offset_window]):
            offset_set = list(set(offset)) # create a set from the interable e.g. {122, 232, 444}
            offset_set.sort() 
            order_value = np.arange(1, len(offset_set)+1)
            dict_offset = dict(zip(offset_set, order_value)) # create dictionary of "order" of events
            offset_order = list(pd.Series(offset).map(dict_offset)) 
            offset_order_lst.append(offset_order)
        cohort[self.offset_order] = pd.Series(offset_order_lst) # offset order is new col indicating ordinality
        return cohort
    
    def code_windowed(self, cohort, max_len, min_len):
        name_lst= []
        offset_lst = []
        offset_order_lst = []
        zero_len_idx=[]
        for idx, names in enumerate(self.cohort[self.name_window]): # our truncated code_name column
            len_name_window=len(names) # how many of these codes in our iteratred row?
            if len_name_window > max_len:
                 len_name_window = max_len
            if len_name_window < min_len:
                zero_len_idx.append(idx) 
            name = names[:len_name_window] # truncate to the max number of codes we're allowing
            offset = cohort[self.offset_window].iloc[idx][:len_name_window] # ditto for the [132, 132, 144, etc.]
            offset_order = cohort[self.offset_order].iloc[idx][:len_name_window] # ditto to the order of events (1,2,3..)
            name_lst.append(name) # build series
            offset_lst.append(offset)
            offset_order_lst.append(offset_order)    
        cohort[self.name_window] = pd.Series(name_lst) # replace columns from 1st fxn as necessary
        cohort[self.offset_window] = pd.Series(offset_lst)
        cohort[self.offset_order] = pd.Series(offset_order_lst)
        
        self.cohort = self.cohort.drop(self.cohort.index[[zero_len_idx]]).reset_index(drop=True) # drop if not enough time obs
        return self.cohort
    
                
    def make_vocab(self, cohort, min_freq=5, PAD_idx=0, UNK_idx=1, MASK_idx=2, SEP_idx=3): 
        #(Input codes output vocab with PAD 0 UNK 1 MASK 2 SEP 3)
        #2 options : delete UNK (min_freq) : False or treat min_freq as  UNK : True
       
        flatten = lambda l: [item for sublist in l for item in sublist]
        word_freq = dict(Counter(flatten(cohort[self.name_window])))
        
        if self.UNK == True:
            word2idx = {'<PAD>': PAD_idx, '<UNK>': UNK_idx, '<MASK>':MASK_idx, '<SEP>':SEP_idx} 
            
        elif self.UNK == False:
            word2idx = {'<PAD>': PAD_idx, '<MASK>':MASK_idx, '<SEP>':SEP_idx}
        

        min_freq_id=[]
        for word_item in word_freq.items():
            if word_item[0] not in word2idx:
                if word_item[1] < min_freq:
                    min_freq_id.append(word_item[0])
                    if self.UNK== True:
                        word2idx[word_item[0]]=1 #UNK 처리 
                else:
                    word2idx[word_item[0]] = max(word2idx.values())+1
                    
        return word2idx, min_freq_id
    
    def code_to_index(self, cohort, word2idx, min_freq_id):
        #deleting min_freq word
        if self.UNK == False:
            dict_del={}
            for idx, name_lst in enumerate(cohort[self.name_window]):
                del_index=[i for i in range(len(name_lst)) if name_lst[i] in min_freq_id]              
                if len(del_index)>0:
                    dict_del[idx]=del_index 
            for idx, order in (dict_del.items()):
                item_deleted = [i for j, i in enumerate(cohort[self.name_window][idx]) if j not in order]
                offset_deleted = [i for j, i in enumerate(cohort[self.offset_window][idx]) if j not in order]
                offset_order_deleted= [i for j, i in enumerate(cohort[self.offset_order][idx]) if j not in order]
             #value_deleted
                #measure_deleted
                cohort[self.name_window].iloc[idx] = item_deleted
                cohort[self.offset_window].iloc[idx] = offset_deleted
                cohort[self.offset_order].iloc[idx] = offset_order_deleted
        #mapping
        item_id=[]    
        for name_lst in cohort[self.name_window]:
            item_id_lst=list(pd.Series(name_lst).map(word2idx))
            item_id.append(item_id_lst)
        item_id = pd.Series(item_id)
        cohort['{}_id_{}hr'.format(self.item, str(self.window))]=item_id
        
        return cohort
     
    def arguments(self):
        return  [self.src, self.window, self.item]
    
    def __call__(self):
        cohort = self.timeoffset_window()
        cohort = self.timeoffset_timeorder(cohort)
        cohort = self.code_windowed(cohort, self.max_len, self.min_freq)
        word2idx, min_freq_id = self.make_vocab(cohort, min_freq=self.min_freq, PAD_idx=0, UNK_idx=1, MASK_idx=2, SEP_idx=3)
        cohort = self.code_to_index(cohort, word2idx, min_freq_id)
        return cohort, word2idx

In [5]:
for src in sources: 
    for item in items: 
        if src=='mimic':
            file = mimic_files[item]
        if src=='eicu':
            file = eicu_files[item]
        
        filename = '{}_{}_{}_init.pkl'.format(src,item,file)
        
        if src=='mimic' and (item=='inf' or item=='med'):
            input_dir = input_1_5_dir
        else: 
            input_dir = input_1_dir
        
        print('Reading cohort file', filename, 'from', input_dir, '...')
        cohort = pd.read_pickle(os.path.join(input_dir,filename))
        print('File read.')
        
        print('Preprocessing...') 
        df = Preprocess(cohort, src, item, window_time, UNK, max_len, min_freq)
        cohort, vocab = df()
        print('Preprocessing completed.')
        
        print('Writing', '{}_{}_{}_{}.pkl'.format(src, window_time, item, max_len), 'to', output_dir)
        cohort.to_pickle(os.path.join(output_dir,'{}_{}_{}_{}.pkl'.format(src, window_time, item, max_len)))
        
        # As an extra check: 
        print('Generated vocabulary of length', len(vocab), '\n')
        

Reading cohort file mimic_lab_LABEVENTS_init.pkl from ../../../output/PrePr1_output_Wes/ ...
File read.
Preprocessing...


  offset_order = list(pd.Series(offset).map(dict_offset))
  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Preprocessing completed.
Writing mimic_12_lab_150.pkl to ../../../output/PrePr2_output_Wes/
Generated vocabulary of length 362 

Reading cohort file mimic_med_PRESCRIPTIONS_init.pkl from ../../../output/PrePr1-5_output_Wes/ ...
File read.
Preprocessing...


  offset_order = list(pd.Series(offset).map(dict_offset))
  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
  item_id_lst=list(pd.Series(name_lst).map(word2idx))


Preprocessing completed.
Writing mimic_12_med_150.pkl to ../../../output/PrePr2_output_Wes/
Generated vocabulary of length 1933 

Reading cohort file mimic_inf_INPUTEVENTS_merged_init.pkl from ../../../output/PrePr1-5_output_Wes/ ...
File read.
Preprocessing...


  offset_order = list(pd.Series(offset).map(dict_offset))
  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Preprocessing completed.
Writing mimic_12_inf_150.pkl to ../../../output/PrePr2_output_Wes/
Generated vocabulary of length 350 

Reading cohort file eicu_lab_lab_init.pkl from ../../../output/PrePr1_output_Wes/ ...
File read.
Preprocessing...


  offset_order = list(pd.Series(offset).map(dict_offset))
  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Preprocessing completed.
Writing eicu_12_lab_150.pkl to ../../../output/PrePr2_output_Wes/
Generated vocabulary of length 136 

Reading cohort file eicu_med_medication_init.pkl from ../../../output/PrePr1_output_Wes/ ...
File read.
Preprocessing...


  offset_order = list(pd.Series(offset).map(dict_offset))
  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Preprocessing completed.
Writing eicu_12_med_150.pkl to ../../../output/PrePr2_output_Wes/
Generated vocabulary of length 962 

Reading cohort file eicu_inf_infusionDrug_init.pkl from ../../../output/PrePr1_output_Wes/ ...
File read.
Preprocessing...


  offset_order = list(pd.Series(offset).map(dict_offset))
  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
  item_id_lst=list(pd.Series(name_lst).map(word2idx))


Preprocessing completed.
Writing eicu_12_inf_150.pkl to ../../../output/PrePr2_output_Wes/
Generated vocabulary of length 565 

