In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import re
import time

### Outline ###
#
# 1) Concatenate PRESCRIPTIONS (mimic med) drug name, amount, units of measurement, and route into a single
#     variable (we'll replace the previous code_name with this to maintain consistency across scripts)
#         - Input: one .pkl file created by Preprocessed_1.ipynb (mimic_med_PRESCRIPTIONS_init.pkl), read from input_dir
#         - Output: one .pkl file with the same name, written to output_dir (mimic_med_PRESCRIPTIONS_init.pkl)
#
# 2) Stack the INPUTEVENTS_CV and INPUTEVENTS_MV tables row-wise (pd.concat) and report any columns that are
#     not present in both files after running through Preprocessed_1.ipynb
#         - Input: two .pkl files created by Preprocessed_1.ipynb (mimic_inf_INPUTEVENTS_CV_init.pkl; mimic_inf_INPUTEVENTS_MV_init.pkl)
#         - Output: a DataFrame named < concat >, stored in memory for (3)
#
# 3) Process INPUTEVENTS from MIMIC to approximate the eICU infusion_drug file's code_name. That is, concatenate
#     drug name, amount unit (value_uom), and rate unit (rateuom) into one variable, with each unit in parentheses
#         - Input: the < concat > DataFrame generated by (2)
#         - Output: a .pkl file to output_dir (mimic_inf_INPUTEVENTS_merged_init.pkl)
#

# input_dir:  directory the .pkl inputs of steps (1) and (2) are read from.
# output_dir: directory the .pkl outputs of steps (1) and (3) are written to. Step (1) reuses the input
#             file name, so the two directories must differ or the input would be overwritten.
input_dir = '../../../output/PrePr1_output_Wes/'
output_dir = '../../../output/PrePr1-5_output_Wes/'


In [2]:
### 1: Concat mimic med
# Rebuild PRESCRIPTIONS' code_name as "<drug> <dose> <unit> <route>" and write the frame to output_dir.
print('Beginning mimic med code_name/amount/units concatenation...')
print('Inputdir is: ', input_dir, '  ;  ', 'Output dir is: ', output_dir)

# NOTE(review): unpickling executes code embedded in the file; only load pickles produced by this pipeline.
presc = pd.read_pickle(os.path.join(input_dir, 'mimic_med_PRESCRIPTIONS_init.pkl'))

# Recall the column mapping applied to PRESCRIPTIONS upstream:
#         'PRESCRIPTIONS':{'HADM_ID':'ID','STARTDATE':'start_time', 'ENDDATE':'end_time',
#                          'DRUG':'code_name','DOSE_VAL_RX':'value','DOSE_UNIT_RX':'value_uom',
#                          'ROUTE':'route', 'DRUG_TYPE':'drug_type','FORM_VAL_DISP':'val_disp' ,
#                          'FORM_UNIT_DISP':'unit_disp'}
#
# --> We care about: code_name, value, value_uom (and route, which is appended as well)


def _join_prescription_fields(names, values, units, routes):
    """Return one '<drug> <dose> <unit> <route>' string per entry of a row's parallel sequences.

    Only the dose is passed through str(); the other three fields are concatenated as-is, so a
    non-string entry raises TypeError instead of being silently stringified.
    """
    return [name + ' ' + str(value) + ' ' + unit + ' ' + route
            for name, value, unit, route in zip(names, values, units, routes)]


presc = presc.rename(columns={'code_name': 'code_name_old'})

# Each cell of these four columns holds a sequence; walk the columns in lock-step, row by row.
new_code_names = [_join_prescription_fields(names, values, units, routes)
                  for names, values, units, routes in zip(presc['code_name_old'],
                                                          presc['value'],
                                                          presc['value_uom'],
                                                          presc['route'])]

# The list is positional. Building the Series on presc.index keeps it positional on assignment;
# a default 0..n-1 index would instead be aligned on labels and scramble/NaN-fill the column
# whenever presc is not indexed 0..n-1.
presc['code_name'] = pd.Series(new_code_names, index=presc.index)

presc = presc.drop('code_name_old', axis=1)

# to_pickle does not create missing directories.
os.makedirs(output_dir, exist_ok=True)
presc.to_pickle(os.path.join(output_dir, 'mimic_med_PRESCRIPTIONS_init.pkl'))

print('Output file: < mimic_med_PRESCRIPTIONS_init.pkl > to ', output_dir)
print('Finished mimic med code_name/amount/units concatenation.')

Beginning mimic med code_name/amount/units concatenation...
Inputdir is:  ../../../output/PrePr1_output_Wes/   ;   Output dir is:  ../../../output/PrePr1-5_output_Wes/
Output file: < mimic_med_PRESCRIPTIONS_init.pkl > to  ../../../output/PrePr1-5_output_Wes/
Finished mimic med code_name/amount/units concatenation.


In [3]:
### 2: Merge INPUTEVENTS _CV and _MV
# Stack the CareVue (CV) and MetaVision (MV) INPUTEVENTS frames and report columns that only one of them has.

print('Beginning INPUTEVENTS CV-MV merge...')

# Both pickles are big, so announce the load (it can take a few seconds locally).
print('Reading files...')
ie_cv, ie_mv = (
    pd.read_pickle(os.path.join(input_dir, pkl_name))
    for pkl_name in ('mimic_inf_INPUTEVENTS_CV_init.pkl', 'mimic_inf_INPUTEVENTS_MV_init.pkl')
)
print('Files read.')

# Row-wise stack. Each frame keeps its own row labels (no ignore_index), so labels may repeat in the
# result; columns present in only one frame are kept.
concat = pd.concat([ie_cv, ie_mv])
print('Files concatenated.')

# General practice for concatenating: list the columns of the result that a source frame lacks.
not_intersection_mv = list(filter(lambda col: col not in ie_mv.columns, concat.columns))
not_intersection_cv = list(filter(lambda col: col not in ie_cv.columns, concat.columns))

if not_intersection_mv or not_intersection_cv:
    print('Not all columns in new file shared between original files.')
    print(' Cols not in CV: ', not_intersection_cv, '\n Cols not in MV: ', not_intersection_mv)

# Uncomment if part (3) is not being run
# concat.to_pickle(os.path.join(output_dir, 'mimic_inf_INPUTEVENTS_merged_init.pkl'))
# print('Output file: < mimic_inf_INPUTEVENTS_merged_init.pkl > to ', output_dir)

print('Finished INPUTEVENTS CV-MV concatenation.')



Beginning INPUTEVENTS CV-MV merge...
Reading files...
Files read.
Files concatenated.
Not all columns in new file shared between original files.
 Cols not in CV:  ['start_time', 'end_time', 'patient_weight'] 
 Cols not in MV:  []
Finished INPUTEVENTS CV-MV concatenation.


In [4]:
### 3: Process INPUTEVENTS from MIMIC
# Rebuild code_name as "<drug> (<value_uom>) (<rateuom>)" for every entry and pickle the result.

start = time.time()
print('Beginning INPUTEVENTS_merged code_name processing...')


def _format_infusion_name(name, amount_uom, rate_uom):
    """Return '<name> (<amount_uom>) (<rate_uom>)' for one infusion entry.

    A unit whose string form is 'null' is dropped together with its parentheses. After joining,
    one pass of double-space -> single-space replacement is applied and the ends are stripped.
    """
    amount_part = ('(' + str(amount_uom) + ')').replace('(null)', '')
    rate_part = ('(' + str(rate_uom) + ')').replace('(null)', '')
    return (str(name) + ' ' + amount_part + ' ' + rate_part).replace('  ', ' ').strip()


# Each cell of these three columns holds a sequence; walk the columns in lock-step, row by row.
new_code_names = [[_format_infusion_name(name, amount_uom, rate_uom)
                   for name, amount_uom, rate_uom in zip(names, amount_uoms, rate_uoms)]
                  for names, amount_uoms, rate_uoms in zip(concat['code_name'],
                                                           concat['value_uom'],
                                                           concat['rateuom'])]

# `concat` was built with pd.concat without ignore_index, so its row labels are not 0..n-1.
# Column assignment aligns a Series on labels: a Series with a default RangeIndex would hand rows
# the names computed for *other* rows (or NaN). Building the Series on concat.index keeps the
# positional list lined up with the rows it was computed from.
concat['code_name_new'] = pd.Series(new_code_names, index=concat.index)
concat = concat.drop('code_name', axis=1)
concat = concat.rename(columns={'code_name_new': 'code_name'})
checkpoint1 = time.time()
print('New code_name generated after', round((checkpoint1-start),3), 'seconds.')


print('Writing file...') # seems that the line below takes quite a bit of time! (though, it is a 500MB file)
# to_pickle does not create missing directories.
os.makedirs(output_dir, exist_ok=True)
concat.to_pickle(os.path.join(output_dir, 'mimic_inf_INPUTEVENTS_merged_init.pkl'))

checkpoint2 = time.time()
print('Output file < mimic_inf_INPUTEVENTS_merged_init.pkl > to', output_dir, \
      'after', round((checkpoint2-start),3), 'seconds.')

print('Finished INPUTEVENTS_merged code_name processing.')

#  An alternative idea would have been to process the eICU data to extract the measurement unit information
#  and put it in a separate column as another feature. However, while this is easy for the first 100-200 rows
#  (as all measurements are contained in parentheses and can be regex-ed out), soon other drug names include
#  the same string pattern for non-measurements, making automatic identification of units quite difficult.
#  Naive code below:
#
# eicu_inf = pd.read_pickle(os.path.join(input_dir, 'eicu_inf_infusionDrug_init.pkl'))
# test_eicu = eicu_inf.copy()
# test_eicu['code_name_unit'] = pd.Series([[i[i.find('(')+1:i.find(')')] for i in test_eicu['code_name'][k]] \
#                                   for k in range(len(test_eicu.index))])
# test_eicu['mod_code_name'] = pd.Series([[re.sub(r'\ \([^)]*\)', '', i) for i in test_eicu['code_name'][k]] \
#                                   for k in range(len(test_eicu.index))])
# units = [i for k in range(len(test_eicu.index)) for i in test_eicu.code_name_unit[k]]
# print(pd.Series(units).unique())
# test_eicu.head()

Beginning INPUTEVENTS_merged code_name processing...
New code_name generated after 5.703 seconds.
Writing file...
Output file < mimic_inf_INPUTEVENTS_merged_init.pkl > to ../../../output/PrePr1-5_output_Wes/ after 116.386 seconds.
Finished INPUTEVENTS_merged code_name processing.
