# MIMIC-IV data - dataset construction: prescriptions

Code adapted from the GRU-ODE-Bayes preprocessing scripts, simplified and updated for MIMIC-IV v1.0.

In [1]:
import os
import pathlib

# Project root: the directory two levels above the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
import pandas as pd
from datetime import datetime
import numpy as np

In [3]:
# Display limits applied when pandas renders DataFrames in this notebook.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [4]:
# Input directory with the original MIMIC files, and the directory whose
# 'processed/tables' sub-folder is read from and written to below.
path_data = f'{p_project}/data/original/mimic4'
path_temp = f'{p_project}/data/mimic4'

In [5]:
# Processed admissions table; its hadm_id values define which prescriptions are kept below.
adm = pd.read_csv(f'{path_temp}/processed/tables/admissions_processed.csv')

In [6]:
# Load the raw prescriptions and keep only the rows whose admission id
# appears in the previously selected admissions.
presc = pd.read_csv(f'{path_data}/hosp/prescriptions.csv.gz')
adm_ids = list(adm['hadm_id'])
presc = presc[presc['hadm_id'].isin(adm_ids)]

print('Number of patients remaining in the database: ', presc['subject_id'].nunique(), sep='\n')
presc.tail()

  exec(code_obj, self.user_global_ns, self.user_ns)


Number of patients remaining in the database: 
44057


Unnamed: 0,subject_id,hadm_id,pharmacy_id,starttime,stoptime,drug_type,drug,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
17008042,17587395,29827111,68950197,2130-04-18 08:00:00,2130-04-18 11:00:00,MAIN,INV-Cabozantinib (Cabozantinib),70388.0,42388000000.0,20 mg Tablet,,20,mg,1,TAB,1.0,ORAL
17008043,17587395,29827111,31601255,2130-04-18 08:00:00,2130-04-23 10:00:00,MAIN,INV-Cabozantinib (Cabozantinib),70388.0,42388000000.0,20 mg Tablet,,20,mg,1,TAB,1.0,ORAL
17008044,17587395,29827111,23581378,2130-04-23 11:00:00,2130-05-21 07:00:00,MAIN,INV-Cabozantinib (Cabozantinib),70388.0,42388000000.0,20 mg Tablet,,40,mg,2,TAB,1.0,ORAL
17008049,15906963,20494713,9933337,2164-11-17 12:00:00,2164-11-18 11:00:00,MAIN,Fentanyl Citrate,41384.0,409909300.0,100mcg/2mL Amp,,25-100,mcg,0.5-2,mL,1.0,IV
17008051,17961555,22879995,50647416,2171-01-24 05:00:00,2171-02-03 20:00:00,MAIN,Acetaminophen-Caff-Butalbital,4451.0,591336900.0,1 Tablet,,1-2,TAB,1-2,TAB,,PO


In [7]:
# Rank drugs by the number of distinct patients they were prescribed to
# and keep the n_best most widely used ones.
n_best = 20
pat_for_item = presc.groupby('drug')['subject_id'].nunique()
frequent_labels = pat_for_item.sort_values(ascending=False).head(n_best)

print(frequent_labels)

drug
Sodium Chloride 0.9%  Flush    42564
0.9% Sodium Chloride           38380
Acetaminophen                  36894
Magnesium Sulfate              35644
Heparin                        34172
Potassium Chloride             33721
Bag                            30606
Docusate Sodium                30602
Senna                          29900
5% Dextrose                    29224
Insulin                        28890
Iso-Osmotic Dextrose           28389
Calcium Gluconate              28332
Dextrose 50%                   27926
Glucagon                       25967
Bisacodyl                      24802
Furosemide                     24701
Metoprolol Tartrate            23264
Ondansetron                    23252
Vancomycin                     23075
Name: subject_id, dtype: int64


In [8]:
# Keep only the prescriptions whose drug name is in the list from the paper.
drugs_list = [
    'Acetaminophen',
    'Aspirin',
    'Bisacodyl',
    'Insulin',
    'Heparin',
    'Docusate Sodium',
    'D5W',
    'Potassium Chloride',
    'Magnesium Sulfate',
    'Metoprolol Tartrate',
    'Sodium Chloride 0.9%  Flush',
    'Pantoprazole',
]
presc2 = presc[presc['drug'].isin(drugs_list)]

print('Number of patients remaining in the database: ', presc2['subject_id'].nunique(), sep='\n')

Number of patients remaining in the database: 
44045


In [9]:
# Count how often each dose unit is used per drug, to reveal inconsistent units.
unit_counts = presc2.groupby('drug')['dose_unit_rx'].value_counts()
print(unit_counts)

drug                         dose_unit_rx
Acetaminophen                mg              108526
                             g                   25
                             gm                   1
Aspirin                      mg               40510
Bisacodyl                    mg               54841
D5W                          mL               28727
                             SYR                 31
                             ml                   8
                             BTL                  2
Docusate Sodium              mg               68021
                             mL                   1
Heparin                      UNIT             81700
                             units               81
                             Units/Liter          2
                             mg                   1
Insulin                      UNIT            222294
                             units                5
Magnesium Sulfate            gm              145797
                      

In [10]:
# Units correction: keep a single dose unit per drug.
# Rows without any dose unit are discarded first.
presc2 = presc2.dropna(subset=['dose_unit_rx']).copy()

# For these two drugs, 'ml' is rewritten to 'mL' before any filtering.
ml_spelling = (
    presc2['drug'].isin(['D5W', 'Sodium Chloride 0.9%  Flush'])
    & (presc2['dose_unit_rx'] == 'ml')
)
presc2.loc[ml_spelling, 'dose_unit_rx'] = 'mL'

# The one unit retained for each drug. Drugs missing from this table
# (Aspirin, Sodium Chloride 0.9%  Flush) are not filtered by unit.
expected_unit = {
    'Acetaminophen': 'mg',
    'D5W': 'mL',
    'Heparin': 'UNIT',
    'Insulin': 'UNIT',
    'Magnesium Sulfate': 'gm',
    'Potassium Chloride': 'mEq',
    'Bisacodyl': 'mg',
    'Pantoprazole': 'mg',
    # added by Jingge
    'Docusate Sodium': 'mg',
    'Metoprolol Tartrate': 'mg',
}
required_unit = presc2['drug'].map(expected_unit)
wrong_unit = required_unit.notna() & (presc2['dose_unit_rx'] != required_unit)
presc2 = presc2.loc[~wrong_unit].copy()

print(presc2.groupby('drug')['dose_unit_rx'].value_counts())

drug                         dose_unit_rx
Acetaminophen                mg              108526
Aspirin                      mg               40510
Bisacodyl                    mg               54841
D5W                          mL               28735
Docusate Sodium              mg               68021
Heparin                      UNIT             81700
Insulin                      UNIT            222294
Magnesium Sulfate            gm              145797
Metoprolol Tartrate          mg              136834
Pantoprazole                 mg               38063
Potassium Chloride           mEq             238028
Sodium Chloride 0.9%  Flush  mL              165658
Name: dose_unit_rx, dtype: int64


### Check for outliers

#### 1) In amounts

In [11]:
# Convert the textual `dose_val_rx` column to float.
# Produces: presc2 (all rows with a numeric dose), presc3 (rows that held a single
# value), range_df (rows that held a range) and original_num_entries.
original_num_entries = len(presc2.index)

# Rows with a missing dose, or a dose containing an apostrophe, are discarded.
# na=False keeps the mask strictly boolean even if a cell is not a string.
presc2 = presc2.dropna(subset=['dose_val_rx'])
has_apostrophe = presc2['dose_val_rx'].str.contains("'", regex=False, na=False)
presc2 = presc2.loc[~has_apostrophe].copy()

# First transform the ranges (xx-yy) as the mean of the ranges.
is_range = presc2['dose_val_rx'].str.contains('-', regex=False, na=False)
range_df = presc2.loc[is_range].copy()
bounds = range_df['dose_val_rx'].str.split('-')
lower_txt = bounds.str[0]
upper_txt = bounds.str[1]
# "-yy": an empty lower bound counts as 0.
lower = lower_txt.mask(lower_txt == '', '0').astype(float)
# "xx-": an empty upper bound repeats the lower bound (the '0' is only a
# placeholder so the float conversion succeeds before it is overwritten).
upper = upper_txt.mask(upper_txt == '', '0').astype(float).mask(upper_txt == '', lower)
range_df['dose_val_rx'] = (lower + upper) / 2

# Now take the entries without a '-' and force conversion to float;
# values that cannot be parsed become NaN and are dropped.
presc3 = presc2.loc[~is_range].copy()
presc3['dose_val_rx'] = pd.to_numeric(presc3['dose_val_rx'], errors='coerce')
presc3 = presc3.dropna(subset=['dose_val_rx'])

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same row-wise stacking.
presc2 = pd.concat([presc3, range_df])

print("Lost entries in the process : {}".format(original_num_entries-len(presc2.index)))


Lost entries in the process : 247


In [12]:
# Append ' Drug' to every drug name so the labels cannot be confused with labels
# from other tables, and store the parsed start time in a 'charttime' column.
presc2['drug'] += ' Drug'
presc2['charttime'] = pd.to_datetime(presc2['starttime'], format='%Y-%m-%d %H:%M:%S')

In [13]:
# Preview the first five rows of the processed prescriptions.
presc2.head(5)

Unnamed: 0,subject_id,hadm_id,pharmacy_id,starttime,stoptime,drug_type,drug,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route,charttime
144,12315540,23248664,94413048,2172-12-03 20:00:00,2172-12-05 10:00:00,MAIN,Heparin Drug,6549,63323030000.0,5000 Units / mL- 1mL Vial,,5000.0,UNIT,1,mL,3.0,SC,2172-12-03 20:00:00
150,12315540,23248664,68412815,2172-12-03 23:00:00,2172-12-04 22:00:00,BASE,D5W Drug,1972,409792300.0,100mL Bag,,100.0,mL,100,mL,1.0,IV,2172-12-03 23:00:00
152,12315540,23248664,92297162,2172-12-03 23:00:00,2172-12-04 12:00:00,BASE,D5W Drug,1972,338001700.0,250mL Bag,,250.0,mL,250,mL,1.0,IV,2172-12-03 23:00:00
154,12315540,23248664,47490519,2172-12-03 23:00:00,2172-12-04 12:00:00,BASE,D5W Drug,1972,338001700.0,250mL Bag,,250.0,mL,250,mL,1.0,IV,2172-12-03 23:00:00
162,12315540,23248664,34102962,2172-12-04 11:00:00,2172-12-05 10:00:00,BASE,D5W Drug,1972,338001700.0,50mL Bag,,50.0,mL,50,mL,1.0,IV,2172-12-04 11:00:00


In [14]:
# Write the processed prescriptions next to the other processed tables
# (the DataFrame index is written as the first CSV column).
presc2.to_csv(f'{path_temp}/processed/tables/prescriptions_processed.csv')