# MIMIC 4 data - dataset construction prescriptions

Code adapted from the GRU-ODE-Bayes preprocessing pipeline, simplified and updated for MIMIC-IV v1.0.

In [1]:
import os
import pathlib
import pandas as pd

# Project root: the directory two levels above the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
# Display limits for rendered frames: at most 50 rows and 300 columns.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [3]:
# Root folder of the MIMIC-IV data inside the project tree.
path_data = f'{p_project}/data/mimic4'

In [4]:
# Admissions table written by an earlier preprocessing step; its hadm_id values
# define which admissions are kept below.
adm = pd.read_csv(f'{path_data}/processed/admissions_processed.csv')

In [5]:
# Load the raw prescriptions and keep only the rows whose hadm_id appears in
# the processed admissions table.
presc = pd.read_csv(f'{path_data}/raw/hosp/prescriptions.csv.gz')
adm_ids = list(adm['hadm_id'])
in_selected_admission = presc['hadm_id'].isin(adm_ids)
presc = presc[in_selected_admission]

print('Number of patients remaining in the database: ')
print(presc['subject_id'].nunique())
presc.tail()

  presc=pd.read_csv(path_data + '/raw/hosp/prescriptions.csv.gz')


Number of patients remaining in the database: 
42007


Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,formulary_drug_cd,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
15416703,19999987,23865745,92839339,19999987-25,25.0,P5542X,2145-11-03 00:00:00,2145-11-03 18:00:00,MAIN,Influenza Virus Vaccine,INFL0.5LF,69637.0,33332000000.0,0.5 mL Syringe,,0.5,mL,1,SYR,0.0,IM
15416704,19999987,23865745,92913309,19999987-150,150.0,P63OX3,2145-11-04 10:00:00,2145-11-11 17:00:00,MAIN,Levothyroxine Sodium,LEVO50,6649.0,74455210.0,50mcg Tablet,,50.0,mcg,1,TAB,1.0,PO/NG
15416705,19999987,23865745,95753195,19999987-219,219.0,P77Z1Y,2145-11-10 10:00:00,2145-11-11 17:00:00,MAIN,Duloxetine,DULO30,57892.0,2324033.0,30mg Capsule,,60.0,mg,2,CAP,1.0,PO
15416706,19999987,23865745,96343043,19999987-206,206.0,P77Z1Y,2145-11-09 10:00:00,2145-11-09 16:00:00,MAIN,Venlafaxine XR,VENL150XR,46405.0,68084050000.0,150mg XR Capsule,,150.0,mg,1,CAP,1.0,PO
15416707,19999987,23865745,98044699,19999987-207,207.0,P77Z1Y,2145-11-09 10:00:00,2145-11-09 16:00:00,MAIN,Duloxetine,DULO30,57892.0,2324033.0,30mg Capsule,,60.0,mg,2,CAP,1.0,PO


In [6]:
# Rank the drugs by the number of distinct patients who received them and
# show the n_best most widely prescribed ones.
n_best = 20
pat_for_item = presc.groupby('drug')['subject_id'].nunique()
frequent_labels = pat_for_item.sort_values(ascending=False).head(n_best)

print(frequent_labels)

drug
Sodium Chloride 0.9%  Flush    40494
0.9% Sodium Chloride           36422
Acetaminophen                  35230
Magnesium Sulfate              33800
Heparin                        32592
Potassium Chloride             32006
Docusate Sodium                29278
Bag                            29037
Senna                          28529
5% Dextrose                    27715
Insulin                        27471
Iso-Osmotic Dextrose           26991
Calcium Gluconate              26838
Dextrose 50%                   26547
Glucagon                       24690
Bisacodyl                      23709
Furosemide                     23438
Ondansetron                    22261
Metoprolol Tartrate            22096
Vancomycin                     21872
Name: subject_id, dtype: int64


In [7]:
# Keep only the prescriptions whose drug name is in the list from the paper.
drugs_list = [
    'Acetaminophen',
    'Aspirin',
    'Bisacodyl',
    'Insulin',
    'Heparin',
    'Docusate Sodium',
    'D5W',
    'Potassium Chloride',
    'Magnesium Sulfate',
    'Metoprolol Tartrate',
    'Sodium Chloride 0.9%  Flush',
    'Pantoprazole',
]
presc2 = presc[presc['drug'].isin(drugs_list)]

print('Number of patients remaining in the database: ')
print(presc2['subject_id'].nunique())

Number of patients remaining in the database: 
41996


In [8]:
# Count how often each dose unit occurs per drug, to see which units need
# harmonising.
unit_counts_by_drug = presc2.groupby('drug')['dose_unit_rx'].value_counts()
print(unit_counts_by_drug)

drug                         dose_unit_rx
Acetaminophen                mg              103508
                             g                   25
                             gm                   1
Aspirin                      mg               38558
Bisacodyl                    mg               52391
D5W                          mL               27283
                             SYR                 30
                             ml                   8
                             BTL                  2
Docusate Sodium              mg               64963
                             mL                   1
Heparin                      UNIT             77669
                             units               81
                             Units/Liter          2
                             mg                   1
Insulin                      UNIT            211028
                             units                4
Magnesium Sulfate            gm              138331
                      

In [9]:
# Units correction: make every filtered drug use a single dose unit.

# Rows without any dose unit are removed first.
presc2 = presc2.loc[presc2['dose_unit_rx'].notnull()].copy()

# For these two fluids a lower-case 'ml' is rewritten to 'mL'.
for fluid in ('D5W', 'Sodium Chloride 0.9%  Flush'):
    lowercase_ml = (presc2['drug'] == fluid) & (presc2['dose_unit_rx'] == 'ml')
    presc2.loc[lowercase_ml, 'dose_unit_rx'] = 'mL'

# The only unit kept for each drug; rows of these drugs recorded in any other
# unit are dropped. Drugs without an entry here (Aspirin and the sodium
# chloride flush) are not filtered.
required_unit = {
    'Acetaminophen': 'mg',
    'D5W': 'mL',
    'Heparin': 'UNIT',
    'Insulin': 'UNIT',
    'Magnesium Sulfate': 'gm',
    'Potassium Chloride': 'mEq',
    'Bisacodyl': 'mg',
    'Pantoprazole': 'mg',
    # added by Jingge
    'Docusate Sodium': 'mg',
    'Metoprolol Tartrate': 'mg',
}
expected_unit = presc2['drug'].map(required_unit)
wrong_unit = expected_unit.notnull() & (presc2['dose_unit_rx'] != expected_unit)
presc2 = presc2.loc[~wrong_unit].copy()

print(presc2.groupby('drug')['dose_unit_rx'].value_counts())

drug                         dose_unit_rx
Acetaminophen                mg              103508
Aspirin                      mg               38558
Bisacodyl                    mg               52391
D5W                          mL               27291
Docusate Sodium              mg               64963
Heparin                      UNIT             77669
Insulin                      UNIT            211028
Magnesium Sulfate            gm              138331
Metoprolol Tartrate          mg              129719
Pantoprazole                 mg               36093
Potassium Chloride           mEq             225718
Sodium Chloride 0.9%  Flush  mL              157273
Name: dose_unit_rx, dtype: int64


### Check for outliers

#### 1) In amounts

In [10]:
# Convert the textual `dose_val_rx` column to float.
#
# Three kinds of rows are handled:
#   * missing doses, or doses containing an apostrophe -> dropped;
#   * ranges written as "xx-yy"                        -> replaced by their mean;
#   * everything else                                  -> parsed as a number,
#     rows that cannot be parsed are dropped.
original_num_entries = len(presc2.index)

presc2 = presc2.dropna(subset=['dose_val_rx'])
presc2 = presc2.loc[~presc2['dose_val_rx'].str.contains("'")]

# Any remaining value containing '-' is treated as a range.
is_range = presc2['dose_val_rx'].str.contains("-")

# First transform the ranges (xx-yy) into the mean of their two bounds.
range_df = presc2.loc[is_range].copy()
bounds = range_df['dose_val_rx'].str.split("-")

# A missing lower bound ("-yy") counts as 0.
lower_text = bounds.str[0]
lower = lower_text.mask(lower_text == "", "0").astype(float)

# A missing upper bound ("xx-") falls back to the lower bound.
upper_text = bounds.str[1]
upper_missing = upper_text == ""
upper = upper_text.mask(upper_missing, "0").astype(float)
upper = upper.mask(upper_missing, lower)

range_df['dose_val_rx'] = (lower + upper) / 2

# Now parse the non-range entries; values that are not numbers become NaN and
# are dropped.
presc3 = presc2.loc[~is_range].copy()
presc3['dose_val_rx'] = pd.to_numeric(presc3['dose_val_rx'], errors='coerce')
presc3 = presc3.dropna(subset=['dose_val_rx'])

# DataFrame.append was deprecated and later removed from pandas; pd.concat
# stacks the two frames the same way (plain rows first, then the ranges).
presc2 = pd.concat([presc3, range_df])

print("Lost entries in the process : {}".format(original_num_entries-len(presc2.index)))


  presc2=presc3.append(range_df)


Lost entries in the process : 231


In [11]:
#To avoid confounding labels with labels from other tables, we add 'drug' to the name
presc2['charttime']=pd.to_datetime(presc2['starttime'], format='%Y-%m-%d %H:%M:%S')
presc2['drug']=presc2['drug']+' Drug'

In [12]:
# Preview the first five processed prescriptions.
presc2.head(n=5)

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,formulary_drug_cd,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route,charttime
925,10001217,24597018,1314750,10001217-117,117.0,P43NQA,2157-11-19 01:00:00,2157-11-25 22:00:00,MAIN,Sodium Chloride 0.9% Flush Drug,NACLFLUSH,,0.0,10 mL Syringe,,3.0,mL,0.3,SYR,3.0,IV,2157-11-19 01:00:00
937,10001217,24597018,33156765,10001217-120,120.0,P43NQA,2157-11-19 01:00:00,2157-11-25 22:00:00,MAIN,Bisacodyl Drug,BISA5,2947.0,536338100.0,5 mg Tab,,10.0,mg,2.0,TAB,1.0,PO,2157-11-19 01:00:00
938,10001217,24597018,35518916,10001217-119,119.0,P43NQA,2157-11-19 01:00:00,2157-11-25 22:00:00,MAIN,Docusate Sodium Drug,DOCU100,3009.0,904224500.0,100mg Capsule,,100.0,mg,1.0,CAP,2.0,PO/NG,2157-11-19 01:00:00
939,10001217,24597018,37864542,10001217-120,120.0,P43NQA,2157-11-19 01:00:00,2157-11-25 22:00:00,MAIN,Bisacodyl Drug,BISA10R,2944.0,574705000.0,10mg Suppository,,10.0,mg,1.0,SUPP,1.0,PR,2157-11-19 01:00:00
940,10001217,24597018,43331807,10001217-126,126.0,P25UJS,2157-11-21 08:00:00,2157-11-25 22:00:00,MAIN,Heparin Drug,HEPA5I,6549.0,63323030000.0,5000 Units / mL- 1mL Vial,,5000.0,UNIT,1.0,mL,3.0,SC,2157-11-21 08:00:00


In [13]:
# Write the processed prescriptions to disk; the frame's index is written as
# the first CSV column.
presc2.to_csv(f'{path_data}/processed/prescriptions_processed.csv', index=True)