# MIMIC 4 data - dataset construction labevents

The code is taken from the GRU-ODE-Bayes preprocessing pipeline, then simplified and adapted for MIMIC 4 (version 1.0).

In [1]:
import os
import pathlib
import pandas as pd

# Project root: the directory two levels above the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
# Notebook display limits: show at most 50 rows and 300 columns per frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [3]:
# Root folder of the MIMIC 4 data (holds the raw/ and processed/ sub-folders
# used by the cells below).
path_data = f'{p_project}/data/mimic4'

In [4]:
# Load the admissions table written by the admissions preprocessing step;
# its hadm_id column defines which admissions are kept below.
admissions_csv = path_data + '/processed/admissions_processed.csv'
adm = pd.read_csv(admissions_csv)
adm.head()

Unnamed: 0.1,Unnamed: 0,hadm_id,icu_los,subject_id,anchor_age,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,2,24597018,1.118032,10001217,55,2157-11-18 22:56:00,2157-11-25 18:00:00,,EW EMER.,P4645A,EMERGENCY ROOM,HOME HEALTH CARE,Other,?,MARRIED,WHITE,2157-11-18 17:38:00,2157-11-19 01:24:00,0
1,4,25563031,1.338588,10001725,46,2110-04-11 15:08:00,2110-04-14 15:00:00,,EW EMER.,P35SU0,PACU,HOME,Other,ENGLISH,MARRIED,WHITE,,,0
2,5,26184834,9.171817,10001884,68,2131-01-07 20:39:00,2131-01-20 05:15:00,2131-01-20 05:15:00,OBSERVATION ADMIT,P874LG,EMERGENCY ROOM,DIED,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2131-01-07 13:36:00,2131-01-07 22:13:00,1
3,6,23581541,1.314352,10002013,53,2160-05-18 07:45:00,2160-05-23 13:30:00,,SURGICAL SAME DAY ADMISSION,P47E1G,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,SINGLE,OTHER,,,0
4,8,23822395,6.178912,10002155,80,2129-08-04 12:44:00,2129-08-18 16:53:00,,EW EMER.,P05HUO,PROCEDURE SITE,CHRONIC/LONG TERM ACUTE CARE,Other,ENGLISH,MARRIED,WHITE,2129-08-04 11:00:00,2129-08-04 12:35:00,0


In [None]:
# Stream the raw lab events file in chunks and keep only the events that
# belong to one of the previously selected admissions.
lab_columns = ['subject_id', 'hadm_id', 'charttime', 'valuenum', 'itemid']

# The selected admission ids do not change between chunks, so build them once.
adm_ids = list(adm['hadm_id'])

kept_chunks = []
for chunk in pd.read_csv(path_data + '/raw/hosp/labevents.csv.gz',
                         usecols=lab_columns, chunksize=500000):
    # usecols only limits what is parsed; selecting lab_columns again fixes
    # the column order independently of the order in the file.
    kept_chunks.append(chunk.loc[chunk['hadm_id'].isin(adm_ids), lab_columns])

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration; fall back to an empty frame if the reader yielded no chunk.
df = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame(columns=lab_columns)

In [6]:
# df now holds only lab events of the previously selected admission ids;
# report how many distinct patients they cover.
print('Number of patients remaining in the database: ', df['subject_id'].nunique(), sep='\n')

Number of patients remaining in the database: 
41986


In [7]:
# Load the lab item dictionary and keep only the columns needed to map an
# itemid to its label.
item_id = pd.read_csv(path_data + '/raw/hosp/d_labitems.csv.gz')
item_id_1 = item_id.loc[:, ['itemid', 'label']]
item_id_1.head()

Unnamed: 0,itemid,label
0,50801,Alveolar-arterial Gradient
1,50802,Base Excess
2,50803,"Calculated Bicarbonate, Whole Blood"
3,50804,Calculated Total CO2
4,50805,Carboxyhemoglobin


In [8]:
# Attach the label of each lab item to every lab event.
# Inner join (the pandas default, made explicit here): events whose itemid
# has no entry in item_id_1 are dropped.
lab2 = df.merge(item_id_1, on='itemid', how='inner')

# Note: a bare `lab2.head()` in the middle of a cell is never displayed
# (only the last expression of a cell is), so it was removed as dead code.
print('Number of patients remaining in the database: ')
print(lab2['subject_id'].nunique())

Number of patients remaining in the database: 
41986


In [9]:
# Rank the lab tests by the number of distinct patients that received them
# and keep only the events of the n_best highest-ranked tests.
n_best = 150
pat_for_item = lab2.groupby('label')['subject_id'].nunique()
frequent_labels = pat_for_item.sort_values(ascending=False).iloc[:n_best]
is_frequent = lab2['label'].isin(frequent_labels.index)
lab3 = lab2[is_frequent].copy()

print('Number of patients remaining in the database: ', lab3['subject_id'].nunique(), sep='\n')

Number of patients remaining in the database: 
41985


In [10]:
# Show the 50 highest-ranked tests together with their patient counts.
frequent_labels.iloc[:50]

label
Glucose                            41959
Sodium                             41952
Potassium                          41952
Chloride                           41951
Creatinine                         41950
Urea Nitrogen                      41948
Anion Gap                          41948
Bicarbonate                        41948
Hematocrit                         41939
Platelet Count                     41936
Hemoglobin                         41934
MCHC                               41933
White Blood Cells                  41933
MCH                                41933
RDW                                41933
MCV                                41933
Red Blood Cells                    41933
Magnesium                          41884
Phosphate                          41128
Calcium, Total                     41090
PT                                 39998
INR(PT)                            39998
PTT                                39878
pH                                 36508
Specimen T

In [11]:
# Keep only the lab tests that were used in the paper (only missing is INR(PT)).
# The labels must match the item dictionary exactly, so the spelling
# 'Asparate Aminotransferase (AST)' is intentional and must not be "corrected".
subset = [
    'Albumin',
    'Alanine Aminotransferase (ALT)',
    'Alkaline Phosphatase',
    'Anion Gap',
    'Asparate Aminotransferase (AST)',
    'Base Excess',
    'Basophils',
    'Bicarbonate',
    'Bilirubin, Total',
    'Calcium, Total',
    'Calculated Total CO2',
    'Chloride',
    'Creatinine',
    'Eosinophils',
    'Glucose',
    'Hematocrit',
    'Hemoglobin',
    'Lactate',
    'Lymphocytes',
    'MCH',
    'MCV',
    'Magnesium',
    'Monocytes',
    'Neutrophils',
    'PT',
    'PTT',
    'Phosphate',
    'Platelet Count',
    'Potassium',
    'RDW',
    'Red Blood Cells',
    'Sodium',
    'Specific Gravity',
    'Urea Nitrogen',
    'White Blood Cells',
    'pCO2',
    'pH',
    'pO2',
]

in_subset = lab3['label'].isin(subset)
lab3 = lab3[in_subset].copy()

In [12]:
# Sanity check: list the distinct labels that survived the subset filter.
lab3.loc[:, 'label'].unique()

array(['Monocytes', 'Creatinine', 'Hematocrit', 'Hemoglobin', 'MCH',
       'MCV', 'Platelet Count', 'RDW', 'Red Blood Cells',
       'White Blood Cells', 'PT', 'PTT', 'Anion Gap', 'Bicarbonate',
       'Calcium, Total', 'Chloride', 'Glucose', 'Magnesium', 'Phosphate',
       'Potassium', 'Sodium', 'Urea Nitrogen', 'Basophils', 'Eosinophils',
       'Lymphocytes', 'Neutrophils', 'pH', 'Specific Gravity',
       'Alanine Aminotransferase (ALT)', 'Alkaline Phosphatase',
       'Asparate Aminotransferase (AST)', 'Bilirubin, Total',
       'Base Excess', 'Calculated Total CO2', 'Lactate', 'pCO2', 'pO2',
       'Albumin'], dtype=object)

In [13]:
# Persist the filtered lab events for the next preprocessing steps. With the
# default to_csv settings the DataFrame index is written as the first column.
lab_processed_csv = path_data + '/processed/lab_processed.csv'
lab3.to_csv(lab_processed_csv)