# MIMIC 4 data - dataset construction labevents

Code adapted from the GRU-ODE-Bayes preprocessing scripts, simplified and updated for MIMIC-IV v1.0.

In [1]:
import os
import pathlib

# Project root: the grandparent (two levels up) of the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Notebook display limits: show up to 300 columns and 50 rows of a DataFrame.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 50)

In [4]:
# Input location of the original MIMIC 4 files and output location for derived tables.
path_data = f'{p_project}/data/original/mimic4'
path_temp = f'{p_project}/data/mimic4'

In [5]:
# Load the previously processed admissions table; its hadm_id column is used
# below to select the matching lab events.
adm = pd.read_csv(f'{path_temp}/processed/tables/admissions_processed.csv')
adm.head()

Unnamed: 0.1,Unnamed: 0,hadm_id,icu_los,subject_id,anchor_age,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,0,24528534,1.587454,17867402,25,2154-03-03 03:09:00,2154-03-04 16:30:00,,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,,UNABLE TO OBTAIN,2154-03-03 02:49:00,2154-03-03 04:11:00,0
1,1,28960964,3.025625,14435996,42,2150-06-19 13:07:00,2150-06-25 15:36:00,,OBSERVATION ADMIT,TRANSFER FROM HOSPITAL,HOME,Medicaid,ENGLISH,SINGLE,WHITE,2150-06-19 11:54:00,2150-06-19 17:57:00,0
2,2,27385897,9.741725,17609946,70,2138-02-05 17:42:00,2138-02-15 11:00:00,2138-02-15 11:00:00,OBSERVATION ADMIT,EMERGENCY ROOM,DIED,Medicare,ENGLISH,SINGLE,WHITE,2138-02-05 15:44:00,2138-02-05 18:54:00,1
3,4,20817525,1.674769,12776735,72,2200-07-11 22:46:00,2200-07-19 12:00:00,,OBSERVATION ADMIT,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,MARRIED,OTHER,2200-07-11 15:27:00,2200-07-12 00:33:00,0
4,5,24283593,1.292697,10215159,67,2124-09-20 15:04:00,2124-09-26 14:30:00,,EW EMER.,EMERGENCY ROOM,HOME,Medicare,ENGLISH,DIVORCED,WHITE,2124-09-20 12:52:00,2124-09-20 17:21:00,0


In [6]:
# Stream the lab events table in chunks, keeping only rows whose hadm_id is in
# the processed admissions table and only the columns used in this notebook.
lab_columns = ['subject_id', 'hadm_id', 'charttime', 'valuenum', 'itemid']

# Loop-invariant: build the list of selected admission ids once, not per chunk.
adm_ids = list(adm['hadm_id'])

# Collect the filtered chunks and concatenate once at the end. Growing a
# DataFrame with `DataFrame.append` inside the loop re-copies all accumulated
# rows on every iteration, and the method no longer exists in pandas >= 2.0.
filtered_chunks = []
for chunk in pd.read_csv(
    path_data + '/hosp/labevents.csv.gz',
    usecols=lab_columns,  # skip parsing columns that are never used
    chunksize=500000,
):
    filtered_chunks.append(chunk.loc[chunk['hadm_id'].isin(adm_ids), lab_columns])

# pd.concat raises on an empty list, so fall back to an empty frame if the
# reader produced no chunks.
df = pd.concat(filtered_chunks) if filtered_chunks else pd.DataFrame(columns=lab_columns)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Report how many distinct patients are left after restricting the lab events
# to the previously selected admission ids.
print('Number of patients remaining in the database: ', df['subject_id'].nunique(), sep='\n')

Number of patients remaining in the database: 
43956


In [8]:
# get item ids
item_id=pd.read_csv(path_data + '/hosp/d_labitems.csv.gz')
item_id_1=item_id[['itemid','label']]
item_id_1.head()

Unnamed: 0,itemid,label
0,51905,
1,51532,11-Deoxycorticosterone
2,51957,17-Hydroxycorticosteroids
3,51958,"17-Ketosteroids, Urine"
4,52068,24 Hr


In [9]:
# get names of administered items
lab2=pd.merge(df,item_id_1,on='itemid')
lab2.head()
print('Number of patients remaining in the database: ')
print(lab2['subject_id'].nunique())

Number of patients remaining in the database: 
43956


In [10]:
# get only top 150 most used tests
n_best=150
pat_for_item=lab2.groupby('label')['subject_id'].nunique()
frequent_labels=pat_for_item.sort_values(ascending=False)[:n_best]
lab3=lab2.loc[lab2['label'].isin(list(frequent_labels.index))].copy()

print('Number of patients remaining in the database: ')
print(lab3['subject_id'].nunique())

Number of patients remaining in the database: 
43955


In [11]:
# Preview the 50 most widely used lab tests with their distinct-patient counts.
frequent_labels.iloc[:50]

label
Glucose                            43929
Sodium                             43920
Potassium                          43920
Chloride                           43919
Creatinine                         43918
Anion Gap                          43916
Urea Nitrogen                      43916
Bicarbonate                        43916
Hematocrit                         43907
Platelet Count                     43904
Hemoglobin                         43902
White Blood Cells                  43901
MCV                                43901
MCHC                               43901
MCH                                43901
RDW                                43901
Red Blood Cells                    43901
Magnesium                          43852
Phosphate                          43057
Calcium, Total                     43012
PT                                 41879
INR(PT)                            41879
PTT                                41754
pH                                 38303
Specimen T

In [12]:
# Restrict to the lab tests used in the paper (per the original note, the only
# one missing here is INR(PT)). Labels must match the `label` column exactly.
subset = [
    'Albumin',
    'Alanine Aminotransferase (ALT)',
    'Alkaline Phosphatase',
    'Anion Gap',
    'Asparate Aminotransferase (AST)',
    'Base Excess',
    'Basophils',
    'Bicarbonate',
    'Bilirubin, Total',
    'Calcium, Total',
    'Calculated Total CO2',
    'Chloride',
    'Creatinine',
    'Eosinophils',
    'Glucose',
    'Hematocrit',
    'Hemoglobin',
    'Lactate',
    'Lymphocytes',
    'MCH',
    'MCV',
    'Magnesium',
    'Monocytes',
    'Neutrophils',
    'PT',
    'PTT',
    'Phosphate',
    'Platelet Count',
    'Potassium',
    'RDW',
    'Red Blood Cells',
    'Sodium',
    'Specific Gravity',
    'Urea Nitrogen',
    'White Blood Cells',
    'pCO2',
    'pH',
    'pO2',
]

in_subset = lab3['label'].isin(subset)
lab3 = lab3[in_subset].copy()

In [13]:
# List the distinct lab test labels that remain after filtering.
pd.unique(lab3['label'])

array(['PT', 'PTT', 'Basophils', 'Eosinophils', 'Hematocrit',
       'Hemoglobin', 'Lymphocytes', 'MCH', 'MCV', 'Monocytes',
       'Neutrophils', 'RDW', 'Red Blood Cells', 'White Blood Cells',
       'Anion Gap', 'Calcium, Total', 'Chloride', 'Creatinine',
       'Magnesium', 'Phosphate', 'Potassium', 'Urea Nitrogen',
       'Base Excess', 'Calculated Total CO2', 'pCO2', 'pO2', 'Lactate',
       'Alanine Aminotransferase (ALT)', 'Alkaline Phosphatase',
       'Asparate Aminotransferase (AST)', 'Bilirubin, Total',
       'Platelet Count', 'pH', 'Bicarbonate', 'Sodium', 'Albumin',
       'Specific Gravity', 'Glucose'], dtype=object)

In [14]:
# Persist the filtered lab events (the DataFrame index is written as the first column).
lab3.to_csv(f'{path_temp}/processed/tables/lab_processed.csv')