# MIMIC 4 data - dataset construction chartevents

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [5]:
import os
import pathlib
import pandas as pd

p_project = str(pathlib.Path(os.getcwd()).parents[1])

In [6]:
path_mimic = p_project + '/data/mimic4'

In [7]:
adm = pd.read_csv(path_mimic + '/processed/admissions_processed.csv')

In [8]:
# only choose previously selected admission ids
adm_ids=list(adm['hadm_id'])
charts = pd.DataFrame()
for chunk in pd.read_csv(path_mimic + '/raw/icu/chartevents.csv.gz', chunksize=10000000, low_memory=False):
    chunk=chunk.loc[chunk['hadm_id'].isin(adm_ids)]
    charts = pd.concat([charts, chunk[['subject_id', 'hadm_id', 'charttime', 'itemid', 'valuenum', 'valueuom']]])
    
charts = charts.reset_index(drop=True)

In [9]:
# only choose previously selected admission ids.
print('Number of patients remaining in the database: ')
print(charts['subject_id'].nunique())

Number of patients remaining in the database: 
42087


In [10]:
# get item ids
item_id=pd.read_csv(path_mimic + '/raw/icu/d_items.csv.gz')[['itemid','label']]

# get names of administered items
charts2=pd.merge(charts, item_id, on='itemid')
charts2.head()

Unnamed: 0,subject_id,hadm_id,charttime,itemid,valuenum,valueuom,label
0,10001217,24597018,2157-11-21 19:00:00,220045,101.0,bpm,Heart Rate
1,10001217,24597018,2157-11-21 20:00:00,220045,96.0,bpm,Heart Rate
2,10001217,24597018,2157-11-21 21:00:00,220045,93.0,bpm,Heart Rate
3,10001217,24597018,2157-11-21 00:00:00,220045,98.0,bpm,Heart Rate
4,10001217,24597018,2157-11-21 08:00:00,220045,85.0,bpm,Heart Rate


In [11]:

pat_for_item=charts2.groupby('label')['subject_id'].nunique().sort_values(ascending=False)

label_counts=charts2['label'].value_counts()

In [12]:
# get only top 50 most used tests
frequent_labels1=pat_for_item[:100]
frequent_labels2=label_counts.head(200)

fre_labels = frequent_labels1.loc[frequent_labels1.index.isin(frequent_labels2.index)]

In [13]:
fre_labels.index

Index(['Heart Rate', 'O2 saturation pulseoxymetry', 'GCS - Eye Opening',
       'GCS - Verbal Response', 'GCS - Motor Response', 'Alarms On',
       'Head of Bed', 'Heart rate Alarm - High', 'Heart Rate Alarm - Low',
       'O2 Saturation Pulseoxymetry Alarm - Low', 'Respiratory Rate',
       'Skin Integrity', 'RUL Lung Sounds', 'LUL Lung Sounds',
       'O2 Saturation Pulseoxymetry Alarm - High', 'RLL Lung Sounds',
       'Skin Temperature', 'LLL Lung Sounds', 'Resp Alarm - High',
       'Resp Alarm - Low', 'Skin Condition', 'Turn', 'Braden Mobility',
       'Braden Activity', 'Braden Sensory Perception', 'Braden Moisture',
       'Abdominal Assessment', 'Braden Nutrition', 'Braden Friction/Shear',
       'Heart Rhythm', 'Parameters Checked', 'Bowel Sounds', 'Skin Color',
       'Oral Cavity', 'Activity Tolerance', 'Temperature Site',
       'SpO2 Desat Limit', 'Urine Source', 'Diet Type',
       'Pain Assessment Method', 'Potassium (serum)', 'IV/Saline lock',
       'Gait/Transferrin

In [14]:
mask_df = charts2['label'].isin(fre_labels.index)

In [15]:
mask_df.sum()

121115943

In [16]:
charts3=charts2.loc[mask_df]

In [17]:
charts3.shape

(121115943, 7)

In [18]:
charts3.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charts3.dropna(inplace=True)


In [19]:
charts3.shape

(35019071, 7)

In [20]:
charts3.to_csv(path_mimic + '/processed/charts_processed.csv')

In [21]:
charts3['label'].unique()

array(['Heart Rate', 'Non Invasive Blood Pressure systolic',
       'Non Invasive Blood Pressure diastolic',
       'Non Invasive Blood Pressure mean', 'Respiratory Rate',
       'O2 saturation pulseoxymetry', 'Temperature Fahrenheit',
       'Heart rate Alarm - High', 'Heart Rate Alarm - Low',
       'Non-Invasive Blood Pressure Alarm - High',
       'Non-Invasive Blood Pressure Alarm - Low',
       'O2 Saturation Pulseoxymetry Alarm - High',
       'O2 Saturation Pulseoxymetry Alarm - Low', 'Resp Alarm - High',
       'Resp Alarm - Low', 'SpO2 Desat Limit', 'Potassium (serum)'],
      dtype=object)

In [22]:
itemid_valueuom = charts3.groupby(["itemid"])["valueuom"].unique()

In [23]:
itemid_valueuom

itemid
220045         [bpm]
220046         [bpm]
220047         [bpm]
220179        [mmHg]
220180        [mmHg]
220181        [mmHg]
220210    [insp/min]
220277           [%]
223751        [mmHg]
223752        [mmHg]
223761          [°F]
223769           [%]
223770           [%]
224161    [insp/min]
224162    [insp/min]
226253           [%]
227442       [mEq/L]
Name: valueuom, dtype: object

: 