In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from cleaning_utils import *

In [9]:
import os

# Folder that holds the raw CSV extracts.
raw_dir = 'data/raw'

# Helper pulled in by the star import from cleaning_utils; judging by the cell
# output it reports which raw CSV files are present -- TODO confirm.
check_raw_data()

## Read data
# Load the lab events table and pass it straight through the column-name
# standardiser (also star-imported from cleaning_utils).
labs = pd.read_csv(raw_dir + '/LABEVENTS.csv').pipe(standardise_col_names)

['ADMISSIONS.csv', 'DIAGNOSES_ICD.csv', 'ICUSTAYS.csv', 'LABEVENTS.csv', 'PATIENTS.csv']


In [14]:
## Check data: print the (rows, columns) tuple, then display the first five rows
print(labs.shape)
labs.head(n=5)

(27854055, 9)


Unnamed: 0,row_id,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag
0,281,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,
1,282,3,,50800,2101-10-12 18:17:00,ART,,,
2,283,3,,50802,2101-10-12 18:17:00,-1,-1.0,mEq/L,
3,284,3,,50804,2101-10-12 18:17:00,22,22.0,mEq/L,
4,285,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal


In [18]:
## Check dtypes
## Things to look for in the output: charttime is still `object` (not yet parsed
## as a datetime) and hadm_id is float64, which is consistent with it containing NaNs.
labs.dtypes

row_id          int64
subject_id      int64
hadm_id       float64
itemid          int64
charttime      object
value          object
valuenum      float64
valueuom       object
flag           object
dtype: object

In [16]:
## Check IDs
# Every column whose name ends in 'id' is treated as an identifier column.
id_cols = list(filter(lambda name: name.endswith('id'), labs.columns))

# For each identifier column, report how many values are missing and how many
# distinct values exist, followed by a separator line.
for id_col in id_cols:
    print(
        f'Number of NaNs for {id_col}: {labs[id_col].isna().sum()}',
        f'Number of unique values for {id_col}: {labs[id_col].nunique()}',
        '===========',
        sep='\n',
    )
    

Number of NaNs for row_id: 0
Number of unique values for row_id: 27854055
Number of NaNs for subject_id: 0
Number of unique values for subject_id: 46252
Number of NaNs for hadm_id: 5609021
Number of unique values for hadm_id: 58151
Number of NaNs for itemid: 0
Number of unique values for itemid: 726


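Missing admission IDs (`hadm_id`): 5,609,021 of the 27,854,055 lab rows (about 20%) have no `hadm_id`. These are probably outpatient entries that are not linked to a hospital admission.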

In [23]:
## Clean

## Drop rows without an admission id since we are focused on hospital cases,
## and parse charttime into a real datetime column in the same step.
print(labs.shape[0])  # row count before dropping
labs_clean = (
    labs
    .dropna(subset=['hadm_id'])
    # Build the parsed column with assign() instead of
    # `labs_clean.loc[:, 'charttime'] = ...`: a full-column .loc assignment is
    # applied in place on recent pandas versions, which can leave the column
    # as `object` dtype rather than datetime64. assign() always returns a new
    # frame with a properly typed column and never touches `labs`.
    .assign(charttime=lambda frame: pd.to_datetime(frame['charttime']))
)
print(labs_clean.shape[0])  # row count after dropping

27854055
22245034


In [27]:
## Use DuckDB to extract relevant lab events
import duckdb

# Connection created without a database path.
conn = duckdb.connect()

# Relation over the curated ICU mastersheet parquet file. The SQL cell below
# refers to it by the name `mastersheet`.
mastersheet = conn.read_parquet('data/curated/icu_admissions_patients_charlson_diagnoses.parquet')

# Make the cleaned labs DataFrame queryable from SQL as `labs_clean`.
conn.register('labs_clean', labs_clean)

Code reference: [mimic github](https://github.com/MIT-LCP/mimic-code/blob/main/mimic-iii/notebooks/first_labs.ipynb)
* mapping from ITEMID to ITEM NAME

In [37]:
## Use SQL to attach lab events to each ICU stay in the mastersheet, keeping only
##   - the lab items that are mapped to a label below, and
##   - measurements charted from 6 hours before the ICU intime up to 24 hours
##     after it (BETWEEN is inclusive on both ends).
##
## Each (icustay_id, label) pair is then collapsed into a single row holding the
## mean / min / max / count of its values (e.g. one bicarbonate mean per ICU stay).
##
## The previous per-stay dense_rank() column (nth_lab_measurement) was removed:
## nothing downstream read it, and it forced a window sort over every joined row.
## NOTE(review): `mastersheet` in the SQL is expected to resolve to the object
## bound to that Python name when this cell runs -- confirm it is still the
## parquet-backed relation (or an equivalent DataFrame) at that point.

sql_filtered_lab_events = '''
WITH raw AS ( -- lab rows joined to their ICU stay, restricted to the relevant itemids
    SELECT
        icu.subject_id,
        icu.hadm_id,
        icu.icustay_id,
        le.charttime,

        -- map itemid to a readable lab name
        CASE
            WHEN le.itemid IN (50882) THEN 'BICARBONATE'
            WHEN le.itemid IN (50885) THEN 'BILIRUBIN'
            WHEN le.itemid IN (50912) THEN 'CREATININE'
            WHEN le.itemid IN (50902,50806) THEN 'CHLORIDE'
            WHEN le.itemid IN (50809,50931) THEN 'GLUCOSE'
            WHEN le.itemid IN (50810,51221) THEN 'HEMATOCRIT'
            WHEN le.itemid IN (50811,51222) THEN 'HEMOGLOBIN'
            WHEN le.itemid IN (50813) THEN 'LACTATE'
            WHEN le.itemid IN (50960) THEN 'MAGNESIUM'
            WHEN le.itemid IN (50970) THEN 'PHOSPHATE'
            WHEN le.itemid IN (51265) THEN 'PLATELET'
            WHEN le.itemid IN (50822,50971) THEN 'POTASSIUM'
            WHEN le.itemid IN (50824,50983) THEN 'SODIUM'
            WHEN le.itemid IN (51006) THEN 'BUN'
            WHEN le.itemid IN (51300,51301) THEN 'WBC'
            ELSE NULL
        END AS label,

        le.valuenum

    FROM labs_clean le
    JOIN mastersheet icu
        ON icu.subject_id = le.subject_id
        AND icu.hadm_id = le.hadm_id
        AND le.charttime
            BETWEEN icu.intime - interval '6' hour
            AND icu.intime + interval '24' hour
    -- reduce number of rows
    WHERE
        le.itemid IN (50882,50885,50912,50902,50806,50809,50931,50810,51221,50811,51222,
          50813,50960,50970,51265,50822,50971,50824,50983,51006,51300,51301
        )
        AND le.valuenum IS NOT NULL -- nothing numeric to aggregate
        AND le.valuenum > 0 -- zero / negative values are treated as not possible
)

SELECT
    icustay_id,
    label,
    avg(valuenum) AS mean_val,
    min(valuenum) AS min_val,
    max(valuenum) AS max_val,
    count(*) AS count_val
FROM raw
WHERE label IS NOT NULL
GROUP BY icustay_id, label
'''

# Run the query and materialise the result as a pandas DataFrame in long format:
# one row per (icustay_id, label).
lab_aggregates = conn.execute(sql_filtered_lab_events).df()

# conn.sql(" select * from labs_clean limit 5")
# conn.sql(" select * from mastersheet limit 5")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [38]:
## Check: first five rows of the long-format aggregates (one row per icustay_id / label)
lab_aggregates.head(n=5)

Unnamed: 0,icustay_id,label,mean_val,min_val,max_val,count_val
0,201351,WBC,9.1,9.1,9.1,1
1,201695,BICARBONATE,11.428571,6.0,19.0,7
2,203625,SODIUM,141.5,140.0,143.0,4
3,205928,BICARBONATE,15.571429,9.0,20.0,7
4,206287,HEMATOCRIT,38.8,38.8,38.8,1


In [41]:
def pivot_to_wide(col_values, lab_aggregates):
    """Reshape one aggregate column from long to wide format.

    Parameters
    ----------
    col_values : str
        Name of the column in ``lab_aggregates`` whose values fill the wide
        table (e.g. ``'mean_val'``).
    lab_aggregates : pandas.DataFrame
        Long-format frame with ``icustay_id`` and ``label`` columns plus the
        column named by ``col_values``.

    Returns
    -------
    pandas.DataFrame or int
        A frame indexed by ``icustay_id`` with one column per distinct label,
        each named ``<col_values>_<label>``. If ``col_values`` is not a column
        of ``lab_aggregates``, a message is printed and ``0`` is returned.
        NOTE(review): the ``0`` sentinel is easy to miss for callers that
        expect a DataFrame; consider raising instead.
    """
    if col_values in lab_aggregates.columns:
        # `label` is the single column that gets spread out into many columns.
        wide = lab_aggregates.pivot(
            index='icustay_id',
            columns='label',
            values=col_values,
        )
        # Prefix each label with the statistic name so that different
        # statistics can sit side by side without column-name clashes.
        return wide.add_prefix(col_values + '_')

    print('Column name misspelled or not in df')
    return 0
    

## Pivot to wide format -- test
## Smoke test on the mean aggregate: expect one row per icustay_id and one
## `mean_val_<LABEL>` column per lab, with NaN where a stay has no value for that lab.
pivot_to_wide('mean_val', lab_aggregates)


label,mean_val_BICARBONATE,mean_val_BILIRUBIN,mean_val_BUN,mean_val_CHLORIDE,mean_val_CREATININE,mean_val_GLUCOSE,mean_val_HEMATOCRIT,mean_val_HEMOGLOBIN,mean_val_LACTATE,mean_val_MAGNESIUM,mean_val_PHOSPHATE,mean_val_PLATELET,mean_val_POTASSIUM,mean_val_SODIUM,mean_val_WBC
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
200001,28.000000,0.2,83.000000,101.000000,2.700000,87.000000,24.850000,7.550000,1.400000,2.000000,5.000000,137.500000,4.450000,139.000000,2.850000
200003,22.000000,3.6,20.000000,105.000000,0.866667,119.666667,33.433333,11.150000,3.933333,1.866667,3.366667,111.666667,3.133333,141.666667,32.966667
200006,31.500000,,13.500000,95.000000,0.950000,148.500000,28.666667,9.966667,3.100000,1.600000,3.000000,225.333333,3.933333,136.000000,7.666667
200007,23.000000,,10.333333,102.666667,0.833333,242.000000,41.250000,14.400000,,1.800000,2.400000,255.500000,4.066667,137.666667,9.550000
200009,22.500000,,15.500000,112.400000,0.500000,113.600000,27.914286,9.446154,2.000000,,,150.166667,4.540000,138.833333,12.416667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299993,30.666667,0.5,13.333333,98.000000,0.500000,106.333333,29.233333,10.300000,1.000000,2.100000,1.833333,277.000000,3.866667,134.666667,12.100000
299994,24.666667,,44.000000,102.666667,4.466667,137.333333,27.383333,9.733333,,1.750000,4.500000,275.000000,5.700000,137.000000,8.766667
299995,25.000000,0.3,8.000000,106.000000,0.850000,116.000000,40.750000,14.700000,2.000000,1.700000,3.500000,250.000000,3.660000,139.666667,22.500000
299998,23.000000,,21.000000,107.666667,1.066667,170.000000,32.716667,10.866667,2.933333,1.800000,3.550000,197.666667,4.150000,138.166667,9.500000


## Transform -- merge lab aggregates into mastersheet

In [44]:
# Load the curated mastersheet as a pandas DataFrame for the merges below.
# NOTE(review): this rebinds `mastersheet`, which an earlier cell bound to a
# DuckDB relation over the same file -- a distinct name would avoid confusion.
mastersheet = pd.read_parquet(path='data/curated/icu_admissions_patients_charlson_diagnoses.parquet')

In [46]:
# One wide table per statistic, each indexed by icustay_id with
# `<statistic>_<LABEL>` columns.
labs_pivot_mean, labs_pivot_min, labs_pivot_max, labs_pivot_count = (
    pivot_to_wide(statistic, lab_aggregates)
    for statistic in ('mean_val', 'min_val', 'max_val', 'count_val')
)

# Left-merge every wide table onto the mastersheet so that ICU stays without
# any matching lab rows are kept (their lab columns end up as NaN).
mastersheet_with_lab_aggs = (
    mastersheet
    .merge(labs_pivot_mean, how="left", on="icustay_id")
    .merge(labs_pivot_min, how="left", on="icustay_id")
    .merge(labs_pivot_max, how="left", on="icustay_id")
    .merge(labs_pivot_count, how="left", on="icustay_id")
)


In [47]:
## Check: first five rows of the mastersheet with the lab aggregate columns appended
mastersheet_with_lab_aggs.head(n=5)

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,...,count_val_GLUCOSE,count_val_HEMATOCRIT,count_val_HEMOGLOBIN,count_val_LACTATE,count_val_MAGNESIUM,count_val_PHOSPHATE,count_val_PLATELET,count_val_POTASSIUM,count_val_SODIUM,count_val_WBC
0,365,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,...,5.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0
1,366,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,...,4.0,4.0,4.0,9.0,4.0,3.0,4.0,4.0,4.0,5.0
2,367,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,...,2.0,2.0,2.0,,1.0,1.0,2.0,2.0,2.0,2.0
3,368,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,...,4.0,5.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0
4,369,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,...,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0


In [51]:
# Persist the enriched mastersheet to the curated data folder.
# NOTE(review): unlike the input file, this output path has no '.parquet'
# extension -- confirm what downstream readers expect before renaming it.
mastersheet_with_lab_aggs.to_parquet(path='data/curated/icu_admissions_patients_charlson_diagnoses_labs')

In [50]:
# Full list of column names, to confirm that the lab aggregate columns were appended.
list(mastersheet_with_lab_aggs.columns)

['row_id',
 'subject_id',
 'hadm_id',
 'icustay_id',
 'dbsource',
 'first_careunit',
 'last_careunit',
 'first_wardid',
 'last_wardid',
 'intime',
 'outtime',
 'los',
 'nth_visit_per_patient_admission',
 'admittime',
 'dischtime',
 'deathtime',
 'admission_type',
 'admission_location',
 'discharge_location',
 'insurance',
 'language',
 'religion',
 'marital_status',
 'ethnicity',
 'edregtime',
 'edouttime',
 'diagnosis',
 'hospital_expire_flag',
 'has_chartevents_data',
 'ethnicity_categorised',
 'los_admission',
 'gender',
 'dob',
 'dod',
 'dod_hosp',
 'dod_ssn',
 'expire_flag',
 'DOD_consolidated',
 'age_during_admission',
 'age_group',
 'nth_visit',
 'latest_visit',
 'time_to_icu',
 'lagged_icu_outtime',
 'icu_mortality',
 'charlson_category',
 'admission_CCI',
 'mean_val_BICARBONATE',
 'mean_val_BILIRUBIN',
 'mean_val_BUN',
 'mean_val_CHLORIDE',
 'mean_val_CREATININE',
 'mean_val_GLUCOSE',
 'mean_val_HEMATOCRIT',
 'mean_val_HEMOGLOBIN',
 'mean_val_LACTATE',
 'mean_val_MAGNESIUM',
 