In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
from cleaning_utils import *

In [None]:
import os 

raw_dir = 'data/raw'

# check_raw_data is pulled in by the star import from cleaning_utils;
# presumably it verifies the raw files are present -- TODO confirm.
check_raw_data()

## Read data: load the lab events table, then normalise its column names
labs_csv_path = f'{raw_dir}/LABEVENTS.csv'
labs = pd.read_csv(labs_csv_path)
labs = standardise_col_names(labs)

In [None]:
## Check data: print (rows, columns), then preview the first rows
print(labs.shape)
labs.head()

In [None]:
## Check dtypes: show the dtype pandas assigned to each column of labs
labs.dtypes

In [None]:
## Check IDs: for every column whose name ends in 'id',
## report how many values are missing and how many distinct values exist
id_cols = list(filter(lambda name: name.endswith('id'), labs.columns))

for id_col in id_cols:
    id_series = labs[id_col]
    n_missing = id_series.isna().sum()
    n_unique = id_series.nunique()
    print(f'Number of NaNs for {id_col}: {n_missing}')
    print(f'Number of unique values for {id_col}: {n_unique}')
    print('===========')
    

Missing admission IDs (`hadm_id`) are probably due to outpatient entries, which are not linked to a hospital admission.

In [None]:
## Clean 

## Drop admission id NaNs since we are focused on hospital cases,
## and parse charttime into a real datetime column.
print(labs.shape[0])
labs_clean = (
    labs
    .dropna(subset=['hadm_id'])
    # Use assign() rather than `.loc[:, 'charttime'] = ...`: a full-column .loc
    # assignment sets values in place (pandas >= 2.0), which keeps the original
    # object dtype instead of producing datetime64, and it can also raise
    # SettingWithCopyWarning on a frame derived from another one.
    .assign(charttime=lambda frame: pd.to_datetime(frame['charttime']))
)
print(labs_clean.shape[0])

In [None]:
## Use DuckDB to extract relevant lab events
import duckdb

# Open a connection (no database file is passed to connect())
conn = duckdb.connect()

# Expose the cleaned labs DataFrame to SQL under the name 'labs_clean'
conn.register('labs_clean', labs_clean)

# Open the curated master sheet through the same connection.
# The SQL query further down refers to it as `mastersheet`.
mastersheet_path = 'data/curated/icu_admissions_patients_charlson_diagnoses.parquet'
mastersheet = conn.read_parquet(mastersheet_path)

Code reference: [MIMIC code repository: `first_labs.ipynb`](https://github.com/MIT-LCP/mimic-code/blob/main/mimic-iii/notebooks/first_labs.ipynb)
* Source of the mapping from ITEMID to lab name used in the query below

In [None]:
## Use SQL to merge labs into the master sheet, 
## then keep only the lab items of interest 
## and keep only lab events charted between 6 hours before and 24 hours after ICU admission (intime) 

## then, for each lab for each ICU stay, aggregate the values (mean / min / max / count)
## so that there is one row per (icustay_id, label)
## (e.g. for a particular ICU stay, the bicarbonate mean becomes a single value)

sql_filtered_lab_events = '''
with RAW as ( -- raw data + add mapping for itemid and filter for relevant itemids
    SELECT 
        icu.subject_id, 
        icu.hadm_id, 
        icu.icustay_id, 
        le.charttime,
    
        -- create mapping for item id (so that item is more readable)
          CASE
            WHEN le.itemid IN (50882) THEN 'BICARBONATE'
            WHEN le.itemid IN (50885) THEN 'BILIRUBIN'
            WHEN le.itemid IN (50912) THEN 'CREATININE'
            WHEN le.itemid IN (50902,50806) THEN 'CHLORIDE'
            WHEN le.itemid IN (50809,50931) THEN 'GLUCOSE'
            WHEN le.itemid IN (50810,51221) THEN 'HEMATOCRIT'
            WHEN le.itemid IN (50811,51222) THEN 'HEMOGLOBIN'
            WHEN le.itemid IN (50813) THEN 'LACTATE'
            WHEN le.itemid IN (50960) THEN 'MAGNESIUM'
            WHEN le.itemid IN (50970) THEN 'PHOSPHATE'
            WHEN le.itemid IN (51265) THEN 'PLATELET'
            WHEN le.itemid IN (50822,50971) THEN 'POTASSIUM'
            WHEN le.itemid IN (50824,50983) THEN 'SODIUM'
            WHEN le.itemid IN (51006) THEN 'BUN'
            WHEN le.itemid IN (51300,51301) THEN 'WBC'
            ELSE NULL
          END AS label,   
    
          le.valuenum
    
    FROM labs_clean le
    JOIN mastersheet icu 
        on icu.subject_id = le.subject_id
        and icu.hadm_id = le.hadm_id
        -- time window: 6 hours before ICU intime up to 24 hours after it
        and le.charttime 
            BETWEEN icu.intime - interval '6' hour
            AND icu.intime + interval '24' hour
    -- reduce number of rows
    WHERE 
        le.itemid IN (50882,50885,50912,50902,50806,50809,50931,50810,51221,50811,51222,
          50813,50960,50970,51265,50822,50971,50824,50983,51006,51300,51301
        )
        AND le.valuenum IS NOT NULL -- remove useless info
        AND le.valuenum > 0 -- not possible
    )

SELECT 
    icustay_id, label, 
        avg(valuenum) as mean_val, 
        min(valuenum) as min_val, 
        max(valuenum) as max_val, 
        count(*) as count_val
FROM raw 
WHERE label is not null 
GROUP BY icustay_id, label 
'''

# The per-row dense_rank() window column (nth_lab_measurement) was dropped from the
# CTE: the outer SELECT never read it, so it only added a sort over the joined rows.
lab_aggregates = conn.execute(sql_filtered_lab_events).df()

# conn.sql(" select * from labs_clean limit 5")
# conn.sql(" select * from mastersheet limit 5")

In [None]:
## Check: preview the per-(icustay_id, label) aggregates returned by the query
lab_aggregates.head()

In [None]:
def pivot_to_wide(col_values, lab_aggregates):
    """Pivot long-format lab aggregates into one row per ICU stay.

    Parameters
    ----------
    col_values : str
        Name of the column in ``lab_aggregates`` whose values fill the wide
        table (e.g. ``'mean_val'``).
    lab_aggregates : pandas.DataFrame
        Long-format frame with the columns ``'icustay_id'``, ``'label'`` and
        ``col_values``; each (icustay_id, label) pair must appear at most once.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``icustay_id`` with one column per label, each column named
        ``f'{col_values}_{label}'``.

    Raises
    ------
    KeyError
        If ``col_values`` is not a column of ``lab_aggregates``.
    """
    if col_values not in lab_aggregates.columns:
        # Fail loudly here: returning a sentinel would only surface later as an
        # unrelated error when the result is passed to DataFrame.merge.
        raise KeyError(
            f"Column {col_values!r} not found in lab_aggregates; "
            f"available columns: {list(lab_aggregates.columns)}"
        )
    wide = lab_aggregates.pivot(
        index='icustay_id',
        columns='label',  # the one column that is spread out into many columns
        values=col_values,
    )
    return wide.add_prefix(col_values + '_')
    

## Pivot to wide format -- quick test on the mean values
pivot_to_wide(col_values='mean_val', lab_aggregates=lab_aggregates)


## Transform -- merge lab aggregates into mastersheet

In [None]:
# Load the curated master sheet as a pandas DataFrame for the merges below.
# NOTE: this rebinds `mastersheet`, which an earlier cell assigned from
# conn.read_parquet on the same file.
mastersheet = pd.read_parquet('data/curated/icu_admissions_patients_charlson_diagnoses.parquet')

In [None]:
# One wide table per aggregate statistic; pivot_to_wide prefixes each column
# with the statistic name, so the four tables do not clash on column names.
labs_pivot_mean, labs_pivot_min, labs_pivot_max, labs_pivot_count = (
    pivot_to_wide(stat_col, lab_aggregates)
    for stat_col in ('mean_val', 'min_val', 'max_val', 'count_val')
)

# Left-join each wide table onto the master sheet in turn, so ICU stays
# without any matching lab rows are kept (their lab columns are NaN).
mastersheet_with_lab_aggs = mastersheet
for lab_pivot in (labs_pivot_mean, labs_pivot_min, labs_pivot_max, labs_pivot_count):
    mastersheet_with_lab_aggs = mastersheet_with_lab_aggs.merge(
        lab_pivot, on="icustay_id", how="left"
    )


In [None]:
## Check: preview the master sheet after the lab aggregate columns were merged in
mastersheet_with_lab_aggs.head()

In [None]:
# Persist the master sheet enriched with the lab aggregates.
# NOTE(review): this path has no '.parquet' suffix, unlike the input file read
# earlier -- confirm what downstream readers expect before renaming it.
labs_output_path = 'data/curated/icu_admissions_patients_charlson_diagnoses_labs'
mastersheet_with_lab_aggs.to_parquet(labs_output_path)

In [None]:
# List every column name of the merged frame as a plain Python list
mastersheet_with_lab_aggs.columns.tolist()