# eICU Dataset Exploratory Analysis

This notebook loads the `eICU_data.csv.gz` file and provides a brief overview of its structure and key clinical features.


In [1]:
import pandas as pd
import pathlib


In [2]:
data_path = pathlib.Path('../../data/raw/eICU_data.csv.gz')
# Load compressed CSV
data = pd.read_csv(data_path)
print(data.shape)
data.head()

(1206142, 28)


Unnamed: 0,hospital_id,patient_id,hospitalization_id,recorded_dttm,ARDS_onset_time,time_from_ARDS_onset,APACHE,sex,age_at_admission,ethinicity,...,height_cm,weight_kg,nmb_used,cisatracurium_dose,vecuronium_dose,rocuronium_dose,atracurium_dose,pancuronium_dose,prone_flag,new_tracheostomy
0,2000623,11227959.0,9462077,2023-01-01 03:13:00,0,193.0,,Female,88,Caucasian,...,159.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,2000623,11227959.0,9462077,2023-01-01 06:10:00,0,370.0,,Female,88,Caucasian,...,159.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2000623,11227959.0,9462077,2023-01-01 10:03:00,0,603.0,,Female,88,Caucasian,...,159.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,2000623,11227959.0,9462077,2023-01-01 11:13:00,0,673.0,,Female,88,Caucasian,...,159.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,2000623,11227959.0,9462077,2023-01-01 11:21:00,0,681.0,,Female,88,Caucasian,...,159.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


## Column completeness

In [3]:
data.isnull().mean().sort_values(ascending=False).head(20)

pao2                    9.388505e-01
lmp_set                 9.345624e-01
spo2                    7.272245e-01
peep                    6.625820e-01
fio2_set                2.667140e-01
APACHE                  1.919641e-01
respiratory_device      1.072187e-01
ethinicity              8.230374e-03
height_cm               3.322163e-03
weight_kg               1.026413e-03
disposition_category    8.290898e-07
recorded_dttm           0.000000e+00
age_at_admission        0.000000e+00
ARDS_onset_time         0.000000e+00
time_from_ARDS_onset    0.000000e+00
sex                     0.000000e+00
hospital_id             0.000000e+00
patient_id              0.000000e+00
hospitalization_id      0.000000e+00
ecmo_flag               0.000000e+00
dtype: float64

## Patient and hospitalisation counts

In [4]:
n_patients = data['patient_id'].nunique()
n_hosp = data['hospitalization_id'].nunique()
{'unique_patients': n_patients, 'unique_hospitalizations': n_hosp}

{'unique_patients': 16269, 'unique_hospitalizations': 15498}

## Proning and outcomes

In [5]:
proning = data.groupby('hospitalization_id')['prone_flag'].max().value_counts()
proning

prone_flag
0    14660
1      838
Name: count, dtype: int64

In [6]:
outcomes = data['disposition_category'].value_counts()
outcomes.head()

disposition_category
Alive      886625
Expired    319516
Name: count, dtype: int64