# Exploration of the data

In this notebook we'll take a look at the data gathered by the MIMIC-III SQL queries as well as the eICU SQL queries.

This data was gathered to train an ML model that predicts acute kidney injury (AKI).


In [1]:
# imports
import plotly.express as px
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Locations of the queried and preprocessed data, relative to the current working directory.
mimic_dir = Path.cwd().joinpath('data', 'mimiciii')
mimic_query = mimic_dir / 'queried'
mimic_prepro = mimic_dir / 'preprocessed'
eicu_dir = Path.cwd().joinpath('data', 'eicu')
eicu_query = eicu_dir / 'queried'
eicu_prepro = eicu_dir / 'preprocessed'
mimic_dir

PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii')

In [3]:
# list contents and add them in a dict for easier processing:
def get_files(database_dir):
    """Map parquet file stems to their paths, searching ``database_dir`` recursively.

    Only regular files matching ``*.parquet`` are included. If several files
    share the same stem, the one yielded last by the glob wins.
    """
    return {
        parquet_path.stem: parquet_path
        for parquet_path in database_dir.glob('**/*.parquet')
        if parquet_path.is_file()
    }
# Index the queried parquet files of both databases by file stem.
files_mimic = get_files(mimic_query)
files_eicu = get_files(eicu_query)
files_mimic

{'chart_vitals_stay': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/chart_vitals_stay.parquet'),
 'DIAGNOSES_ICD': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/DIAGNOSES_ICD.parquet'),
 'AKI_KIDIGO_STAGES_SQL_CREATININE': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/AKI_KIDIGO_STAGES_SQL_CREATININE.parquet'),
 'ADMISSIONS': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/ADMISSIONS.parquet'),
 'comorbidities': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/comorbidities.parquet'),
 'AKI_KIDIGO_STAGES_SQL': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/AKI_KIDIGO_STAGES_SQL.parquet'),
 'AKI_KIDIGO_7D_SQL': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/AKI_KIDIGO_7D_SQL.parquet'),
 'labstay': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/queried/labstay.parquet'),
 'AKI_KIDIGO_7D_SQL_CREATININE': PosixPath('/Users/wdevries/GIT/Aki-

In [4]:
# Show the eICU file index next to the MIMIC-III one displayed above.
files_eicu

{'chart_vitals_stay': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/chart_vitals_stay.parquet'),
 'DIAGNOSES_ICD': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/DIAGNOSES_ICD.parquet'),
 'AKI_KIDIGO_STAGES_SQL_CREATININE': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/AKI_KIDIGO_STAGES_SQL_CREATININE.parquet'),
 'ADMISSIONS': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/ADMISSIONS.parquet'),
 'comorbidities': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/comorbidities.parquet'),
 'AKI_KIDIGO_STAGES_SQL': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/AKI_KIDIGO_STAGES_SQL.parquet'),
 'AKI_KIDIGO_7D_SQL': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/AKI_KIDIGO_7D_SQL.parquet'),
 'labstay': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/labstay.parquet'),
 'AKI_KIDIGO_7D_SQL_CREATININE': PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/eicu/queried/AKI_

## Dataframes

We'll load the different datasets into pandas DataFrames so that they can be visualised easily:


In [5]:
def create_dataframe_dict(parquetdict: dict):
    """Load every parquet file of ``parquetdict`` into a DataFrame.

    Parameters
    ----------
    parquetdict : dict
        Maps a dataset name to the path of its parquet file.

    Returns
    -------
    tuple
        ``(frames, columns)``: two dicts with the same keys as
        ``parquetdict``, holding the loaded DataFrame and its column index
        respectively.
    """
    frames = {name: pd.read_parquet(path) for name, path in parquetdict.items()}
    columns = {name: frame.columns for name, frame in frames.items()}
    return frames, columns
# Load all queried tables of both databases, together with their column names.
mimiciii_df_dict, mimic_column_names = create_dataframe_dict(files_mimic)
eicu_df_dict, eicu_column_names = create_dataframe_dict(files_eicu)

In [6]:
# Inspect the diasbp_* columns of the MIMIC-III chart_vitals_stay table.
mimiciii_df_dict['chart_vitals_stay'][['diasbp_min','diasbp_mean','diasbp_max']]

Unnamed: 0,diasbp_min,diasbp_mean,diasbp_max
0,,,
1,24.0,58.643678,103.0
2,55.0,71.461538,86.0
3,,,
4,40.0,60.010417,88.0
...,...,...,...
60280,43.0,58.100592,111.0
60281,65.0,83.274510,172.0
60282,38.0,54.613636,69.0
60283,39.0,51.258621,95.0


In [7]:
# Same columns for the eICU chart_vitals_stay table, for comparison.
eicu_df_dict['chart_vitals_stay'][['diasbp_min','diasbp_mean','diasbp_max']]

Unnamed: 0,diasbp_min,diasbp_mean,diasbp_max
0,17.0,51.000000,84.0
1,,,
2,,,
3,30.0,50.636628,128.0
4,,,
...,...,...,...
200854,,,
200855,,,
200856,0.0,56.982063,121.0
200857,,,


In [8]:
# Column names of every queried MIMIC-III table.
mimic_column_names

{'chart_vitals_stay': Index(['subject_id', 'hadm_id', 'icustay_id', 'heartrate_min', 'heartrate_max',
        'heartrate_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean', 'diasbp_min',
        'diasbp_max', 'diasbp_mean', 'meanbp_min', 'meanbp_max', 'meanbp_mean',
        'resprate_min', 'resprate_max', 'resprate_mean', 'tempc_min',
        'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max', 'spo2_mean',
        'glucose_min', 'glucose_max', 'glucose_mean'],
       dtype='object'),
 'DIAGNOSES_ICD': Index(['subject_id', 'hadm_id', 'icd9_code', 'seq_num'], dtype='object'),
 'AKI_KIDIGO_STAGES_SQL_CREATININE': Index(['icustay_id', 'charttime', 'creat', 'aki_stage_creat', 'aki_stage'], dtype='object'),
 'ADMISSIONS': Index(['subject_id', 'hadm_id', 'icustay_id', 'admittime', 'dischtime',
        'ethnicity', 'intime', 'outtime', 'los', 'gender', 'dob', 'staytime',
        'age', 'timegoicu', 'timeinicu', 'timeaftergoicu', 'counttimesgoicu'],
       dtype='object'),
 'comorbidities': Index(['h

## Visualize parameters


### After Postgres


In [9]:
# Side-by-side box plots of one table/column pair for both databases.
# The subplot titles report the number of non-null values on each side.
i = "chart_vitals_stay"
j = "diasbp_mean"
amount_of_data_mimic = mimiciii_df_dict[i][j].count()
amount_of_data_eicu = eicu_df_dict[i][j].count()
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "mimic data amount: {}".format(amount_of_data_mimic),
        "eicu data, amount: {}".format(amount_of_data_eicu),
    ),
)
fig.add_trace(go.Box(y=mimiciii_df_dict[i][j], name='mimic', boxpoints=False), row=1, col=1)
fig.add_trace(go.Box(y=eicu_df_dict[i][j], name='eicu', boxpoints=False), row=1, col=2)
# boxmode='group': group together boxes of the different traces for each value of x
fig.update_layout(yaxis_title=j, boxmode='group')
fig.show()

In [11]:
def create_plots(df_mimic, df_eicu):
    """Write one MIMIC-vs-eICU box-plot comparison per shared column as a JPEG.

    For every table in ``df_mimic`` and every column of that table that is
    not an identifier/timestamp column (see ``columns_to_skip``), a figure
    with two box plots (MIMIC left, eICU right) is written to
    ``<cwd>/data/comparison/<table>_<column>.jpeg``. The subplot titles
    report the number of non-null values on each side.

    Parameters
    ----------
    df_mimic : dict
        Maps a table name to the MIMIC-III DataFrame of that table.
    df_eicu : dict
        Maps a table name to the eICU DataFrame of that table. Tables or
        columns that are missing here are reported and skipped.

    Returns
    -------
    None
    """
    columns_to_skip = ['hadm_id','icustay_id','charttime','subject_id','seq_num','admittime']
    root_dir = Path.cwd() / 'data' / 'comparison'
    try:
        root_dir.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        print("Folder is already there")
    else:
        # The original chained `.mkdir(...)` onto print(), which returns None
        # and therefore raised AttributeError whenever the folder was new.
        print("Folder was created")

    for i in df_mimic:
        for j in df_mimic[i]:
            print(i,j)
            if j in columns_to_skip:
                continue
            # A table or column that only exists in MIMIC cannot be compared.
            if i not in df_eicu or j not in df_eicu[i].columns:
                print("skipping {}".format(j))
                continue
            amount_of_data_mimic = df_mimic[i][j].count()
            amount_of_data_eicu = df_eicu[i][j].count()
            fig = make_subplots(
                rows=1,
                cols=2,
                subplot_titles=(
                    "mimic data amount: {}".format(amount_of_data_mimic),
                    "eicu data, amount: {}".format(amount_of_data_eicu),
                ),
            )
            fig.add_trace(go.Box(y=df_mimic[i][j], name='mimic', boxpoints=False), row=1, col=1)
            fig.add_trace(go.Box(y=df_eicu[i][j], name='eicu', boxpoints=False), row=1, col=2)
            fig.update_layout(
                yaxis_title=j,
                boxmode='group' # group together boxes of the different traces for each value of x
            )
            fig.write_image(root_dir / str(i+"_"+j+".jpeg"))
# Write the comparison plots for all queried tables to data/comparison.
create_plots(mimiciii_df_dict, eicu_df_dict)

Folder is already there
chart_vitals_stay subject_id
chart_vitals_stay hadm_id
chart_vitals_stay icustay_id
chart_vitals_stay heartrate_min
chart_vitals_stay heartrate_max
chart_vitals_stay heartrate_mean
chart_vitals_stay sysbp_min
chart_vitals_stay sysbp_max
chart_vitals_stay sysbp_mean
chart_vitals_stay diasbp_min
chart_vitals_stay diasbp_max
chart_vitals_stay diasbp_mean
chart_vitals_stay meanbp_min
chart_vitals_stay meanbp_max
chart_vitals_stay meanbp_mean
chart_vitals_stay resprate_min
chart_vitals_stay resprate_max
chart_vitals_stay resprate_mean
chart_vitals_stay tempc_min
chart_vitals_stay tempc_max
chart_vitals_stay tempc_mean
chart_vitals_stay spo2_min
chart_vitals_stay spo2_max
chart_vitals_stay spo2_mean
chart_vitals_stay glucose_min
chart_vitals_stay glucose_max
chart_vitals_stay glucose_mean
DIAGNOSES_ICD subject_id
DIAGNOSES_ICD hadm_id
DIAGNOSES_ICD icd9_code
DIAGNOSES_ICD seq_num
AKI_KIDIGO_STAGES_SQL_CREATININE icustay_id
AKI_KIDIGO_STAGES_SQL_CREATININE charttime
AK

### After preprocessing


In [12]:
# Index the preprocessed parquet files of both databases by file stem and
# show the path of one preprocessed MIMIC-III dataset as a sanity check.
eicu_prepro_files = get_files(eicu_prepro)
mimic_prepro_files = get_files(mimic_prepro)
mimic_prepro_files['INFO_DATASET_7days_creatinine2']

PosixPath('/Users/wdevries/GIT/Aki-Predictor/data/mimiciii/preprocessed/INFO_DATASET_7days_creatinine2.parquet')

In [13]:
# Load all preprocessed datasets of both databases and preview one MIMIC-III dataset.
mimic_prepro_df_dict, mimic_prepro_column_names = create_dataframe_dict(mimic_prepro_files)
eicu_prepro_df_dict,eicu_prepro_column_names = create_dataframe_dict(eicu_prepro_files)
mimic_prepro_df_dict['INFO_DATASET_7days_creatinine2']

Unnamed: 0_level_0,SUBJECT_ID_x,HADM_ID_x,ICUSTAY_ID,ADMITTIME,DISCHTIME,ETHNICITY,INTIME,OUTTIME,LOS,GENDER,...,PERIPHERAL_VASCULAR,RENAL_FAILURE,HYPERTENSION,DIABETES_UNCOMPLICATED,DIABETES_COMPLICATED,HYPOTHYROIDISM,LIVER_DISEASE,OBESITY,ALCOHOL_ABUSE,DRUG_ABUSE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,145834,211552,2101-10-20 19:08:00,2101-10-31 13:58:00,WHITE,2101-10-20 19:10:11,2101-10-26 20:43:09,6.0646,M,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,185777,294638,2191-03-16 00:28:00,2191-03-23 18:41:00,WHITE,2191-03-16 00:29:31,2191-03-17 16:46:31,1.6785,F,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,6,107064,228232,2175-05-30 07:15:00,2175-06-15 16:00:00,WHITE,2175-05-30 21:30:54,2175-06-03 13:39:54,3.6729,F,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,150750,220597,2149-11-09 13:06:00,2149-11-14 10:15:00,UNKNOWN/NOT SPECIFIED,2149-11-09 13:07:02,2149-11-14 20:52:14,5.3231,M,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,194540,229441,2178-04-16 06:18:00,2178-05-11 19:00:00,WHITE,2178-04-16 06:19:32,2178-04-17 20:21:05,1.5844,F,...,,,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53510,99985,176670,279638,2181-01-27 02:47:00,2181-02-12 17:05:00,WHITE,2181-01-29 05:33:34,2181-02-09 12:45:20,11.2998,M,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
53511,99991,151118,226241,2184-12-24 08:30:00,2185-01-05 12:15:00,WHITE,2184-12-28 17:30:58,2184-12-31 20:56:20,3.1426,M,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
53512,99992,197084,242052,2144-07-25 18:03:00,2144-07-28 17:56:00,WHITE,2144-07-25 18:04:42,2144-07-27 17:27:55,1.9745,F,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53513,99995,137810,229633,2147-02-08 08:00:00,2147-02-11 13:15:00,WHITE,2147-02-08 13:53:58,2147-02-10 17:46:30,2.1615,F,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
def create_plots_prepro(df_mimic_dict:dict, df_eicu_dict:dict):
    """Write one MIMIC-vs-eICU box-plot comparison per shared preprocessed column.

    For every dataset in ``df_mimic_dict`` and every column of that dataset
    that is not an identifier/timestamp column, a figure with two box plots
    (MIMIC left, eICU right) is written to
    ``<cwd>/data/comparison/preprocessing/<dataset>_<column>.jpeg``. The
    subplot titles report the number of non-null values on each side.

    Parameters
    ----------
    df_mimic_dict : dict
        Maps a dataset name to the preprocessed MIMIC-III DataFrame.
    df_eicu_dict : dict
        Maps a dataset name to the preprocessed eICU DataFrame. Datasets or
        columns that are missing here are reported and skipped.

    Returns
    -------
    None
    """
    columns_to_skip = {'hadm_id','icustay_id','charttime','subject_id','seq_num','admittime'}
    root_dir = Path.cwd() / 'data' / 'comparison' / 'preprocessing'
    try:
        root_dir.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        print("Folder is already there")
    else:
        print("Folder was created")

    for i in df_mimic_dict:
        for j in df_mimic_dict[i]:
            print("{} ----> {}".format(i,j))
            # The preprocessed frames use upper-case column names, so the
            # skip list is matched case-insensitively.
            # NOTE(review): merge-suffixed names such as SUBJECT_ID_x are
            # still not matched by the skip list - confirm whether they
            # should be excluded as well.
            if str(j).lower() in columns_to_skip:
                continue
            # A dataset or column that only exists in MIMIC cannot be compared.
            if i not in df_eicu_dict or j not in df_eicu_dict[i].columns:
                print("skipping {}".format(j))
                continue
            amount_of_data_mimic = df_mimic_dict[i][j].count()
            amount_of_data_eicu = df_eicu_dict[i][j].count()
            fig = make_subplots(
                rows=1,
                cols=2,
                subplot_titles=(
                    "mimic data amount: {}".format(amount_of_data_mimic),
                    "eicu data, amount: {}".format(amount_of_data_eicu),
                ),
            )
            fig.add_trace(go.Box(y=df_mimic_dict[i][j], name='mimic', boxpoints=False), row=1, col=1)
            fig.add_trace(go.Box(y=df_eicu_dict[i][j], name='eicu', boxpoints=False), row=1, col=2)
            fig.update_layout(
                yaxis_title=j,
                boxmode='group' # group together boxes of the different traces for each value of x
            )
            fig.write_image(root_dir / str(i+"_"+j+".jpeg"))
# Write the comparison plots for the preprocessed datasets to data/comparison/preprocessing.
create_plots_prepro(df_mimic_dict=mimic_prepro_df_dict, df_eicu_dict=eicu_prepro_df_dict)

Folder is already there
INFO_DATASET_7days_creatinine2 ----> SUBJECT_ID_x
INFO_DATASET_7days_creatinine2 ----> HADM_ID_x
INFO_DATASET_7days_creatinine2 ----> ICUSTAY_ID
INFO_DATASET_7days_creatinine2 ----> ADMITTIME
skipping ADMITTIME
INFO_DATASET_7days_creatinine2 ----> DISCHTIME
skipping DISCHTIME
INFO_DATASET_7days_creatinine2 ----> ETHNICITY
INFO_DATASET_7days_creatinine2 ----> INTIME
skipping INTIME
INFO_DATASET_7days_creatinine2 ----> OUTTIME
skipping OUTTIME
INFO_DATASET_7days_creatinine2 ----> LOS
skipping LOS
INFO_DATASET_7days_creatinine2 ----> GENDER
INFO_DATASET_7days_creatinine2 ----> DOB
skipping DOB
INFO_DATASET_7days_creatinine2 ----> STAYTIME
skipping STAYTIME
INFO_DATASET_7days_creatinine2 ----> AGE
INFO_DATASET_7days_creatinine2 ----> TIMEGOICU
skipping TIMEGOICU
INFO_DATASET_7days_creatinine2 ----> TIMEINICU
skipping TIMEINICU
INFO_DATASET_7days_creatinine2 ----> TIMEAFTERGOICU
skipping TIMEAFTERGOICU
INFO_DATASET_7days_creatinine2 ----> COUNTTIMESGOICU
INFO_DATASET

## Data balance

We'll load the preprocessed datasets and plot the number of samples we have per class.


In [None]:
# Convert the preprocessed eICU CSV exports to parquet. `get_files` only globs
# for *.parquet, so these files are what makes the preprocessed eICU datasets
# visible to it. pandas opens (and closes) the files itself and decodes them
# as UTF-8, so no explicit `open` is needed.
eicu_creatinine = pd.read_csv(eicu_prepro / "INFO_DATASET_7days_creatinine2.csv")
eicu_creatinine.to_parquet(eicu_prepro / "INFO_DATASET_7days_creatinine2.parquet")
temp = pd.read_csv(eicu_prepro / "INFO_DATASET_7days_creatinine+urine2.csv")
temp.to_parquet(eicu_prepro / "INFO_DATASET_7days_creatinine+urine2.parquet")

# Load the preprocessed MIMIC-III creatinine dataset; the class-balance cells
# below read `mimic_creatinine`, which was otherwise never defined.
mimic_creatinine = pd.read_parquet(mimic_prepro / "INFO_DATASET_7days_creatinine2.parquet")
mimic_creatinine.head(3)


In [None]:
# Number of rows per AKI_STAGE_7DAY value.
mimic_creatinine.groupby('AKI_STAGE_7DAY')['AKI_STAGE_7DAY'].count()

In [None]:
# Bar chart of the number of non-null ICUSTAY_ID values per AKI_STAGE_7DAY value.
mimic_creatinine.groupby('AKI_STAGE_7DAY')['ICUSTAY_ID'].count().plot.bar()