In [1]:
import pandas as pd
import numpy as np
import pickle as pkl

# Install duckdb
!pip install duckdb --quiet
import duckdb
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Project-local helpers. Each name is imported exactly once
# (generate_series_data used to be listed twice on this line).
from utils import (
    load_file, sql_from_MIMICIII, age, ethnicity_to_ohe, generate_series_data,
    _word2pat, _map_drug_name_2_category, _site_from_specimen_type,
    split_data, data_norm, imputation_by_baseline,
)
from config import get_config
from sql import get_sql_queries
from name_keywords import get_keywords_cat
from preprocess_pipeline import preprocess_data

# Load the project configuration once; later cells read config_dict['data_paths']
# and config_dict['TASKS'] from it.
config_dict = get_config()

Mounted at /content/drive


In [2]:
# connect to MIMIC-III drive DB
# for this to work we have to add a shortcut to MIMIC-III dir to given drive

# Build the path outside the try block so a missing config key surfaces as a
# KeyError instead of being reported as a connection problem. Using different
# quote styles inside the f-string keeps this valid on Python < 3.12 as well.
data_paths = config_dict['data_paths']
mimic_db_path = f"{data_paths['DRIVE_PATH']}/{data_paths['MIMIC_DB_NAME']}"

try:
  con = duckdb.connect(mimic_db_path)
except Exception:
  print("Error: connection to MIMIC-III drive DB was unsuccessful.")
  print("Please ensure drive has a shortcut to MIMIC-III dir and that drive_path is correct.")
  # Re-raise: every later cell needs `con`, so continuing would only trade the
  # real cause for a NameError further down.
  raise
else:
  print("Connection to MIMIC-III drive DB done successfuly.")

Connection to MIMIC-III drive DB done successfuly.


In [3]:
# Read the initial cohort CSV named in the config, then reduce it to a plain
# list of subject ids. If load_file hands back None, subject_ids stays None.
cohort_df = load_file(
    config_dict['data_paths']['DATA_PATH'],
    config_dict['data_paths']['INITIAL_COHORT'],
)
subject_ids = cohort_df['subject_id'].tolist() if cohort_df is not None else None

Successfully loaded initial_cohort.csv


In [4]:
# Run the project preprocessing pipeline for the cohort, reading through the open
# DuckDB connection `con`. Judging by the progress messages it prints, it covers ICU,
# lab, vital-sign, prescription and microbiology data; the details live in
# preprocess_pipeline.preprocess_data. The returned frame `df` feeds every cell below.
df = preprocess_data(subject_ids, con)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

ICU data preprocessed successfully
Successfully loaded labs_metadata.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Laboratory test results data preprocessed successfully
Successfully loaded vital_metadata.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Vital signs data preprocessed successfully


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Prescriptions data data uploaded and feature engineering done for it successfully


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Microbiology events data uploaded and feature engineering done for it successfully
Vital signs and lab data feature engineering done successfully
Pre-process done


# Target Definition and Partition

In [5]:
# Columns kept out of the feature matrix (going by their names: admission id,
# date/time fields and the outcome columns)
non_feature_cols = [
    'hadm_id', 'dischtime', 'dod', 'dob', 'deathtime',
    'mortality', 'prolonged_stay', 'readmission', 'sec_admittime',
]
X = df.drop(columns=non_feature_cols)

# Targets: one column per configured task, with subject_id kept alongside them
y = df[['subject_id', *config_dict['TASKS']]]

# Split into Train / Validation / Test. subject_id is passed as the third argument
# (presumably the grouping key; confirm in split_data). The row counts printed by
# the next cell come out at roughly 80% / 10% / 10%.
X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, df['subject_id'])

# Standardization and Imputation

In [6]:
# excluding numerical but not continuous features (categorical and IDs):
# keep only float columns with at least 10 distinct values in TRAIN
float_cols = X_train.select_dtypes(include='float').columns.tolist()
numeric_cols = [c for c in float_cols if X_train[c].nunique() >= 10]
# 'age' is always treated as continuous. Guard against listing it twice, which
# would hand the scaler / imputer a duplicated column if 'age' is already selected.
if 'age' not in numeric_cols:
  numeric_cols.append('age')

# Detach each split from the frame it was sliced from. A helper called in this cell
# assigns columns on the frame it receives (the recorded output shows a
# SettingWithCopyWarning on `df[numeric_cols] = ...`); working on explicit copies
# makes that assignment unambiguous.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Standardization (fit scaler by TRAIN only, then reuse it for the other splits)
X_train, scaler = data_norm(X_train, numeric_cols, scaler=None)
X_test, _ = data_norm(X_test, numeric_cols, scaler=scaler)
X_val, _ = data_norm(X_val, numeric_cols, scaler=scaler)

# Imputation by first‐day baseline (calc baseline by TRAIN only)
X_train, baseline = imputation_by_baseline(X_train, numeric_cols, baseline=None)
X_test, _ = imputation_by_baseline(X_test, numeric_cols, baseline=baseline)
X_val, _ = imputation_by_baseline(X_val, numeric_cols, baseline=baseline)


def _finalize_split(split):
  """Fill remaining NaNs with 0, drop 'admittime' and reset the row index."""
  return split.fillna(0).drop(columns='admittime').reset_index(drop=True)


# other columns imputation by 0, then drop the helper timestamp column
X_train, X_test, X_val = (_finalize_split(s) for s in (X_train, X_test, X_val))

print(f"train → X: {X_train.shape}, y: {y_train.shape}")
print(f"val   → X: {X_val.shape},   y: {y_val.shape}")
print(f"test  → X: {X_test.shape},  y: {y_test.shape}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_cols] = df[numeric_cols].astype("float64")


train → X: (152694, 264), y: (152694, 4)
val   → X: (19008, 264),   y: (19008, 4)
test  → X: (19261, 264),  y: (19261, 4)


# Time Series Format

In [7]:
# Turn each split into sequence arrays plus a mask, ordered by "charttime".
# generate_series_data also returns the subject ids belonging to each sequence.
series_opts = {"time_col": "charttime"}
X_train_seq, y_train_seq, mask_train, subject_ids_train = generate_series_data(X_train, y_train, **series_opts)
X_val_seq, y_val_seq, mask_val, subject_ids_val = generate_series_data(X_val, y_val, **series_opts)
X_test_seq, y_test_seq, mask_test, subject_ids_test = generate_series_data(X_test, y_test, **series_opts)

# One report row per split: (X label, X array, y label, y array, mask array).
# Labels are kept verbatim so the printed text is unchanged.
shape_report = [
    (" X_train_seq:", X_train_seq, " y_tarin_lables:", y_train_seq, mask_train),
    (" X_val_seq:  ", X_val_seq, " y_val_lables:", y_val_seq, mask_val),
    (" X_test_seq: ", X_test_seq, " y_test_lables:", y_test_seq, mask_test),
]

print("Sequence shapes:")
for x_label, x_arr, y_label, y_arr, mask_arr in shape_report:
  print(x_label, x_arr.shape, y_label, y_arr.shape, " mask:", mask_arr.shape)

Sequence shapes:
 X_train_seq: (22108, 9, 262)  y_tarin_lables: (22108, 3)  mask: (22108, 9)
 X_val_seq:   (2764, 9, 262)  y_val_lables: (2764, 3)  mask: (2764, 9)
 X_test_seq:  (2764, 9, 262)  y_test_lables: (2764, 3)  mask: (2764, 9)


# Save Data

In [8]:
from pathlib import Path  # imported here so this cell works when applied on its own

# All artifacts go under ./data; create it up front so open(..., 'wb') cannot fail
# with FileNotFoundError on a fresh checkout / fresh Colab runtime.
out_dir = Path('data')
out_dir.mkdir(parents=True, exist_ok=True)

# (file stem, object) pairs; the stems are the file names downstream code loads,
# so they are kept exactly as they were.
datasets = [("X_train_seq", X_train_seq), ("y_train_lables", y_train_seq), ("mask_train", mask_train),
            ("X_val_seq", X_val_seq), ("y_val_lables", y_val_seq), ("mask_val", mask_val),
            ("X_test_seq", X_test_seq), ("y_test_lables", y_test_seq), ("mask_test", mask_test)]
for name, data in datasets:
  with open(out_dir / f'{name}.pkl', 'wb') as handle:
    pkl.dump(data, handle)

# Feature names: every df column that is not an id, a timestamp or a target
train_features = df.drop(columns=['subject_id','mortality','prolonged_stay','readmission','hadm_id','dischtime','dod','dob','deathtime','charttime','sec_admittime','admittime']).columns.to_list()
# The name list is derived from df, not from the saved arrays, so check that the
# two agree in width before persisting them side by side.
assert len(train_features) == X_train_seq.shape[-1], (
    f"train_features has {len(train_features)} names but X_train_seq has "
    f"{X_train_seq.shape[-1]} features"
)

# Everything needed to re-apply the same preprocessing later: the continuous
# columns, the fitted scaler, the imputation baseline, feature names and task names
models_params_dict = {"numeric_cols":numeric_cols, "scaler": scaler, "baseline": baseline, "train_features" : train_features, "tasks" : config_dict["TASKS"]}
with open(out_dir / 'models_params_dict.pkl', 'wb') as handle:
  pkl.dump(models_params_dict, handle)