# MIMIC-IV Data Extraction

This notebook runs the SQL scripts, in order, that extract the MIMIC-IV AKI cohort data in BigQuery, then downloads the final table and saves it locally as a Parquet file.

**Prerequisites:**
- Google Cloud BigQuery access
- Access to `physionet-data` project (MIMIC-IV)
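- Project `windy-forge-475207-e3` (the `derived_mimic` and `derived` datasets are created below if they do not exist yet)
- The SQL script files (`cfg_params_unified.sql`, `SQL*_MIMIC.sql`) in the notebook's working directory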

# Import libraries

In [1]:
from google.cloud import bigquery
from pandas_gbq import read_gbq
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone

# Google Cloud project that the BigQuery clients and the final download run against.
PROJECT = "windy-forge-475207-e3"
# Dataset inside PROJECT that is created below and from which the final table is read.
DATASET = "derived_mimic"

In [2]:
# Raise pandas' display limits so wide/long frames are not truncated when shown.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 200

# Create dataset

In [3]:
client = bigquery.Client(project=PROJECT)

# Make sure both output datasets exist: DATASET for the MIMIC-derived tables
# and "derived" for the unified config. Both are placed in the US location
# because MIMIC-IV is in the US region.
for dataset_name in (DATASET, "derived"):
    dataset = bigquery.Dataset(f"{PROJECT}.{dataset_name}")
    dataset.location = "US"
    # exists_ok=True keeps this cell re-runnable: no error if the dataset is already there.
    client.create_dataset(dataset, exists_ok=True)
    print(f"Dataset {dataset_name} exists (US region)")

Dataset derived_mimic exists (US region)
Dataset derived exists (US region)


# Helper function

In [4]:
def run_sql(filename: str):
    """Execute the SQL script stored in ``filename`` on BigQuery and report when it finished.

    The file is read as UTF-8 and submitted as a single query job to a
    US-located client for PROJECT. The call blocks until the job is done, then
    prints the filename with the local wall-clock completion time.
    """
    bq = bigquery.Client(project=PROJECT, location="US")
    statement = Path(filename).read_text(encoding="utf-8")
    # .result() blocks until the job has finished (and raises if it failed).
    bq.query(statement).result()
    finished_at = datetime.now(timezone.utc).astimezone()
    stamp = finished_at.strftime('%Y-%m-%d %H:%M:%S %Z')
    print(f"{filename} - Ready at: {stamp}")

## SQL0 - Config parameters

In [5]:
# Disabled: the MIMIC-specific config script is not run; the next cell runs
# cfg_params_unified.sql instead.
# NOTE(review): presumably superseded by the unified config — confirm before deleting.
#run_sql("SQL0_cfgparams_MIMIC.sql")

In [6]:
# Unified config parameters; this is the config script that actually runs
# (the SQL0_cfgparams_MIMIC.sql call is commented out).
run_sql("cfg_params_unified.sql")

cfg_params_unified.sql - Ready at: 2026-01-27 00:19:20 West-Europa (standaardtijd)


## SQL1 - Utils (visit_times, demographics, death, var_map, weight, UO rates)

In [7]:
# Utility step. run_sql waits for the BigQuery job to finish before returning,
# so each step is complete before the next cell starts.
run_sql("SQL1_utils_MIMIC.sql")

SQL1_utils_MIMIC.sql - Ready at: 2026-01-27 00:19:47 West-Europa (standaardtijd)


## SQL2 - Cohort selection (KDIGO staging)

In [8]:
# Cohort selection step.
# NOTE(review): presumably depends on the SQL1 utility output — run in order; confirm in the SQL.
run_sql("SQL2_cohort_MIMIC.sql")

SQL2_cohort_MIMIC.sql - Ready at: 2026-01-27 00:19:55 West-Europa (standaardtijd)


## SQL3 - Time grid

In [9]:
# Time-grid step.
# NOTE(review): assumed to build on the cohort from SQL2 — confirm in the SQL.
run_sql("SQL3_grid_MIMIC.sql")

SQL3_grid_MIMIC.sql - Ready at: 2026-01-27 00:20:01 West-Europa (standaardtijd)


## SQL4 - Static variables

In [10]:
# Static variables step.
run_sql("SQL4_static_MIMIC.sql")

SQL4_static_MIMIC.sql - Ready at: 2026-01-27 00:20:07 West-Europa (standaardtijd)


## SQL5 - Varying variables (labs + vitals)

In [11]:
# Varying variables step (labs and vitals, per the section heading).
run_sql("SQL5_varying_variables_MIMIC.sql")

SQL5_varying_variables_MIMIC.sql - Ready at: 2026-01-27 00:20:19 West-Europa (standaardtijd)


## SQL5.1 - Varying variables censored (stat valid)

In [12]:
# Censored variant of the varying variables.
# NOTE(review): "stat valid" in the heading is not defined in this notebook — confirm its meaning in the SQL.
run_sql("SQL5_1_varying_variables_censored_MIMIC.sql")

SQL5_1_varying_variables_censored_MIMIC.sql - Ready at: 2026-01-27 00:20:30 West-Europa (standaardtijd)


## SQL6 - Last variables per grid point

In [13]:
# Last variables per grid point.
# NOTE(review): presumably uses the time grid from SQL3 — confirm in the SQL.
run_sql("SQL6_last_variables_MIMIC.sql")

SQL6_last_variables_MIMIC.sql - Ready at: 2026-01-27 00:20:49 West-Europa (standaardtijd)


## SQL7 - Urine output

In [14]:
# Urine output step.
run_sql("SQL7_urine_output_MIMIC.sql")


SQL7_urine_output_MIMIC.sql - Ready at: 2026-01-27 00:20:56 West-Europa (standaardtijd)


## SQL8 - Renal trends

In [15]:
# Renal trends step.
run_sql("SQL8_renal_trends_MIMIC.sql")

SQL8_renal_trends_MIMIC.sql - Ready at: 2026-01-27 00:21:12 West-Europa (standaardtijd)


## SQL9 - Vasopressors

In [16]:
# Vasopressors step.
run_sql("SQL9_vasopressors_MIMIC.sql")

SQL9_vasopressors_MIMIC.sql - Ready at: 2026-01-27 00:21:18 West-Europa (standaardtijd)


## SQL10 - Mechanical ventilation

In [17]:
# Mechanical ventilation step.
run_sql("SQL10_mechanical_ventilation_MIMIC.sql")

SQL10_mechanical_ventilation_MIMIC.sql - Ready at: 2026-01-27 00:21:24 West-Europa (standaardtijd)


## SQL11 - Fluid balance

In [18]:
# Fluid balance step; the slowest script in the recorded run (about 45 s between timestamps).
run_sql("SQL11_fluid_balance_MIMIC.sql")

SQL11_fluid_balance_MIMIC.sql - Ready at: 2026-01-27 00:22:09 West-Europa (standaardtijd)


## SQL12 - GCS

In [19]:
# GCS step.
run_sql("SQL12_gcs_MIMIC.sql")

SQL12_gcs_MIMIC.sql - Ready at: 2026-01-27 00:22:18 West-Europa (standaardtijd)


## SQL14 - SOFA components

In [20]:
# SOFA component: cardiovascular ("cardio" in the script name).
run_sql("SQL14_sofa_cardio_MIMIC.sql")

SQL14_sofa_cardio_MIMIC.sql - Ready at: 2026-01-27 00:22:27 West-Europa (standaardtijd)


In [21]:
# SOFA component: coagulation ("coag" in the script name).
run_sql("SQL14_sofa_coag_MIMIC.sql")

SQL14_sofa_coag_MIMIC.sql - Ready at: 2026-01-27 00:22:33 West-Europa (standaardtijd)


In [22]:
# SOFA component: liver.
run_sql("SQL14_sofa_liver_MIMIC.sql")

SQL14_sofa_liver_MIMIC.sql - Ready at: 2026-01-27 00:22:40 West-Europa (standaardtijd)


In [23]:
# SOFA component: neurological ("neuro" in the script name).
run_sql("SQL14_sofa_neuro_MIMIC.sql")

SQL14_sofa_neuro_MIMIC.sql - Ready at: 2026-01-27 00:22:52 West-Europa (standaardtijd)


In [24]:
# SOFA component: renal.
run_sql("SQL14_sofa_renal_MIMIC.sql")

SQL14_sofa_renal_MIMIC.sql - Ready at: 2026-01-27 00:23:00 West-Europa (standaardtijd)


In [25]:
# SOFA component: respiratory ("resp" in the script name).
run_sql("SQL14_sofa_resp_MIMIC.sql")

SQL14_sofa_resp_MIMIC.sql - Ready at: 2026-01-27 00:23:11 West-Europa (standaardtijd)


## SQL15 - Final dataframe

In [26]:
# Final dataframe step.
# NOTE(review): presumably produces `grid_master_all_features`, the table downloaded below — confirm in the SQL.
run_sql("SQL15_final_dataframe_MIMIC.sql")

SQL15_final_dataframe_MIMIC.sql - Ready at: 2026-01-27 00:23:28 West-Europa (standaardtijd)


# Download and save dataset

In [27]:
# Download final dataset from BigQuery
query = f"""
SELECT * 
FROM `{PROJECT}.{DATASET}.grid_master_all_features`
"""

# Use pandas_gbq.read_gbq (already imported at the top of the notebook) rather
# than pd.read_gbq: the pandas wrapper is deprecated and emits a FutureWarning.
df = read_gbq(query, project_id=PROJECT)

# Save locally as parquet
# NOTE(review): absolute, machine-specific path kept so the output location does
# not change; consider a configurable data directory for portability.
output_path = Path(r"C:\Users\karel\Desktop\data\Thesis\Data\MIMIC_start\Full_dataset\mimic_rrt_raw.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)

print(f"Dataset saved: {output_path}")
print(f"Shape: {df.shape}")

  df = pd.read_gbq(query, project_id=PROJECT)


Dataset saved: C:\Users\karel\Desktop\data\Thesis\Data\MIMIC_start\Full_dataset\mimic_rrt_raw.parquet
Shape: (458553, 132)


# Quick summary

In [28]:
# Headline counts for the downloaded frame: size, distinct stays/subjects,
# and how many stays ended with each terminal event.
n_rows, n_cols = df.shape
n_stays = df['visit_occurrence_id'].nunique()
n_subjects = df['person_id'].nunique()

print(f"Total rows: {n_rows:,}")
print(f"Total columns: {n_cols}")
print(f"\nUnique stays: {n_stays:,}")
print(f"Unique subjects: {n_subjects:,}")
print("\nTerminal events:")
print(df['terminal_event'].value_counts())

Total rows: 458,553
Total columns: 132

Unique stays: 37,724
Unique subjects: 29,990

Terminal events:
terminal_event
discharge    32177
death         3134
rrt_start     2413
Name: count, dtype: int64
