# Goal

Our goal is to visualize hospitalizations by major diagnostic category (MDC) using a lab value such as glucose, following the layout of the Mosaic "Observable Latency" example:

https://idl.uw.edu/mosaic-framework-example/observable-latency

In [1]:
# Load duckdb, which lets us efficiently query large files
import duckdb

# Load pandas, which lets us manipulate dataframes
import pandas as pd

# Load the jupysql Jupyter extension, which provides the %sql / %%sql magics used for SQL cells
%load_ext sql

# Set configurations on jupysql to directly output data to Pandas and to simplify the output that is printed to the notebook.
%config SqlMagic.autopandas = True

# The next two settings turn off the feedback and displaycon options.
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect jupysql to DuckDB using a SQLAlchemy-style connection string.
# `:memory:` selects an in-memory DuckDB; a file path could be given instead for a file-backed db.
%sql duckdb:///:memory:

In [22]:
%%capture
%%sql
WITH glucose_measurements AS (
    -- Lab events whose dictionary label is exactly Glucose, one row per measurement.
    -- Rows without a numeric value are dropped, matching the other glucose CTEs
    -- in this notebook that carry the same filter.
    -- NOTE(review) the match is by label text only - confirm it selects the intended lab item(s).
    SELECT
        le.subject_id,
        le.hadm_id,
        le.valuenum  AS glucose_value,
        le.charttime AS measurement_time
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/labevents.parquet') AS le
    JOIN read_parquet('~/data/physionet.org/processed/mimiciv/hosp/d_labitems.parquet') AS li
        ON le.itemid = li.itemid
    WHERE li.label = 'Glucose'
      AND le.valuenum IS NOT NULL
),
drgcodes AS (
    -- DRG rows limited to drg_type HCFA.
    SELECT subject_id, hadm_id, drg_type, drg_code
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/drgcodes.parquet')
    WHERE drg_type = 'HCFA'
)
SELECT
    gm.subject_id,
    gm.hadm_id,
    gm.glucose_value,
    gm.measurement_time,
    dc.drg_type,
    dc.drg_code
FROM glucose_measurements AS gm
-- Attach the DRG row(s) of the same subject and admission to every measurement.
-- NOTE(review) an admission with several HCFA rows would repeat its measurements - confirm uniqueness.
JOIN drgcodes AS dc
    ON gm.subject_id = dc.subject_id
   AND gm.hadm_id = dc.hadm_id
ORDER BY gm.subject_id, gm.hadm_id, gm.measurement_time

In [23]:
%%capture
%%sql
WITH glucose_measurements AS (
    -- Glucose lab events that carry a non-null numeric value.
    SELECT
        le.subject_id,
        le.hadm_id,
        le.valuenum AS glucose_value
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/labevents.parquet') AS le
    JOIN read_parquet('~/data/physionet.org/processed/mimiciv/hosp/d_labitems.parquet') AS li
        ON le.itemid = li.itemid
    WHERE li.label = 'Glucose'
      AND le.valuenum IS NOT NULL
),
drgcodes AS (
    -- DRG rows limited to drg_type HCFA.
    SELECT subject_id, hadm_id, drg_type, drg_code
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/drgcodes.parquet')
    WHERE drg_type = 'HCFA'
),
ms_drg_to_mdc AS (
    -- Lookup used to translate a DRG code into an MDC number.
    SELECT ms_drg, mdc
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/ms_drg_to_mdc.parquet')
),
mdc_dictionary AS (
    -- Lookup from MDC number to its description.
    SELECT mdc_number, mdc_description
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/mdc_dictionary.parquet')
),
mdc_counts AS (
    -- Number of joined measurement rows per MDC, keeping MDCs with at least 20 rows.
    SELECT
        dm.mdc,
        COUNT(*) AS mdc_count
    FROM glucose_measurements AS gm
    JOIN drgcodes AS dc
        ON gm.subject_id = dc.subject_id
       AND gm.hadm_id = dc.hadm_id
    LEFT JOIN ms_drg_to_mdc AS dm
        ON CAST(dc.drg_code AS INTEGER) = dm.ms_drg
    GROUP BY dm.mdc
    HAVING COUNT(*) >= 20
)
SELECT
    gm.subject_id,
    gm.hadm_id,
    gm.glucose_value,
    dc.drg_type,
    dc.drg_code,
    dm.mdc,
    dd.mdc_description
FROM glucose_measurements AS gm
JOIN drgcodes AS dc
    ON gm.subject_id = dc.subject_id
   AND gm.hadm_id = dc.hadm_id
LEFT JOIN ms_drg_to_mdc AS dm
    ON CAST(dc.drg_code AS INTEGER) = dm.ms_drg
LEFT JOIN mdc_dictionary AS dd
    ON dm.mdc = dd.mdc_number
-- Keep only rows whose MDC passed the count threshold. Rows with no MDC match
-- are dropped here as well, because a NULL mdc never compares equal.
INNER JOIN mdc_counts AS kc
    ON dm.mdc = kc.mdc
ORDER BY gm.subject_id, gm.hadm_id, gm.glucose_value

In [13]:
# Bind the result of the query cell above to a name.
# `_` only holds the last value IPython actually displayed, and the query cell
# above is wrapped in %%capture, so on a fresh Restart & Run All `_` may not be
# that result. Fail fast with an actionable message instead of carrying a
# non-DataFrame into the cells that follow.
df = _
if not isinstance(df, pd.DataFrame):
    raise TypeError(
        "Expected `_` to hold the query result as a pandas DataFrame, got "
        f"{type(df).__name__}. Re-run the query cell without %%capture, or "
        "assign its result directly with `%%sql df <<`."
    )

In [15]:
# How many rows of `df` fall under each MDC description, most frequent first
df.loc[:, "mdc_description"].value_counts()

mdc_description
Diseases and disorders of the circulatory system                                   577230
Diseases and disorders of the digestive system                                     259581
Infectious and parasitic diseases, systemic or unspecified sites                   241405
Diseases and disorders of the nervous system                                       211832
Diseases and disorders of the respiratory system                                   197244
Diseases and disorders of the hepatobiliary system and pancreas                    170848
Diseases and disorders of the musculoskeletal system and connective tissue         154653
Myeloproliferative diseases and disorders, poorly differentiated neoplasms         122313
Diseases and disorders of the kidney and urinary tract                             120123
Endocrine, nutritional and metabolic diseases and disorders                         76757
Injuries, poisonings and toxic effects of drugs                                     

# Aligning charttimes to the last measurement for every hospitalization

In [24]:
%%capture
%%sql
WITH glucose_measurements AS (
    -- Glucose lab events with their chart time. Rows without a numeric value are
    -- excluded, as in the other glucose CTEs of this notebook that carry the same
    -- filter, so such rows can no longer define the last measurement time below.
    SELECT
        le.subject_id,
        le.hadm_id,
        le.valuenum  AS glucose_value,
        le.charttime AS measurement_time
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/labevents.parquet') AS le
    JOIN read_parquet('~/data/physionet.org/processed/mimiciv/hosp/d_labitems.parquet') AS li
        ON le.itemid = li.itemid
    WHERE li.label = 'Glucose'
      AND le.valuenum IS NOT NULL
),
drgcodes AS (
    -- DRG rows limited to drg_type HCFA.
    SELECT subject_id, hadm_id, drg_type, drg_code
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/drgcodes.parquet')
    WHERE drg_type = 'HCFA'
),
last_measurement_times AS (
    -- Latest glucose chart time per (subject_id, hadm_id).
    SELECT
        subject_id,
        hadm_id,
        MAX(measurement_time) AS last_measurement_time
    FROM glucose_measurements
    GROUP BY subject_id, hadm_id
)
SELECT
    gm.subject_id,
    gm.hadm_id,
    gm.glucose_value,
    gm.measurement_time,
    -- Seconds from this measurement to the last one of the same admission, added
    -- to a fixed anchor timestamp. The last measurement lands exactly on the
    -- anchor and earlier measurements land after it.
    -- NOTE(review) this mirrors chronological order - confirm that is intended.
    TIMESTAMP '2000-01-01 00:00:00'
        + INTERVAL (DATEDIFF('second', gm.measurement_time, lt.last_measurement_time)) SECOND
        AS relative_measurement_time,
    dc.drg_type,
    dc.drg_code
FROM glucose_measurements AS gm
JOIN drgcodes AS dc
    ON gm.subject_id = dc.subject_id
   AND gm.hadm_id = dc.hadm_id
JOIN last_measurement_times AS lt
    ON gm.subject_id = lt.subject_id
   AND gm.hadm_id = lt.hadm_id
ORDER BY gm.subject_id, gm.hadm_id, gm.measurement_time

In [41]:
%%capture
%%sql

WITH glucose_measurements AS (
    -- Glucose lab events that carry a non-null numeric value, with chart time.
    SELECT
        le.subject_id,
        le.hadm_id,
        le.valuenum  AS glucose_value,
        le.charttime AS measurement_time
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/labevents.parquet') AS le
    JOIN read_parquet('~/data/physionet.org/processed/mimiciv/hosp/d_labitems.parquet') AS li
        ON le.itemid = li.itemid
    WHERE li.label = 'Glucose'
      AND le.valuenum IS NOT NULL
),
drgcodes AS (
    -- DRG rows limited to drg_type HCFA.
    SELECT subject_id, hadm_id, drg_type, drg_code
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/drgcodes.parquet')
    WHERE drg_type = 'HCFA'
),
ms_drg_to_mdc AS (
    -- Lookup used to translate a DRG code into an MDC number.
    SELECT ms_drg, mdc
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/ms_drg_to_mdc.parquet')
),
mdc_dictionary AS (
    -- Lookup from MDC number to its description.
    SELECT mdc_number, mdc_description
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/mdc_dictionary.parquet')
),
mdc_counts AS (
    -- Number of joined measurement rows per MDC, keeping MDCs with at least 20 rows.
    SELECT
        dm.mdc,
        COUNT(*) AS mdc_count
    FROM glucose_measurements AS gm
    JOIN drgcodes AS dc
        ON gm.subject_id = dc.subject_id
       AND gm.hadm_id = dc.hadm_id
    LEFT JOIN ms_drg_to_mdc AS dm
        ON CAST(dc.drg_code AS INTEGER) = dm.ms_drg
    GROUP BY dm.mdc
    HAVING COUNT(*) >= 20
),
last_measurement_times AS (
    -- Latest glucose chart time per (subject_id, hadm_id).
    SELECT
        subject_id,
        hadm_id,
        MAX(measurement_time) AS last_measurement_time
    FROM glucose_measurements
    GROUP BY subject_id, hadm_id
)
SELECT
    gm.subject_id,
    gm.hadm_id,
    gm.glucose_value,
    gm.measurement_time,
    -- Seconds from this measurement to the last one of the same admission, added
    -- to a fixed anchor timestamp. The last measurement lands exactly on the
    -- anchor and earlier measurements land after it.
    -- NOTE(review) this mirrors chronological order - confirm that is intended.
    TIMESTAMP '2000-01-01 00:00:00'
        + INTERVAL (DATEDIFF('second', gm.measurement_time, lt.last_measurement_time)) SECOND
        AS relative_measurement_time,
    dc.drg_type,
    dc.drg_code,
    dm.mdc,
    dd.mdc_description
FROM glucose_measurements AS gm
JOIN drgcodes AS dc
    ON gm.subject_id = dc.subject_id
   AND gm.hadm_id = dc.hadm_id
LEFT JOIN ms_drg_to_mdc AS dm
    ON CAST(dc.drg_code AS INTEGER) = dm.ms_drg
LEFT JOIN mdc_dictionary AS dd
    ON dm.mdc = dd.mdc_number
-- Keep only rows whose MDC passed the count threshold. Rows with no MDC match
-- are dropped here as well, because a NULL mdc never compares equal.
INNER JOIN mdc_counts AS kc
    ON dm.mdc = kc.mdc
JOIN last_measurement_times AS lt
    ON gm.subject_id = lt.subject_id
   AND gm.hadm_id = lt.hadm_id
ORDER BY gm.subject_id, gm.hadm_id, gm.measurement_time

In [28]:
%%sql

SELECT *
FROM read_parquet('~/Downloads/observable-latency.parquet')
-- Peek at ten rows of the downloaded example file to see its columns.
LIMIT 10;

Unnamed: 0,count,route,time,latency
0,1,/user,2024-01-26 00:05:00,0.5
1,2,/document/{id}/file,2024-01-26 00:10:00,0.5
2,2,/user,2024-01-26 00:20:00,0.5
3,1,/user,2024-01-26 00:30:00,0.5
4,2,/user,2024-01-26 00:35:00,0.5
5,2,/document/{id}/fork,2024-01-26 00:45:00,0.5
6,2,/user/likes,2024-01-26 00:50:00,0.5
7,3,/user,2024-01-26 01:00:00,0.5
8,2,/user,2024-01-26 01:25:00,0.5
9,3,/user,2024-01-26 01:30:00,0.5


In [39]:
%%sql

WITH glucose_measurements AS (
    -- Glucose lab events that carry a non-null numeric value, with chart time.
    SELECT
        le.subject_id,
        le.hadm_id,
        le.valuenum  AS glucose_value,
        le.charttime AS measurement_time
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/labevents.parquet') AS le
    JOIN read_parquet('~/data/physionet.org/processed/mimiciv/hosp/d_labitems.parquet') AS li
        ON le.itemid = li.itemid
    WHERE li.label = 'Glucose'
      AND le.valuenum IS NOT NULL
),
median_glucose AS (
    -- One row per (subject_id, hadm_id) holding the median glucose value and the
    -- chart time of the earliest glucose measurement of that admission.
    SELECT
        subject_id,
        hadm_id,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY glucose_value) AS median_glucose_value,
        MIN(measurement_time) AS first_measurement_time
    FROM glucose_measurements
    GROUP BY subject_id, hadm_id
),
drgcodes AS (
    -- DRG rows limited to drg_type HCFA.
    SELECT subject_id, hadm_id, drg_type, drg_code
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/drgcodes.parquet')
    WHERE drg_type = 'HCFA'
),
ms_drg_to_mdc AS (
    -- Lookup used to translate a DRG code into an MDC number.
    SELECT ms_drg, mdc
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/ms_drg_to_mdc.parquet')
),
mdc_dictionary AS (
    -- Lookup from MDC number to its description.
    SELECT mdc_number, mdc_description
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/mdc_dictionary.parquet')
),
mdc_data AS (
    -- Per-admission medians labelled with MDC number and description.
    SELECT
        mg.subject_id,
        mg.hadm_id,
        mg.median_glucose_value,
        mg.first_measurement_time,
        dm.mdc,
        dd.mdc_description
    FROM median_glucose AS mg
    JOIN drgcodes AS dc
        ON mg.subject_id = dc.subject_id
       AND mg.hadm_id = dc.hadm_id
    LEFT JOIN ms_drg_to_mdc AS dm
        ON CAST(dc.drg_code AS INTEGER) = dm.ms_drg
    LEFT JOIN mdc_dictionary AS dd
        ON dm.mdc = dd.mdc_number
)
-- Output uses the count / route / time column names of the example file shown
-- earlier in this notebook. Glucose is binned into steps of 5 and time is
-- truncated to the day of the first glucose measurement.
SELECT
    COUNT(*) AS count,
    mdc_description AS route,
    DATE_TRUNC('day', first_measurement_time) AS time,
    CAST(FLOOR(median_glucose_value / 5) * 5 AS INTEGER) AS glucose
FROM mdc_data
WHERE mdc IS NOT NULL
GROUP BY
    mdc_description,
    DATE_TRUNC('day', first_measurement_time),
    FLOOR(median_glucose_value / 5)
ORDER BY time DESC, glucose ASC

Unnamed: 0,count,route,time,glucose
0,1,"Endocrine, nutritional and metabolic diseases ...",2214-12-16,130
1,1,Diseases and disorders of the circulatory system,2214-10-01,115
2,1,"Endocrine, nutritional and metabolic diseases ...",2214-09-18,135
3,1,Diseases and disorders of the respiratory system,2214-08-15,125
4,1,Diseases and disorders of the kidney and urina...,2214-08-01,95
...,...,...,...,...
317389,1,"Myeloproliferative diseases and disorders, poo...",2110-01-12,165
317390,1,Diseases and disorders of the circulatory system,2110-01-11,105
317391,1,Diseases and disorders of the circulatory system,2110-01-11,125
317392,1,"Infectious and parasitic diseases, systemic or...",2109-12-14,95


In [32]:
%%sql

SELECT
    -- The epoch is extracted in seconds, so multiply by 1000 to get milliseconds.
    EXTRACT(EPOCH FROM MIN(time)) * 1000 AS min_time_ms,
    EXTRACT(EPOCH FROM MAX(time)) * 1000 AS max_time_ms
FROM read_parquet('~/data/physionet.org/figures/glucose_time.parquet')

Unnamed: 0,min_time_ms,max_time_ms
0,948844800000.0,993366700000.0


In [33]:
# Capture the previous cell's displayed result (IPython's `_`) as `df`.
# NOTE(review): this rebinds `df`, which earlier in the notebook held a different
# query result — confirm nothing later still needs the old frame.
df = _

In [38]:
# First two values of the first row of `df` as a pair
# (expected to be min_time_ms and max_time_ms from the query above)
tuple(df.to_numpy()[0, :2])

(948844800000.0, 993366720000.0)

# Try relative time again

In [40]:
%%sql

WITH glucose_measurements AS (
    -- Glucose lab events that carry a non-null numeric value, with chart time.
    SELECT
        le.subject_id,
        le.hadm_id,
        le.valuenum  AS glucose_value,
        le.charttime AS measurement_time
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/labevents.parquet') AS le
    JOIN read_parquet('~/data/physionet.org/processed/mimiciv/hosp/d_labitems.parquet') AS li
        ON le.itemid = li.itemid
    WHERE li.label = 'Glucose'
      AND le.valuenum IS NOT NULL
),
drgcodes AS (
    -- DRG rows limited to drg_type HCFA.
    SELECT subject_id, hadm_id, drg_type, drg_code
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/drgcodes.parquet')
    WHERE drg_type = 'HCFA'
),
ms_drg_to_mdc AS (
    -- Lookup used to translate a DRG code into an MDC number.
    SELECT ms_drg, mdc
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/ms_drg_to_mdc.parquet')
),
mdc_dictionary AS (
    -- Lookup from MDC number to its description.
    SELECT mdc_number, mdc_description
    FROM read_parquet('~/data/physionet.org/processed/mimiciv/hosp/mdc_dictionary.parquet')
),
mdc_data AS (
    -- Every glucose measurement labelled with MDC number and description.
    SELECT
        gm.subject_id,
        gm.hadm_id,
        gm.glucose_value,
        gm.measurement_time,
        dm.mdc,
        dd.mdc_description
    FROM glucose_measurements AS gm
    JOIN drgcodes AS dc
        ON gm.subject_id = dc.subject_id
       AND gm.hadm_id = dc.hadm_id
    LEFT JOIN ms_drg_to_mdc AS dm
        ON CAST(dc.drg_code AS INTEGER) = dm.ms_drg
    LEFT JOIN mdc_dictionary AS dd
        ON dm.mdc = dd.mdc_number
)
-- Output uses the count / route / time column names of the example file shown
-- earlier in this notebook. Glucose is binned into steps of 5 and time is the
-- chart time truncated to the hour.
SELECT
    COUNT(*) AS count,
    mdc_description AS route,
    DATE_TRUNC('hour', measurement_time) AS time,
    CAST(FLOOR(glucose_value / 5) * 5 AS INTEGER) AS glucose
FROM mdc_data
WHERE mdc IS NOT NULL
GROUP BY
    mdc_description,
    DATE_TRUNC('hour', measurement_time),
    FLOOR(glucose_value / 5)
ORDER BY time DESC, glucose ASC

Unnamed: 0,count,route,time,glucose
0,1,"Endocrine, nutritional and metabolic diseases ...",2214-12-24 07:00:00,115
1,1,"Endocrine, nutritional and metabolic diseases ...",2214-12-23 06:00:00,100
2,1,"Endocrine, nutritional and metabolic diseases ...",2214-12-22 06:00:00,130
3,1,"Endocrine, nutritional and metabolic diseases ...",2214-12-21 07:00:00,145
4,1,"Endocrine, nutritional and metabolic diseases ...",2214-12-20 08:00:00,160
...,...,...,...,...
2304789,1,Diseases and disorders of the digestive system,2105-10-05 14:00:00,245
2304790,1,Diseases and disorders of the digestive system,2105-10-05 12:00:00,235
2304791,1,Diseases and disorders of the digestive system,2105-10-05 11:00:00,240
2304792,1,Diseases and disorders of the digestive system,2105-10-05 07:00:00,215
