# Example with ESGPT Synthetic Data

### Set-up

Imports

In [1]:
import os
import json
from pathlib import Path

import pandas as pd
import polars as pl
from bigtree import print_tree
from EventStream.data.dataset_polars import Dataset

from esgpt_task_querying import main, config, event_predicates, query

Paths (task configuration file and sample dataset directory)

In [2]:
# Relative locations of the inputs used throughout this notebook:
# the sample ESGPT dataset directory and the task definition (YAML) file.
data_path = "sample_data/esgpt_sample"
config_path = "sample_configs/inhospital_mortality.yaml"

Configuration File

In [3]:
# Load the task configuration, then pretty-print it as indented JSON so the
# predicate and window definitions can be inspected.
cfg = config.load_config(config_path)
cfg_json = json.dumps(cfg, indent=4)
print(cfg_json)

{
    "predicates": {
        "admission": {
            "column": "event_type",
            "value": "ADMISSION",
            "system": "boolean"
        },
        "discharge": {
            "column": "event_type",
            "value": "DISCHARGE",
            "system": "boolean"
        },
        "death": {
            "column": "event_type",
            "value": "DEATH",
            "system": "boolean"
        },
        "discharge_or_death": {
            "type": "ANY",
            "predicates": [
                "discharge",
                "death"
            ],
            "system": "boolean"
        },
        "any": {
            "type": "special"
        }
    },
    "windows": {
        "trigger": {
            "start": "admission",
            "end": "admission",
            "st_inclusive": false,
            "end_inclusive": true
        },
        "gap": {
            "start": "trigger.end",
            "duration": "48 hours",
            "excludes": [
                {

Task Tree

In [4]:
# Build the task tree from the loaded configuration, then render its structure
# as text with bigtree's print_tree.
tree = config.build_tree_from_config(cfg)
print_tree(tree)

trigger
├── gap
│   └── target
└── input


Data

In [5]:
# Load the ESGPT dataset stored under data_path and keep references to its
# events frame and its dynamic-measurements frame.
ESD = Dataset.load(Path(data_path))
events_df = ESD.events_df
dynamic_measurements_df = ESD.dynamic_measurements_df

# One display call renders both frames, in the order given.
display(events_df, dynamic_measurements_df)

Updating config.save_dir from /home/justinxu/esgpt/EventStreamGPT/sample_data/processed/sample to sample_data/esgpt_sample
Loading events from sample_data/esgpt_sample/events_df.parquet...
Loading dynamic_measurements from sample_data/esgpt_sample/dynamic_measurements_df.parquet...


event_id,subject_id,timestamp,event_type,age,age_is_inlier
u32,u8,datetime[μs],cat,f64,bool
0,0,2010-06-24 13:23:00,"""ADMISSION&VITA…",-0.463849,true
1,0,2010-06-24 14:23:00,"""VITAL&LAB""",-0.463823,true
2,0,2010-06-24 15:23:00,"""VITAL&LAB""",-0.463796,true
3,0,2010-06-24 16:23:00,"""VITAL&LAB""",-0.46377,true
4,0,2010-06-24 17:23:00,"""VITAL&LAB""",-0.463744,true
…,…,…,…,…,…
30938,99,2010-11-20 08:20:06,"""VITAL&LAB""",-1.007141,true
30939,99,2010-11-20 09:20:06,"""VITAL&LAB""",-1.007115,true
30940,99,2010-11-20 10:20:06,"""VITAL&LAB""",-1.007088,true
30941,99,2010-11-20 11:20:06,"""VITAL&LAB""",-1.007062,true


measurement_id,department,HR,temp,lab_name,lab_value,event_id,HR_is_inlier,temp_is_inlier,lab_name_is_inlier
u32,cat,f64,f64,cat,f64,u32,bool,bool,bool
0,"""CARDIAC""",,,,,26188,,,
1,"""ORTHOPEDIC""",,,,,29488,,,
2,"""PULMONARY""",,,,,24213,,,
3,"""ORTHOPEDIC""",,,,,28929,,,
4,"""PULMONARY""",,,,,3754,,,
…,…,…,…,…,…,…,…,…,…
92933,,,,"""SpO2""",-0.195462,12946,,,true
92934,,,,"""SpO2""",-0.563824,7077,,,true
92935,,,,"""SpO2""",0.817535,22925,,,true
92936,,,,"""GCS__EQ_9""",,23454,,,


Predicate Columns

In [6]:
# Derive the predicate columns defined in the config from the two ESGPT frames
# (events first, then dynamic measurements) and show the result.
df_predicates = event_predicates.generate_predicate_columns(
    cfg,
    [events_df, dynamic_measurements_df],
)
display(df_predicates)

[32m2024-04-20 07:10:05.263[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_simple_predicates[0m:[36m104[0m - [34m[1mAdded predicate column 'is_admission'.[0m
[32m2024-04-20 07:10:05.265[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_simple_predicates[0m:[36m104[0m - [34m[1mAdded predicate column 'is_discharge'.[0m
[32m2024-04-20 07:10:05.268[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_simple_predicates[0m:[36m104[0m - [34m[1mAdded predicate column 'is_death'.[0m
[32m2024-04-20 07:10:05.280[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_predicate_columns[0m:[36m263[0m - [34m[1mAdded predicate column 'is_discharge_or_death'.[0m
[32m2024-04-20 07:10:05.282[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_predicate_columns[0m:[36m268[0m - [34m[

subject_id,timestamp,is_admission,is_discharge,is_death,is_discharge_or_death,is_any
u8,datetime[μs],i32,i32,i32,i32,i32
0,2010-06-24 13:23:00,1,0,0,0,1
0,2010-06-24 14:23:00,0,0,0,0,1
0,2010-06-24 15:23:00,0,0,0,0,1
0,2010-06-24 16:23:00,0,0,0,0,1
0,2010-06-24 17:23:00,0,0,0,0,1
…,…,…,…,…,…,…
99,2010-11-20 08:20:06,0,0,0,0,1
99,2010-11-20 09:20:06,0,0,0,0,1
99,2010-11-20 10:20:06,0,0,0,0,1
99,2010-11-20 11:20:06,0,0,0,0,1


### End-to-End Query

In [7]:
# Run the whole task query in a single call. query_task receives the two paths
# rather than the objects built in the earlier cells, so it loads the config and
# the dataset itself (its log output reports both loading steps).
df_result = main.query_task(config_path, data_path)
display(df_result)

[32m2024-04-20 07:10:05.295[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.main[0m:[36mquery_task[0m:[36m30[0m - [34m[1mLoading config...[0m
[32m2024-04-20 07:10:05.307[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.main[0m:[36mquery_task[0m:[36m36[0m - [34m[1mData path provided, loading using ESGPT...[0m
[32m2024-04-20 07:10:05.328[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.main[0m:[36mquery_task[0m:[36m48[0m - [34m[1mGenerating predicate columns...[0m
[32m2024-04-20 07:10:05.332[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_simple_predicates[0m:[36m104[0m - [34m[1mAdded predicate column 'is_admission'.[0m
[32m2024-04-20 07:10:05.336[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.event_predicates[0m:[36mgenerate_simple_predicates[0m:[36m104[0m - [34m[1mAdded predicate column 'is_discharge'.[0m
[32m2024-04-20 07:10:05.339[0m | [34m[1mDEBUG   [0m | [36mesgpt_t

Updating config.save_dir from /home/justinxu/esgpt/EventStreamGPT/sample_data/processed/sample to sample_data/esgpt_sample
Loading events from sample_data/esgpt_sample/events_df.parquet...
Loading dynamic_measurements from sample_data/esgpt_sample/dynamic_measurements_df.parquet...
trigger
┣━━ gap
┃   ┗━━ target
┗━━ input


[32m2024-04-20 07:10:05.406[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.query[0m:[36mquery_subtree[0m:[36m246[0m - [34m[1mQuerying subtree rooted at 'target'...[0m
[32m2024-04-20 07:10:05.470[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.query[0m:[36mcheck_constraints[0m:[36m227[0m - [34m[1m11 subjects (12 rows) were excluded due to constraint: [(col("is_discharge_or_death")) <= (1)].[0m
[32m2024-04-20 07:10:05.486[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.query[0m:[36mquery_subtree[0m:[36m246[0m - [34m[1mQuerying subtree rooted at 'input'...[0m
[32m2024-04-20 07:10:05.515[0m | [34m[1mDEBUG   [0m | [36mesgpt_task_querying.main[0m:[36mquery_task[0m:[36m132[0m - [34m[1mDone.[0m


subject_id,trigger/timestamp,gap/timestamp,target/timestamp,input/timestamp,gap/window_summary,target/window_summary,input/window_summary,label
u8,datetime[μs],datetime[μs],datetime[μs],datetime[μs],struct[5],struct[5],struct[5],i32
0,2010-10-04 17:23:00,2010-10-06 17:23:00,2010-10-16 00:23:00,2010-10-06 17:23:00,"{0,0,0,0,48}","{1,1,0,1,224}","{0,0,0,0,48}",0
1,2010-02-12 20:16:13,2010-02-14 20:16:13,2010-02-15 17:16:13,2010-02-14 20:16:13,"{0,0,0,0,47}","{1,1,0,1,22}","{0,0,0,0,47}",0
2,2010-01-18 23:07:07,2010-01-20 23:07:07,2010-01-30 19:07:07,2010-01-20 23:07:07,"{0,0,0,0,40}","{1,1,0,1,211}","{0,0,0,0,40}",0
3,2010-02-19 03:48:21,2010-02-21 03:48:21,2010-02-28 18:48:21,2010-02-21 03:48:21,"{0,0,0,0,46}","{1,1,0,1,175}","{0,0,0,0,46}",0
3,2010-08-14 02:48:21,2010-08-16 02:48:21,2010-08-25 00:48:21,2010-08-16 02:48:21,"{0,0,0,0,44}","{1,1,0,1,202}","{0,0,0,0,44}",0
…,…,…,…,…,…,…,…,…
98,2010-06-28 22:25:52,2010-06-30 22:25:52,2010-07-12 13:25:52,2010-06-30 22:25:52,"{0,0,0,0,42}","{1,1,0,1,243}","{0,0,0,0,42}",0
98,2010-08-28 00:25:52,2010-08-30 00:25:52,2010-09-01 19:25:52,2010-08-30 00:25:52,"{0,0,0,0,41}","{1,1,0,1,59}","{0,0,0,0,41}",0
99,2010-04-15 18:20:06,2010-04-17 18:20:06,2010-04-23 19:20:06,2010-04-17 18:20:06,"{0,0,0,0,45}","{1,1,0,1,132}","{0,0,0,0,45}",0
99,2010-10-12 22:20:06,2010-10-14 22:20:06,2010-10-21 03:20:06,2010-10-14 22:20:06,"{0,0,0,0,44}","{1,1,0,1,131}","{0,0,0,0,44}",0
