In [1]:
import os
import pandas as pd
import polars as pl
# Upper bound applied to every table-display limit below, so that wide/long frames
# are shown instead of being truncated by the default settings.
DISPLAY_LIMIT = 100

pd.set_option('display.max_rows', DISPLAY_LIMIT)  # pandas: rows per rendered frame
pl.Config.set_tbl_cols(DISPLAY_LIMIT)             # polars: columns per rendered table
pl.Config.set_tbl_rows(DISPLAY_LIMIT)             # polars: rows per rendered table

%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

import polars as pl

from EventStream.data.dataset_polars import Dataset

# Dataset directory, relative to the notebook's working directory.
DATA_DIR = Path("../MIMIC_ESD_new_schema_08-31-23-1")
ESD = Dataset.load(DATA_DIR)

# True for rows that have at least one non-null column; used to discard rows
# that are null in every column.
has_any_value = ~pl.all_horizontal(pl.all().is_null())

events_df = ESD.events_df.filter(has_any_value)
dynamic_measurements_df = ESD.dynamic_measurements_df.filter(has_any_value)

# Attach measurements to their events. The left join keeps events that have no
# matching measurement row.
events_with_measurements = events_df.join(
    dynamic_measurements_df,
    on="event_id",
    how="left",
)

# The join key is dropped once the join is done; rows are then ordered per
# subject, by time, then by event type.
df_data = (
    events_with_measurements
    .drop(['event_id'])
    .sort(by=['subject_id', 'timestamp', 'event_type'])
)


# df_data = pl.read_csv("sample_data.csv")

  from .autonotebook import tqdm as notebook_tqdm


Updating config.save_dir from /n/data1/hms/dbmi/zaklab/RAMMS/data/MIMIC_IV/ESD_new_schema_08-31-23-1 to ../MIMIC_ESD_new_schema_08-31-23-1
Loading events from ../MIMIC_ESD_new_schema_08-31-23-1/events_df.parquet...
Loading dynamic_measurements from ../MIMIC_ESD_new_schema_08-31-23-1/dynamic_measurements_df.parquet...


### End-to-End

In [3]:
from task_querying_v2 import *

# Base name of the task config to run; resolved to a YAML file under test_configs/.
name = 'test_08'

config_path = f"test_configs/{name}.yaml"

# Run the query end to end against the joined event/measurement frame (df_data)
# and keep the result for the cells below.
res = query_task(config_path, df_data)
# Bare last expression so the notebook displays the result.
res

Loading config...

Generating predicate columns...

Added predicate column is_admission.
Added predicate column is_discharge.
Added predicate column is_death.
Added predicate column is_lab.
Added predicate column is_abnormal_lab_num.
Added predicate column is_heart_rate.
Added predicate column is_discharge_or_death.
Added predicate column is_abnormal_heart_rate.

Building tree...
trigger
┗━━ gap
    ┗━━ target


12127 subjects (14623763 rows) were excluded due to trigger condition: {'predicate': 'admission', 'min': 1, 'max': 1}.


Querying...


Querying subtree rooted at gap...
342 subjects (631 rows) were excluded due to constraint: [(col("is_admission")) <= (0)].
7143 subjects (17534 rows) were excluded due to constraint: [(col("is_discharge")) <= (0)].


Querying subtree rooted at target...
12115 subjects (66701 rows) were excluded due to constraint: [(col("is_abnormal_heart_rate")) >= (1)].


Done.



subject_id,trigger/timestamp,gap/timestamp,target/timestamp,gap/window_summary,target/window_summary,label
u16,datetime[μs],datetime[μs],datetime[μs],struct[9],struct[9],i32
1033,2177-05-15 21:06:00,2177-05-17 21:06:00,2177-06-06 12:35:00,"{0,0,0,0,0,0,1543,30,159}","{1,1,0,1,1,1,13348,371,1310}",0
11456,2118-10-24 21:32:00,2118-10-26 21:32:00,2118-11-05 16:37:00,"{0,0,0,0,0,0,1307,31,166}","{1,1,0,1,1,1,5399,124,402}",0


In [None]:
# Expand the 'target/window_summary' struct column into one column per field
# so the individual summary values can be inspected.
res.select(['target/window_summary']).unnest('target/window_summary')

is_admission,is_discharge,is_death,is_abnormal_lab_num,is_discharge_or_death,is_abnormal_heart_rate,is_lab,is_heart_rate,is_any
i32,i32,i32,i32,i32,i32,i32,i32,i32
1,1,0,1,1,1,13348,371,1310
1,1,0,1,1,1,5399,124,402


# Unit Testing

In [None]:
from simple_test_runner import *

In [None]:
# Input fixture: a single subject with three events one hour apart
# (admission, one lab, discharge). The predicate columns are filled in by hand.
df_test_1_input = pl.DataFrame(
    dict(
        subject_id=[1] * 3,
        timestamp=["12/1/1900 12:00", "12/1/1900 13:00", "12/1/1900 14:00"],
        event_type=["ADMISSION", "LAB", "DISCHARGE"],
        dx=[""] * 3,
        lab_test=["", "SpO2", ""],
        lab_value=["", "99", ""],
        is_admission=[1, 0, 0],
        is_lab=[0, 1, 0],
        is_discharge=[0, 0, 1],
        is_any=[1] * 3,
    )
)

# Every column whose name carries the "is_" prefix is treated as a predicate column.
df_test_1_predicate_cols = [
    column for column in df_test_1_input.columns if column.startswith("is_")
]

# Endpoint expression for a window that ends at the discharge event.
# NOTE(review): the tuple layout is presumably
# (start inclusive, end predicate, end inclusive, offset) - confirm against task_querying_v2.
df_test_1_endpoint_expr = (False, "is_discharge", True, None)

# Anchor fixture: one row at the admission timestamp with all predicate counts zeroed.
test_1_anchor_to_subtree_root_by_subtree_anchor = pl.DataFrame(
    dict(
        subject_id=[1],
        timestamp=["12/1/1900 12:00"],
        is_admission=[0],
        is_lab=[0],
        is_discharge=[0],
        is_any=[0],
    )
)

# Expected output of summarize_event_bound_window for the fixtures above.
df_test_1_result = pl.DataFrame(
    dict(
        subject_id=[1],
        timestamp=["12/1/1900 14:00"],
        timestamp_at_anchor=["12/1/1900 12:00"],
        is_admission=[0],
        is_lab=[1],
        is_discharge=[1],
        is_any=[2],
        is_admission_summary=[0],
        is_lab_summary=[0],
        is_discharge_summary=[0],
        is_any_summary=[0],
    )
)

In [None]:
# Positional arguments for summarize_event_bound_window, in call order:
# input frame, predicate column names, endpoint expression, anchor frame.
test_1_args = (
    df_test_1_input,
    df_test_1_predicate_cols,
    df_test_1_endpoint_expr,
    test_1_anchor_to_subtree_root_by_subtree_anchor,
)

# A single test case: the message to report, the arguments, and the expected frame.
test_1 = [
    dict(
        message="Testing summarize_event_bound_window, should be equal...",
        args=test_1_args,
        want=df_test_1_result,
    )
]

# NOTE(review): simple_test_runner presumably calls the function with each case's
# "args" and compares the result with "want" - confirm in simple_test_runner.
simple_test_runner(test_1, summarize_event_bound_window)

Testing summarize_event_bound_window, should be equal... Passed!


# Integration Testing

### Config Loading

In [None]:
from task_querying_v2 import *

In [None]:
# The CSV stores timestamps as text in month/day/year hour:minute form;
# this expression parses them into polars datetimes.
timestamp_as_datetime = (
    pl.col('timestamp')
    .str.strptime(pl.Datetime, format='%m/%d/%Y %H:%M')
    .cast(pl.Datetime)
)

# Read the sample events and replace the text timestamp column in one step,
# so `data` is never bound to the unparsed frame.
data = pl.read_csv('sample_data.csv').with_columns(timestamp_as_datetime)

In [None]:
# Load the sample task configuration; it is reused by the predicate-generation
# and tree-building cells below.
cfg = load_config('sample_config.yaml')
# Bare last expression so the notebook displays the parsed config.
cfg

{'predicates': {'admission': {'column': 'event_type', 'value': 'ADMISSION'},
  'discharge': {'column': 'event_type', 'value': 'DISCHARGE'},
  'death': {'column': 'event_type', 'value': 'DEATH'},
  'covid': {'column': 'dx', 'value': 'COVID'},
  'discharge_or_death': {'type': 'ANY', 'predicates': ['discharge', 'death']}},
 'windows': {'trigger': {'start': 'admission',
   'duration': None,
   'offset': None,
   'end': 'admission',
   'excludes': None,
   'includes': [{'predicate': 'admission', 'min': 1, 'max': 1}],
   'st_inclusive': False,
   'end_inclusive': True},
  'gap': {'start': 'trigger.end',
   'duration': '48 hours',
   'offset': None,
   'end': 'gap.start + gap.duration',
   'excludes': [{'predicate': 'admission'}, {'predicate': 'discharge'}, {'predicate': 'death'}],
   'includes': None,
   'st_inclusive': False,
   'end_inclusive': True},
  'target': {'start': 'gap.end',
   'duration': None,
   'offset': None,
   'end': 'discharge_or_death',
   'excludes': None,
   'includes':

### Generate predicate columns

In [None]:
# Generate the predicate columns for the sample data from the loaded config.
# NOTE(review): the output saved with this cell is
# "AttributeError: 'DotAccessibleDict' object has no attribute 'system'", yet later
# cells show results computed from predicates_df - presumably from an earlier run.
# Re-run from a fresh kernel to confirm this cell still works.
predicates_df = generate_predicate_columns(cfg, data)
# Bare last expression so the notebook displays the resulting frame.
predicates_df

AttributeError: 'DotAccessibleDict' object has no attribute 'system'

### Build tree

In [None]:
# Build the window tree described by the config and print its structure.
tree = build_tree_from_config(cfg)
print_tree(tree, style="const_bold")
# Print every node in pre-order to inspect each node's constraints and endpoint expression.
for each_node in preorder_iter(tree):
    print(each_node)
# Display the config again as the cell's output.
cfg

trigger
┣━━ gap
┃   ┗━━ target
┗━━ input
Node(/trigger, constraints={'is_admission': (1, 1)}, endpoint_expr=(False, 'is_admission', True, datetime.timedelta(0)))
Node(/trigger/gap, constraints={'is_admission': (None, 0), 'is_discharge': (None, 0), 'is_death': (None, 0)}, endpoint_expr=(False, datetime.timedelta(days=2), True, datetime.timedelta(0)))
Node(/trigger/gap/target, constraints={'is_discharge_or_death': (1, 1)}, endpoint_expr=(True, 'is_discharge_or_death', True, datetime.timedelta(0)))
Node(/trigger/input, constraints={'is_covid': (None, 0), 'is_any': (5, None)}, endpoint_expr=(False, datetime.timedelta(days=30), True, datetime.timedelta(days=1)))


{'predicates': {'admission': {'column': 'event_type', 'value': 'ADMISSION'},
  'discharge': {'column': 'event_type', 'value': 'DISCHARGE'},
  'death': {'column': 'event_type', 'value': 'DEATH'},
  'covid': {'column': 'dx', 'value': 'COVID'},
  'discharge_or_death': {'type': 'ANY', 'predicates': ['discharge', 'death']}},
 'windows': {'trigger': {'start': 'admission',
   'duration': None,
   'offset': None,
   'end': 'admission',
   'excludes': None,
   'includes': [{'predicate': 'admission', 'min': 1, 'max': 1}],
   'st_inclusive': False,
   'end_inclusive': True},
  'gap': {'start': 'trigger.end',
   'duration': '48 hours',
   'offset': None,
   'end': 'gap.start + gap.duration',
   'excludes': [{'predicate': 'admission'}, {'predicate': 'discharge'}, {'predicate': 'death'}],
   'includes': None,
   'st_inclusive': False,
   'end_inclusive': True},
  'target': {'start': 'gap.end',
   'duration': None,
   'offset': None,
   'end': 'discharge_or_death',
   'excludes': None,
   'includes':

### Summarize window

In [None]:
import datetime

# Every column whose name carries the "is_" prefix is a predicate column.
predicate_cols = [col for col in predicates_df.columns if col.startswith("is_")]
print(predicate_cols)

# Anchor frame: one row per admission event (subject_id + timestamp), with every
# predicate column set to the literal 0.
#
# The previous version first selected the real predicate columns and then
# overwrote each of them with 0, and also passed 'subject_id'/'timestamp' to
# with_columns (a no-op). Here only the two key columns are selected and the
# zeroed predicate columns are appended in the same order, which yields the same
# frame. The filter uses an expression rather than a Series mask so the whole
# chain is a single polars query.
anchor_to_subtree_root_by_subtree_anchor = (
    predicates_df
    .filter(pl.col('is_admission') == 1)
    .select('subject_id', 'timestamp')
    .with_columns(*[pl.lit(0).alias(c) for c in predicate_cols])
)
print(anchor_to_subtree_root_by_subtree_anchor)

['is_admission', 'is_discharge', 'is_death', 'is_covid', 'is_discharge_or_death', 'is_any']
shape: (6, 8)
┌────────────┬─────────────┬─────────────┬─────────────┬──────────┬──────────┬────────────┬────────┐
│ subject_id ┆ timestamp   ┆ is_admissio ┆ is_discharg ┆ is_death ┆ is_covid ┆ is_dischar ┆ is_any │
│ ---        ┆ ---         ┆ n           ┆ e           ┆ ---      ┆ ---      ┆ ge_or_deat ┆ ---    │
│ i64        ┆ datetime[μs ┆ ---         ┆ ---         ┆ i32      ┆ i32      ┆ h          ┆ i32    │
│            ┆ ]           ┆ i32         ┆ i32         ┆          ┆          ┆ ---        ┆        │
│            ┆             ┆             ┆             ┆          ┆          ┆ i32        ┆        │
╞════════════╪═════════════╪═════════════╪═════════════╪══════════╪══════════╪════════════╪════════╡
│ 1          ┆ 1989-12-01  ┆ 0           ┆ 0           ┆ 0        ┆ 0        ┆ 0          ┆ 0      │
│            ┆ 12:03:00    ┆             ┆             ┆          ┆          ┆        

#### Temporally-bound

In [None]:
# Window that ends a fixed two days after each anchor.
# NOTE(review): the tuple layout is presumably
# (start inclusive, window length, end inclusive, offset) - confirm against task_querying_v2.
temporal_endpoint_expr = (False, timedelta(days=2), True, None)

summarize_temporal_window(
    predicates_df=predicates_df,
    predicate_cols=predicate_cols,
    endpoint_expr=temporal_endpoint_expr,
    anchor_to_subtree_root_by_subtree_anchor=anchor_to_subtree_root_by_subtree_anchor,
)

subject_id,timestamp,timestamp_at_anchor,is_admission,is_discharge,is_death,is_covid,is_discharge_or_death,is_any,is_admission_summary,is_discharge_summary,is_death_summary,is_covid_summary,is_discharge_or_death_summary,is_any_summary
i64,datetime[μs],datetime[μs],i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
1,1989-12-01 12:03:00,1989-12-01 12:03:00,0,1,0,0,1,7,0,0,0,0,0,0
1,1991-01-27 23:32:00,1991-01-27 23:32:00,0,0,0,0,0,3,0,0,0,0,0,0
1,1991-03-03 19:33:00,1991-03-03 19:33:00,0,0,1,0,1,2,0,0,0,0,0,0
2,1996-03-08 02:24:00,1996-03-08 02:24:00,0,1,0,1,1,4,0,0,0,0,0,0
2,1996-06-05 00:32:00,1996-06-05 00:32:00,0,0,0,0,0,0,0,0,0,0,0,0
3,1996-03-08 02:24:00,1996-03-08 02:24:00,0,0,1,0,1,6,0,0,0,0,0,0


#### Event-bound

In [None]:
# Window that ends at a discharge-or-death event rather than after a fixed duration.
# NOTE(review): the tuple layout is presumably
# (start inclusive, end predicate, end inclusive, offset) - confirm against task_querying_v2.
event_bound_endpoint_expr = (False, 'is_discharge_or_death', True, None)

summarize_event_bound_window(
    predicates_df=predicates_df,
    predicate_cols=predicate_cols,
    endpoint_expr=event_bound_endpoint_expr,
    anchor_to_subtree_root_by_subtree_anchor=anchor_to_subtree_root_by_subtree_anchor,
)

subject_id,timestamp,timestamp_at_anchor,is_admission,is_discharge,is_death,is_covid,is_discharge_or_death,is_any,is_admission_summary,is_discharge_summary,is_death_summary,is_covid_summary,is_discharge_or_death_summary,is_any_summary
i64,datetime[μs],datetime[μs],i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
1,1989-12-02 15:00:00,1989-12-01 12:03:00,0,1,0,0,1,7,0,0,0,0,0,0
1,1991-01-31 02:15:00,1991-01-27 23:32:00,0,1,0,0,1,9,0,0,0,0,0,0
1,1991-03-03 21:38:00,1991-03-03 19:33:00,0,0,1,0,1,2,0,0,0,0,0,0
2,1996-03-08 16:00:00,1996-03-08 02:24:00,0,1,0,1,1,4,0,0,0,0,0,0
2,1996-06-08 03:00:00,1996-06-05 00:32:00,0,0,1,0,1,5,0,0,0,0,0,0
3,1996-03-10 00:00:00,1996-03-08 02:24:00,0,0,1,0,1,6,0,0,0,0,0,0
