In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution, so library edits take effect without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

from fiber import Cohort
from fiber.condition import (
    Procedure, 
    Diagnosis,
    VitalSign, 
    LabValue, 
    Encounter
)
from fiber.utils import Timer
from fiber.storage.yaml import get_condition

# Cohort Definition

We are considering patients who had valve surgery (ICD-9: `35.*`) or bypass surgery (ICD-9: `36.1*`) and who are at least 18 years old.

A limitation of this definition is that, unlike other studies, we cannot restrict the cohort to ICU patients. Our best approximation is to require the encounter type `Inpatient`, which we will check manually.

In [None]:
# Cohort entry condition: any ICD-9 procedure code matching 35.% or 36.1%.
# Each branch is restricted with .age(min_age=18) and the two are OR-combined.
heart_surgery_condition = (
    Procedure(code='35.%', context='ICD-9').age(min_age=18) | 
    Procedure(code='36.1%', context='ICD-9').age(min_age=18)
)

In [None]:
# Build the cohort from the heart surgery condition and report how many
# MRNs it returns.
heart_surgery_cohort = Cohort(heart_surgery_condition)
patient_count = len(heart_surgery_cohort.mrns())
print(f'{patient_count} patients in the cohort')

# Demographics
FIBER exposes some demographics information on the cohort, such as gender and age distribution.

In [None]:
# Read the cohort's demographics once; it is indexed by "age" and "gender"
# in the cells that follow.
demographics = heart_surgery_cohort.demographics

In [None]:
# Display the age figure stored in the demographics result.
demographics["age"]["figure"]

In [None]:
# Display the gender figure stored in the demographics result.
demographics["gender"]["figure"]

# Onsets
We are interested in different outcomes for the patients in the cohort for different time intervals, e.g. the (re-)hospitalization in the second week after the heart surgery.
- Mortality 0, 7, 14 and 28 days 
    - This cannot be done on the interval level, because of missing data. The only approximation we can offer is the 
    `deceased_indicator` in the patient information.
- Acute Kidney Injury,  ICD-9 code `584.*` or AKI phenotype
- Stroke (Cerebrovascular event), 0, 7, 14 and 28 days
	- Occlusion and stenosis of precerebral arteries, ICD-9 code `433.*`
	- Occlusion of cerebral arteries, ICD-9 code `434.*`
	- Acute but ill-defined cerebrovascular disease, ICD-9 code `436.*`
- (Re-)Hospitalization in the {first, second, third, fourth} week after the heart surgery
- Emergency Encounter in the {first, second, third, fourth} week after the heart surgery

### TODO: 
- We could also use the more complex AKI phenotype as defined by AKIN and KDIGO.

In [None]:
# Onset of acute kidney injury, identified by any ICD-9 diagnosis code
# matching 584.%. No time_windows are passed, so has_onset's default applies
# (TODO confirm what that default is).
aki = heart_surgery_cohort.has_onset(
    name="aki",
    condition=Diagnosis(code="584.%", context="ICD-9"),    
)

In [None]:
# Stroke is approximated by three ICD-9 diagnosis code families:
#   433.% - occlusion and stenosis of precerebral arteries
#   434.% - occlusion of cerebral arteries
#   436.% - acute but ill-defined cerebrovascular disease
stroke_cond = (
    Diagnosis(code='433.%', context="ICD-9")
    | Diagnosis(code='434.%', context="ICD-9")
    | Diagnosis(code='436.%', context="ICD-9")
)
stroke = heart_surgery_cohort.has_onset(
    name="stroke",
    condition=stroke_cond,
)

In [None]:
# (Re-)hospitalization outcome: an encounter of category 'Inpatient' in each
# of four consecutive windows after the surgery.
hospitalization_cond = Encounter(category='Inpatient')
hospitalization = heart_surgery_cohort.has_onset(
    name="hospitalization",
    condition=hospitalization_cond,
    # Presumably (start_day, end_day) offsets relative to the surgery,
    # i.e. weeks 1-4 — confirm against has_onset.
    time_windows=[(1,7),(8,14),(15,21),(22,28)]
)

In [None]:
# Emergency outcome: an encounter of category 'Emergency' in each of four
# consecutive windows after the surgery.
emergency_cond = Encounter(category='Emergency')
emergency = heart_surgery_cohort.has_onset(
    name="emergency",
    condition=emergency_cond,
    # Presumably (start_day, end_day) offsets relative to the surgery,
    # i.e. weeks 1-4 — confirm against has_onset.
    time_windows=[(1,7),(8,14),(15,21),(22,28)]
)

# Preconditions

We determine which patients are affected by any of a list of common comorbidities or other factors that influence heart surgery outcomes.

We implicitly consider the entire medical history of a patient as known to the Mount Sinai Health System.

In [None]:
# Collects the has_precondition result for each comorbidity, keyed by the
# condition's label.
preconditions = {}

In [None]:
# Names of the comorbidity definitions to look up via get_condition.
# Each entry must match a stored Diagnosis condition name.
diagnoses = [
    "congestive heart failure",
    "fluid and electrolyte disorders",
    "liver disease",
    "rheumatoid arthritis/collagen vascular diseases",
    # NOTE(review): AIDS/HIV is disabled; the reason is not recorded — confirm.
   # "AIDS/HIV",
    "alcohol abuse",
    "blood loss anemia",
    "cardiac arrhythmia",
    "chronic pulmonary disease",
    "coagulopathy",
    "deficiency anemia",
    "depression",
    "diabetes complicated",
    "diabetes uncomplicated",
    "drug abuse",
    "hypertension complicated",
    "hypertension uncomplicated",
    "lymphoma",
    "metastatic cancer",
    "obesity",
    "other neurological disorders",
    "paralysis",
    "peptic ulcer disease excluding bleeding",
    "peripheral vascular disorders",
    "psychoses",
    "pulmonary circulation disorders",
    "renal failure",
    "solid tumor without metastasis",
    "valvular disease",
    "weight loss"
]

In [None]:
# Compute a precondition result for every comorbidity listed in `diagnoses`
# and store it in `preconditions` under the condition's label.
for cond in diagnoses:
    with Timer() as t:
        # Resolve the stored definition for this diagnosis name, considering
        # both ICD-10 and ICD-9 code contexts.
        condition = get_condition(Diagnosis, cond, ['ICD-10', 'ICD-9'])
        preconditions[condition._label] = heart_surgery_cohort.has_precondition(
            name=condition._label,
            condition=condition
        )
    # Printed after the timer context has exited, once per diagnosis.
    print(f'{condition._label} done in {t.elapsed} s')

# Lab Values

We are extracting different lab test results (values, abnormal and result flags) for blood samples. 

We aggregate them by day for the third, second, and last day before the procedure; the last window also includes the day of the procedure itself.

In [None]:
# Collects the raw values_for result of each lab test, keyed by display name.
lab_values = {}

In [None]:
# Maps a display name to the LabValue condition used to fetch that test.
# The string arguments are presumably the test names as they appear in the
# source data — verify against the lab catalogue.
lv_cond = {
    "Blood Urea Nitrogen": LabValue("UREA NITROGEN-BLD"),
    "Blood Creatinine": LabValue("CREATININE-SERUM"),
    "Anion Gap": LabValue("ANION GAP"),
    "Bilirubin": LabValue("BILIRUBIN TOTAL"),
    "Albumin": LabValue("ALBUMIN, BLD"),
    "Chloride": LabValue("CHLORIDE-BLD"),
    "Glucose": LabValue("GLUCOSE"),
    "Hematocrit": LabValue("HEMATOCRIT"),
    "Hemoglobin": LabValue("HEMOGLOBIN"),
    "Platelet Count": LabValue("PLATELET"),
    "Potassium": LabValue("POTASSIUMBLD"),
    "Sodium": LabValue("SODIUM-BLD"),
    # Two test names are OR-combined for the white blood cell count.
    "White Blood Cell Count": LabValue("WHITE BLOOD CELL") | LabValue("WBC"),
    "INR": LabValue("INR"),
    "PTT": LabValue("APTT"),
    "PT": LabValue("PRO TIME"),
    "Lactate": LabValue("WB LACTATE-ART (POCT)"),
}

In [None]:
# Fetch the values of every configured lab test, limited to those recorded
# before the heart surgery condition, and report how long each query took.
for name in lv_cond:
    cond = lv_cond[name]
    with Timer() as t:
        lab_values[name] = heart_surgery_cohort.values_for(
            cond,
            before=heart_surgery_condition,
        )
    print(f'Fetching {name} done in {t.elapsed} s')

In [None]:
# Aggregate each lab test's raw values into per-window features, using the
# condition's own default aggregation functions.
pivoted_values = {}
for name, lab_df in lab_values.items():
    with Timer() as t:
        pivoted_values[name] = heart_surgery_cohort.aggregate_values_in(
            # Presumably day offsets relative to the procedure: day -3,
            # day -2, and day -1 through day 0 — confirm.
            time_windows=((-3,-3), (-2,-2), (-1,0)),
            df=lab_df, 
            aggregation_functions=lv_cond[name].default_aggregations,
            name=name
        )
    print(f'{name} done in {t.elapsed} s')

# VitalSign

To showcase FIBER's data extraction capabilities, we also extract oral temperature measurements for the patients in our cohort and aggregate them in the same way as the lab values.

In [None]:
# Fetch oral temperature readings taken before the heart surgery and keep
# only the rows whose unit_of_measure equals 'oC'.
temp_cond = VitalSign("temperature, oral")
all_temp_readings = heart_surgery_cohort.values_for(
    temp_cond,
    before=heart_surgery_condition,
)
unit_is_celsius = all_temp_readings["unit_of_measure"] == 'oC'
df_temp = all_temp_readings[unit_is_celsius]

In [None]:
# Aggregate the oral temperature readings into the same windows as the lab
# values, using the vital sign's default aggregation functions.
pivoted_temp = heart_surgery_cohort.aggregate_values_in(
    # Presumably day offsets relative to the procedure: day -3, day -2, and
    # day -1 through day 0 — confirm.
    time_windows=((-3,-3), (-2,-2), (-1,0)),
    df=df_temp, 
    aggregation_functions=temp_cond.default_aggregations,
    # Explicit label: a leftover loop variable from an earlier cell would
    # tag these values with the name of an unrelated lab test.
    name="Oral Temperature"
)

# Merging all information

We now merge all previous intermediate results into one large dataframe that is ready for machine learning.

In [None]:
# Combine outcomes, preconditions, aggregated lab values and aggregated
# temperature into a single dataframe, and time the merge.
with Timer() as t:
    df = heart_surgery_cohort.merge_patient_data(
        hospitalization,
        emergency,
        aki,
        stroke,
        *preconditions.values(),
        *pivoted_values.values(),
        pivoted_temp
    )
# The " s" unit matches the other timing messages in this notebook.
print(f'Merging all values done in {t.elapsed} s')

In [None]:
# Preview the first rows of the merged dataframe.
df.head()

In [None]:
# Print the (rows, columns) dimensions of the merged dataframe.
print(df.shape)

In [None]:
# Report, for every column, the fraction of rows holding a non-null value.
for column in df.columns:
    values = df[column]
    missing = values.isnull().sum()
    print(f'{column}: {1 - missing / len(values):.2f}')

In [None]:
# Print the dataframe summary (column dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Persist the merged dataframe as CSV. With pandas' default arguments the
# index is written as the first column.
# NOTE(review): /tmp is typically not persistent across reboots — confirm
# this location is intended.
df.to_csv("/tmp/heart-surgery.csv")