In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from fiber import Cohort, INDEX
from fiber.utils import Timer
from fiber.condition import Procedure, MRNs, LabValue, VitalSign, Diagnosis, Drug
from fiber.extensions import BINARY_PIVOT_CONFIG

In [None]:
# Cohort definition: any procedure whose ICD-9 code matches '35.%' or '36.1%',
# each restricted with min_age=18.
# NOTE(review): '%' presumably acts as a wildcard in fiber code patterns - confirm.
hs_codes_35 = Procedure(code='35.%', context='ICD-9').age(min_age=18)
hs_codes_36_1 = Procedure(code='36.1%', context='ICD-9').age(min_age=18)

# Either sub-condition qualifies a patient for the cohort.
heart_surgery_condition = hs_codes_35 | hs_codes_36_1
hs_cohort = Cohort(heart_surgery_condition)

In [None]:
# Fetch the occurrences of the heart-surgery condition for the cohort.
# NOTE(review): presumably a DataFrame with one row per matching procedure
# occurrence (it is sorted and sliced as a frame below) - confirm against fiber.
df = hs_cohort.get_occurrences(heart_surgery_condition)

In [None]:
# Order the occurrences by fiber's INDEX key(s) so that the `df[:limit]`
# slices taken by the benchmark are deterministic.
# The sorted copy is rebound to `df` instead of sorting in place: the frame
# object returned by get_occurrences is left untouched, so nothing else that
# may hold a reference to it observes a mutation.
df = df.sort_values(INDEX)

# Unsupervised test

In [None]:
# Accumulator for the benchmark runs. The benchmark loop stores one entry per
# row limit: key is the limit as a string, value is
# [copy of the pivoted-features frame, elapsed time].
benchmark_results = dict()

In [None]:
%%capture cap --no-stderr
# Benchmark get_pivoted_features on growing prefixes of `df`.
# Everything printed to stdout by this cell is captured into `cap` (stderr is
# not captured because of --no-stderr) and replayed later with cap.show().
for limit in [100, 500, 1000, 5000, 10000, 15000, 20000, 25000]:
    # Banner line separating the captured output of the individual runs.
    print(f'################    {limit}    ################')
    # Build a cohort from the first `limit` rows of df.
    # NOTE(review): the rows are occurrences, so presumably the number of
    # distinct patients can be smaller than `limit` - confirm how MRNs
    # interprets a DataFrame argument.
    mrn_cond = MRNs(mrns=df[:limit])
    mrn_cohort = Cohort(mrn_cond)
    # Only the feature extraction itself is inside the timed region; cohort
    # construction above is excluded from the measurement.
    with Timer() as t:
        results = mrn_cohort.get_pivoted_features(window=[-10,10])
    # Keep a copy of the frame together with the elapsed time, keyed by the
    # limit converted to a string.
    benchmark_results[str(limit)] = [results.copy(), t.elapsed]

In [None]:
# Replay the stdout captured by the `%%capture cap` benchmark cell as this
# cell's output. The benchmark log is then stored with this cell and can be
# shown again via cap.show() without re-running the expensive benchmark.
cap.show()

In [None]:
# Derive the three plotted series from the benchmark runs. Every value in
# benchmark_results is a [feature frame, elapsed time] pair.

# (number of rows, elapsed time) per run
measurements = [
    (frame.shape[0], elapsed)
    for frame, elapsed in benchmark_results.values()
]

# (number of rows, number of columns) per run
shapes = [frame.shape for frame, _ in benchmark_results.values()]

# (number of rows, total memory of the frame including its index, measured
# with deep=True) per run
size = [
    (frame.shape[0], frame.memory_usage(index=True, deep=True).sum())
    for frame, _ in benchmark_results.values()
]

In [None]:
# Turn each list of per-run tuples into a two-column frame for plotting.
# All three frames share '# occurrences' as their first column; only the
# source list and the name of the value column differ.
runtimes, features, sizes = (
    pd.DataFrame(rows, columns=['# occurrences', value_column])
    for rows, value_column in (
        (measurements, 'runtime in s'),
        (shapes, '# features'),
        (size, 'memory consumption'),
    )
)

In [None]:
# Number of columns of the pivoted-features frame against its number of rows.
# A title and y-axis label make the figure readable on its own; the trailing
# ';' suppresses the text repr of the returned object.
features.plot.line(
    x='# occurrences',
    y='# features',
    title='Number of pivoted features by number of occurrences',
).set_ylabel('# features');

In [None]:
# Elapsed time of get_pivoted_features against the number of result rows.
# A title and y-axis label make the figure readable on its own; the trailing
# ';' suppresses the text repr of the returned object.
runtimes.plot.line(
    x='# occurrences',
    y='runtime in s',
    title='Runtime of get_pivoted_features by number of occurrences',
).set_ylabel('runtime in s');

In [None]:
# Memory footprint of the pivoted-features frame against its number of rows.
# The values come from DataFrame.memory_usage, which reports bytes, so the
# y-axis label states the unit; the trailing ';' suppresses the text repr of
# the returned object.
sizes.plot.line(
    x='# occurrences',
    y='memory consumption',
    title='Memory consumption of pivoted features by number of occurrences',
).set_ylabel('memory consumption (bytes)');