In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import traceback

import pandas as pd

from fiber import OCCURRENCE_INDEX
from fiber.utils import Timer
from fiber.cohort import Cohort
from fiber.condition import MRNs, Diagnosis
from fiber.extensions import BINARY_PIVOT_CONFIG
from fiber.storage import yaml as fiber_yaml

# Preparations

In [None]:
# Resolve the 'hypertension complicated' Diagnosis condition through fiber's
# YAML storage helper, restricted to the ICD-10 coding scheme, and wrap it in
# a Cohort that the benchmark draws its occurrences from.
hypertension_cond = fiber_yaml.get_condition(
    Diagnosis, 'hypertension complicated', coding_schemes=['ICD-10'])
hypertension_cohort = Cohort(hypertension_cond)

In [None]:
# Ask the cohort for the occurrences of its own defining condition.
occurrences = hypertension_cohort.get_occurrences(
    hypertension_cohort.condition,
)

In [None]:
# Order the occurrences by the OCCURRENCE_INDEX columns so that the
# `occurrences[:limit]` prefixes taken by the benchmark follow that order.
# Rebinding the name (instead of `inplace=True`) leaves the frame returned by
# `get_occurrences` untouched.
occurrences = occurrences.sort_values(OCCURRENCE_INDEX)

# Utils

In [None]:
import json
import requests

def slack_notification(i, webhook_url=None, timeout=10):
    """Post a short progress message to a Slack incoming webhook.

    Args:
        i: Value interpolated into the message text
            ``'Sup! Done <i> occurrences'``.
        webhook_url: Webhook to post to. When omitted, the
            ``SLACK_WEBHOOK_URL`` environment variable is used, falling back
            to the placeholder URL that used to be hard-coded here.
        timeout: Seconds to wait for Slack before giving up, so that an
            unreachable endpoint cannot block the notebook indefinitely.

    Raises:
        ValueError: If Slack answers with a status code other than 200.
    """
    if webhook_url is None:
        # Keep the real webhook out of the notebook: prefer the environment.
        webhook_url = os.environ.get(
            'SLACK_WEBHOOK_URL',
            'https://hooks.slack.com/services/xxxxxxxxxxxxxxxx',
        )
    slack_data = {'text': f'Sup! Done {i!s} occurrences'}

    response = requests.post(
        webhook_url,
        data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

# Benchmark

In [None]:
# Accumulators filled by the benchmark loop below, one entry per run.
# NOTE(review): benchmark_results is not referenced anywhere else in the
# visible notebook — confirm whether it is still needed.
benchmark_results = {}
# (str(limit), elapsed time) pairs.
measurements = []
# `.shape` tuples of the pivoted feature frames.
shapes = []
# (str(limit), total memory usage of the pivoted frame) pairs.
size = []

In [None]:
%%capture cap
for limit in [100, 500, 1000, 5000, 10000, 15000, 20000, 25000, 30000, 50000]:
    print(f'################    {limit}    ################')
    mrn_cond = MRNs(mrns=occurrences[:limit])
    mrn_cohort = Cohort(mrn_cond)
    print('Starting ...')
    with Timer('Total time: ') as t:
        try:
            results = mrn_cohort.get_pivoted_features(pivot_config=BINARY_PIVOT_CONFIG, window=[-50, 50])
        except Exception as e:
            print(e)
            traceback.print_exc()
    print('Done ...')
    print('Shape: ', results.shape)
    print('Time elapsed', t.elapsed)
    print('Size', results.memory_usage(index=True, deep=True).sum())
    shapes.append(results.shape)
    measurements.append((str(limit), t.elapsed))
    size.append((str(limit), results.memory_usage(index=True, deep=True).sum()))
slack_notification(limit)

In [None]:
# Replay the output captured by `%%capture cap` in the benchmark cell, so the
# benchmark log is shown as this cell's output and remains visible when the
# saved notebook is reopened later.
cap.show()

In [None]:
# Convert the lists collected by the benchmark loop into DataFrames for
# saving and plotting. `shapes` holds `.shape` tuples, so its two components
# become the '# occurrences' and '# features' columns.
runtimes = pd.DataFrame(
    data=measurements,
    columns=['# occurrences', 'runtime in s'],
)
features = pd.DataFrame(
    data=shapes,
    columns=['# occurrences', '# features'],
)
sizes = pd.DataFrame(
    data=size,
    columns=['# occurrences', 'memory consumption'],
)

In [None]:
import pickle

In [None]:
# Persist the three result frames in a single pickle. The `with` block makes
# sure the file is flushed and closed even if pickling raises.
with open("/path/to/unsupervised.pkl", 'wb') as pickle_file:
    pickle.dump(
        {
            "runtimes": runtimes,
            "features": features,
            "sizes": sizes,
        },
        pickle_file,
    )

In [None]:
# Line chart of the '# features' column against '# occurrences'.
features.plot(kind='line', x='# occurrences', y='# features')

In [None]:
# Line chart of the measured runtime against the benchmarked limit.
runtimes.plot(kind='line', x='# occurrences', y='runtime in s')

In [None]:
# Line chart of the result frame's memory usage against the benchmarked limit.
sizes.plot(kind='line', x='# occurrences', y='memory consumption')

In [None]:
# Final ping once the whole notebook has run. With the message template in
# `slack_notification` this posts "Sup! Done done occurrences".
slack_notification('done')