In [None]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import requests

from fiber import OCCURRENCE_INDEX
from fiber.utils import Timer
from fiber.cohort import Cohort
from fiber.condition import MRNs, Diagnosis
from fiber.extensions import BINARY_PIVOT_CONFIG
from fiber.storage import yaml as fiber_yaml

In [None]:
def slack_notification(i, timeout=10):
    """Post a short progress message to a Slack incoming webhook.

    The message text is ``'Done <i> occurrences'``.

    Args:
        i: Value interpolated into the message (the benchmark passes the
            current occurrence limit, and the string ``'done'`` at the end).
        timeout: Seconds to wait for Slack before giving up. Without a
            timeout, ``requests`` may block forever on a stalled connection
            and freeze the benchmark.

    Raises:
        ValueError: If Slack answers with a status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (e.g. connection error or timeout).
    """
    # Allow the real webhook to be supplied via the environment so that the
    # secret URL does not have to be written into the notebook; the original
    # placeholder remains the fallback.
    webhook_url = os.environ.get(
        'SLACK_WEBHOOK_URL', 'https://hooks.slack.com/services/xxxx/yyyy'
    )
    slack_data = {'text': f'Done {i} occurrences'}

    response = requests.post(
        webhook_url,
        data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

# Global Feature Extraction

This notebook benchmarks the feature extraction for unsupervised machine learning.
In this example, we create a cohort of patients who are diagnosed with complicated hypertension.

The benchmark is run for up to 50,000 condition occurrences and reports the total runtime, the in-memory size of the final dataframe, and the number of columns in the result for each run.

In [None]:
# Look up the 'hypertension complicated' Diagnosis condition through fiber's
# yaml storage helper, requesting only the ICD-10 coding scheme.
hypertension_cond = fiber_yaml.get_condition(
    Diagnosis, 'hypertension complicated', coding_schemes=['ICD-10'],
)

# The cohort built on that condition is the source of all benchmark occurrences.
hypertension_cohort = Cohort(hypertension_cond)

In [None]:
# Fetch the occurrences of the cohort's own condition; the benchmark below
# draws its inputs from this frame.
occurrences = hypertension_cohort.get_occurrences(
    hypertension_cohort.condition
)

In [None]:
# Order the occurrences by the occurrence index columns so that the
# `occurrences[:limit]` slices taken by the benchmark are deterministic.
# Rebinding (instead of `inplace=True`) leaves the object returned by
# `get_occurrences` unmodified and keeps this cell free of hidden mutation.
occurrences = occurrences.sort_values(OCCURRENCE_INDEX)

## Benchmark

In [None]:
# Measurements collected per benchmark run:
#   total_times: limit -> elapsed time reported by the Timer
#   shapes:      shape tuple of each resulting dataframe
#   size:        [limit, summed memory usage] pairs
total_times, shapes, size = {}, [], []

# Number of leading occurrences used for each successive run.
occurrence_limits = (
    100, 500, 1000, 5000,
    10_000, 15_000, 20_000, 25_000, 30_000, 50_000,
)

for limit in occurrence_limits:
    # Restrict the cohort to the first `limit` occurrences.
    mrn_cond = MRNs(mrns=occurrences[:limit])
    mrn_cohort = Cohort(mrn_cond)

    # Only the feature extraction itself is inside the timed section.
    with Timer('Total time: ') as t:
        results = mrn_cohort.get_pivoted_features(
            pivot_config=BINARY_PIVOT_CONFIG,
            window=[-50, 50],
        )

    shapes.append(results.shape)
    total_times[limit] = t.elapsed

    # Sum of the per-column memory usage, index included, with deep=True.
    memory_total = results.memory_usage(index=True, deep=True).sum()
    size.append([limit, memory_total])

    # Report progress after every completed run ...
    slack_notification(limit)

# ... and once more when the whole benchmark has finished.
slack_notification('done')

### Result Persisting

In [None]:
# Turn the raw benchmark measurements into one two-column dataframe per metric.
runtimes = pd.DataFrame(
    data=[(n_occurrences, seconds) for n_occurrences, seconds in total_times.items()],
    columns=['# Occurrences', 'Runtime in s'],
)
# Each recorded shape is a (rows, columns) pair, so the row count of the
# result frame is what ends up under '# Occurrences' here.
features = pd.DataFrame(
    data=shapes,
    columns=['# Occurrences', '# Features'],
)
sizes = pd.DataFrame(
    data=size,
    columns=['# Occurrences', 'Memory Consumption'],
)

In [None]:
# Write every result table to its CSV file, omitting the dataframe index.
for _frame, _csv_path in (
    (runtimes, '../results/unsupervised/runtimes.csv'),
    (features, '../results/unsupervised/features.csv'),
    (sizes, '../results/unsupervised/sizes.csv'),
):
    _frame.to_csv(_csv_path, index=False)

### Visualization

In [None]:
# Line chart: number of feature columns against the number of occurrences.
features.plot(x='# Occurrences', y='# Features', kind='line')

In [None]:
# Line chart: measured runtime against the number of occurrences.
runtimes.plot(x='# Occurrences', y='Runtime in s', kind='line')

In [None]:
# Line chart: summed memory usage of the result frame against the number of occurrences.
sizes.plot(x='# Occurrences', y='Memory Consumption', kind='line')