In [None]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import requests

In [None]:
from fiber.database.hana import engine as hana_engine
from fiber.database.mysql import engine as mysql_engine
from fiber.utils import Timer

In [None]:
def slack_notification(text, timeout=10):
    """Post ``text`` to a Slack incoming webhook.

    The webhook URL is read from the ``SLACK_WEBHOOK_URL`` environment
    variable when it is set; otherwise the placeholder URL below is used.

    Args:
        text: Message sent as the ``text`` field of the JSON payload.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without a timeout a stalled connection would block the caller
            indefinitely.

    Raises:
        ValueError: If the response status code is not 200.
    """
    webhook_url = os.environ.get(
        'SLACK_WEBHOOK_URL', 'https://hooks.slack.com/services/xxxx/yyyy'
    )
    slack_data = {'text': text}

    response = requests.post(
        webhook_url, data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

# `LabValue('%Glucose%').patients_per('TEST_NAME')`

This notebook executes the benchmark for the exploration of a condition.
In this example, laboratory tests whose name contains GLUCOSE are fetched together with the number of distinct patients for whom each test was performed.

The queries in `build_query` emulate FIBER's translation process.
However, they can additionally limit the number of included lab results (via an upper bound on the `ID` column), which should control the result size.

The benchmark is run for up to 100,000,000 lab results and reports the execution and fetching time of the queries on HANA and MySQL.

In [None]:
def build_query(limit):
    """Build the HANA and MySQL variants of the exploration query.

    Both statements list each ``TEST_NAME`` matching GLUCOSE together with
    its count of distinct ``MEDICAL_RECORD_NUMBER`` values, restricted to
    rows whose ``ID`` is below ``limit``.

    Args:
        limit: Exclusive upper bound for ``EPIC_LAB.ID``; rendered into the
            SQL text via ``str()``.

    Returns:
        Tuple ``(hana_query, mysql_query)`` of SQL strings.
    """
    # Both dialects embed the same textual bound.
    id_bound = str(limit)

    hana_sql = f"""
        SELECT DISTINCT TEST_NAME,
            count(DISTINCT "EPIC_LAB"."MEDICAL_RECORD_NUMBER") AS patients
        FROM "MSDW_2018"."EPIC_LAB"
        WHERE upper("EPIC_LAB"."TEST_NAME") LIKE '%GLUCOSE%'
            AND "EPIC_LAB"."ID" < {id_bound}
        GROUP BY TEST_NAME
        ORDER BY patients DESC
    """

    # NOTE(review): the doubled %% presumably escapes the percent sign for the
    # MySQL driver's parameter formatting - confirm before changing it.
    mysql_sql = f"""
        SELECT DISTINCT TEST_NAME,
            count(DISTINCT `EPIC_LAB`.`MEDICAL_RECORD_NUMBER`) AS patients
        FROM `EPIC_LAB`
        WHERE upper(`EPIC_LAB`.`TEST_NAME`) LIKE '%%GLUCOSE%%' 
            AND `EPIC_LAB`.`ID` < {id_bound}
        GROUP BY `TEST_NAME`
        ORDER BY patients DESC
    """
    return hana_sql, mysql_sql

In [None]:
def _time_query(query, engine):
    """Run ``query`` on ``engine`` with ``pd.read_sql`` and return the Timer's ``elapsed``."""
    with Timer() as timer:
        pd.read_sql(query, engine)
    return timer.elapsed


def execute_benchmark(limits, query_builder):
    """Time the generated queries on HANA and MySQL for every limit.

    For each entry of ``limits`` the builder is called once; its first query
    is run against ``hana_engine`` and its second against ``mysql_engine``.
    After each limit a Slack message is sent. A failing notification only
    emits a ``RuntimeWarning``, so the timings collected so far are not lost.

    Args:
        limits: Iterable of limit values handed to ``query_builder``.
        query_builder: Callable returning an indexable pair
            ``(hana_query, mysql_query)`` for a given limit.

    Returns:
        Tuple ``(hana_results, mysql_results)`` of DataFrames with the columns
        ``'# LabTests'`` and ``'Runtime in s'``. The runtime is whatever
        ``Timer().elapsed`` reports - presumably seconds; confirm against
        ``fiber.utils.Timer``.
    """
    columns = ['# LabTests', 'Runtime in s']
    hana_benchmark_results = []
    mysql_benchmark_results = []
    for limit in limits:
        queries = query_builder(limit)
        hana_benchmark_results.append([limit, _time_query(queries[0], hana_engine)])
        mysql_benchmark_results.append([limit, _time_query(queries[1], mysql_engine)])

        # Progress reporting is best-effort: it must not abort the benchmark.
        try:
            slack_notification(f'Sup! Done {limit!s} lab values')
        except Exception as error:
            warnings.warn(
                f'Slack notification for limit {limit!s} failed: {error}',
                RuntimeWarning,
            )

    return (
        pd.DataFrame(hana_benchmark_results, columns=columns),
        pd.DataFrame(mysql_benchmark_results, columns=columns),
    )

In [None]:
# Benchmark sizes grow by a factor of ten: 10, 100, ..., 100_000_000.
limits = [10 ** exponent for exponent in range(1, 9)]
hana_results, mysql_results = execute_benchmark(limits, build_query)

### Result Persisting

In [None]:
# to_csv does not create missing parent directories; create the target folder
# first so the benchmark results are not lost to a write error.
os.makedirs('../results/exploration', exist_ok=True)
hana_results.to_csv('../results/exploration/hana.csv', index=False)
mysql_results.to_csv('../results/exploration/mysql.csv', index=False)

In [None]:
# Read the persisted timings back from the CSV files (HANA first, then MySQL).
hana_results, mysql_results = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../results/exploration/hana.csv',
        '../results/exploration/mysql.csv',
    )
)

### Visualization

In [None]:
# Log-log plot of the HANA runtimes. logx/logy are documented as bool or 'sym',
# so pass True rather than the integer 1.
hana_results.plot.line(x='# LabTests', y='Runtime in s', logy=True, logx=True)

In [None]:
# Log-log plot of the MySQL runtimes. logx/logy are documented as bool or 'sym',
# so pass True rather than the integer 1.
mysql_results.plot.line(x='# LabTests', y='Runtime in s', logy=True, logx=True)

In [None]:
# Join both result sets on the benchmark size. The merge suffixes the clashing
# 'Runtime in s' columns with _x (left frame, MySQL) and _y (right frame, HANA).
results = pd.merge(mysql_results, hana_results, on='# LabTests').rename(
    columns={
        'Runtime in s_x': 'MySQL Runtime in s',
        'Runtime in s_y': 'IMDB Runtime in s',
    }
)

fig, ax = plt.subplots()

ax.plot(results['# LabTests'], results['MySQL Runtime in s'], '--', linewidth=2, markersize=12, label='MySQL Runtime in s')
ax.plot(results['# LabTests'], results['IMDB Runtime in s'], '-', linewidth=2, markersize=12, label='IMDB Runtime in s')
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_ylabel('Runtime in s')
ax.set_xlabel('# Lab Tests')
ax.set_xlim(left=10)  # start the x-axis at the smallest benchmarked limit
ax.legend()
ax.set_title('Exploratory Grouping of Lab Tests')

# savefig does not create missing parent directories, so create the folder first.
os.makedirs('../figures/exploration', exist_ok=True)
fig.savefig('../figures/exploration/runtime.png', dpi=600, bbox_inches="tight")