In [None]:
import pandas as pd
from fiber.utils import Timer
from fiber.database.hana import engine as hana_engine
from fiber.database.mysql import engine as mysql_engine
from fiber.condition import LabValue

In [None]:
import json
import requests

def slack_notification(text, webhook_url=None, timeout=10):
    """Post ``text`` as a message to a Slack incoming webhook.

    Args:
        text: Message body; sent as the ``text`` field of the JSON payload.
        webhook_url: Webhook to post to. When omitted, the
            ``SLACK_WEBHOOK_URL`` environment variable is used, falling back
            to the placeholder URL that used to be hard-coded here.
        timeout: Seconds to wait for Slack before giving up. Without a
            timeout ``requests.post`` can block forever, which would stall a
            long benchmark run on an unresponsive endpoint.

    Raises:
        ValueError: If Slack answers with any status code other than 200.
    """
    import os  # local import: keeps this cell independent of the import cell

    if not webhook_url:
        # Prefer configuration from the environment so the real webhook
        # (which is a secret) does not have to live in the notebook.
        webhook_url = os.environ.get(
            'SLACK_WEBHOOK_URL',
            'https://hooks.slack.com/services/xxxxxxxxxxxxxxxxxxxxx',
        )
    slack_data = {'text': text}

    response = requests.post(
        webhook_url, data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

In [None]:
def plot_line(df, x_col):
    """Draw the ``'Runtime in s'`` column of ``df`` as a line over ``x_col``.

    Returns whatever ``df.plot.line`` returns, so that a notebook cell ending
    in this call displays the chart.
    """
    draw_line = df.plot.line
    return draw_line(y='Runtime in s', x=x_col)

## `Procedure('35.%', 'ICD-9') | Procedure('36.1%', 'ICD-9')`

In [None]:
def build_procedure_query(limit):
    """Build the HANA and MySQL variants of the procedure cohort query.

    Both statements select the distinct medical record numbers of facts with
    ``AGE_IN_DAYS >= 6570`` whose ICD-9 procedure code matches ``35.%`` or
    ``36.1%``, capped at ``limit`` rows.

    Args:
        limit: Value appended after ``LIMIT``; rendered with ``str()``.

    Returns:
        A ``(hana_query, mysql_query)`` tuple of SQL strings.
    """
    # The HANA text keeps a single '%' in its LIKE patterns while the MySQL
    # text doubles it ('%%').
    # NOTE(review): presumably the MySQL driver treats '%' as a parameter
    # marker and needs it escaped - confirm before "normalising" either one.
    hana_sql = f"""
        SELECT DISTINCT "MSDW_2018"."D_PERSON"."MEDICAL_RECORD_NUMBER"
        FROM "MSDW_2018"."FACT"
            JOIN "MSDW_2018"."D_PERSON" ON "MSDW_2018"."FACT"."PERSON_KEY" = "MSDW_2018"."D_PERSON"."PERSON_KEY"
            JOIN "MSDW_2018"."B_PROCEDURE" ON "MSDW_2018"."FACT"."PROCEDURE_GROUP_KEY" = "MSDW_2018"."B_PROCEDURE"."PROCEDURE_GROUP_KEY"
            JOIN "MSDW_2018"."FD_PROCEDURE" ON "MSDW_2018"."FD_PROCEDURE"."PROCEDURE_KEY" = "MSDW_2018"."B_PROCEDURE"."PROCEDURE_KEY"
        WHERE "MSDW_2018"."FACT"."AGE_IN_DAYS" >= 6570
          AND "MSDW_2018"."FD_PROCEDURE"."CONTEXT_NAME" LIKE 'ICD-9'
          AND upper("MSDW_2018"."FD_PROCEDURE"."CONTEXT_PROCEDURE_CODE") LIKE '35.%'
          OR "MSDW_2018"."FACT"."AGE_IN_DAYS" >= 6570
          AND "MSDW_2018"."FD_PROCEDURE"."CONTEXT_NAME" LIKE 'ICD-9'
          AND upper("MSDW_2018"."FD_PROCEDURE"."CONTEXT_PROCEDURE_CODE") LIKE '36.1%'
        LIMIT 
    {limit!s}"""

    mysql_sql = f"""
        SELECT DISTINCT `D_PERSON`.`MEDICAL_RECORD_NUMBER`
        FROM `FACT`
            INNER JOIN `D_PERSON` ON `FACT`.`PERSON_KEY` = `D_PERSON`.`PERSON_KEY`
            INNER JOIN `B_PROCEDURE` ON `FACT`.`PROCEDURE_GROUP_KEY` = `B_PROCEDURE`.`PROCEDURE_GROUP_KEY`
            INNER JOIN `FD_PROCEDURE` ON `FD_PROCEDURE`.`PROCEDURE_KEY` = `B_PROCEDURE`.`PROCEDURE_KEY`
        WHERE `FACT`.`AGE_IN_DAYS` >= 6570
          AND `FD_PROCEDURE`.`CONTEXT_NAME` LIKE 'ICD-9'
          AND upper(`FD_PROCEDURE`.`CONTEXT_PROCEDURE_CODE`) LIKE '35.%%'
          OR `FACT`.`AGE_IN_DAYS` >= 6570
          AND `FD_PROCEDURE`.`CONTEXT_NAME` LIKE 'ICD-9'
          AND upper(`FD_PROCEDURE`.`CONTEXT_PROCEDURE_CODE`) LIKE '36.1%%'
        LIMIT 
    {limit!s}"""
    return hana_sql, mysql_sql

In [None]:
def execute_procedure_benchmark(limits, procedure_query_func):
    """Time the procedure query on HANA and on MySQL for every limit.

    For each entry of ``limits`` the pair of queries returned by
    ``procedure_query_func(limit)`` is executed - index 0 against
    ``hana_engine``, index 1 against ``mysql_engine`` - and the elapsed time
    reported by ``Timer`` is recorded. A Slack message is posted after each
    limit has been measured on both databases.

    Args:
        limits: Iterable of limits to benchmark.
        procedure_query_func: Callable mapping a limit to a
            ``(hana_query, mysql_query)`` pair.

    Returns:
        ``(hana_results, mysql_results)``: two DataFrames with the columns
        ``'# Patients'`` (the limit as a string) and ``'Runtime in s'``.
    """
    # The runtime label has to be spelled exactly 'Runtime in s': plot_line()
    # selects that column by name, and the previous lowercase 'runtime in s'
    # made plotting the procedure results fail with a KeyError.
    columns = ['# Patients', 'Runtime in s']

    hana_benchmark_results = []
    mysql_benchmark_results = []
    for limit in limits:
        queries = procedure_query_func(limit)
        # Only the read_sql call is inside the Timer; the fetched rows are
        # discarded because just the runtime is of interest.
        with Timer() as t:
            pd.read_sql(queries[0], hana_engine)
        hana_benchmark_results.append((str(limit), t.elapsed))
        with Timer() as t:
            pd.read_sql(queries[1], mysql_engine)
        mysql_benchmark_results.append((str(limit), t.elapsed))

        slack_notification(f'Sup! Done {limit!s} medical record numbers')
    return (
        pd.DataFrame(hana_benchmark_results, columns=columns),
        pd.DataFrame(mysql_benchmark_results, columns=columns),
    )

In [None]:
# Row limits to benchmark; each value ends up after LIMIT in the procedure query.
limits = [10, 100, 500, 1000, 5000, 10000, 15000]
# Runs every limit against both databases and posts a Slack message per limit.
hana_procedure_results, mysql_procedure_results = execute_procedure_benchmark(limits, build_procedure_query)

# Persist the timings as CSV, one file per database.
# NOTE(review): absolute, user-specific output paths - consider a configurable base directory.
hana_procedure_results.to_csv('/home/martet02/benchmarks/hana/procedure.csv')
mysql_procedure_results.to_csv('/home/martet02/benchmarks/mysql/procedure.csv')

In [None]:
# HANA: procedure-query runtime plotted over the requested patient limit.
plot_line(hana_procedure_results, '# Patients')

In [None]:
# MySQL: procedure-query runtime plotted over the requested patient limit.
plot_line(mysql_procedure_results, '# Patients')

## `LabValue('%Glucose%').patients_per('TEST_NAME')`

In [None]:
def build_lab_value_query(limit):
    """Build the HANA and MySQL variants of the glucose lab-value query.

    Both statements count the distinct medical record numbers per
    ``TEST_NAME`` among ``EPIC_LAB`` rows whose upper-cased test name contains
    ``GLUCOSE`` and whose ``ID`` is below ``limit``, ordered by that count in
    descending order.

    Args:
        limit: Exclusive upper bound for ``EPIC_LAB.ID``; rendered with
            ``str()``.

    Returns:
        A ``(hana_query, mysql_query)`` tuple of SQL strings.
    """
    # The HANA text keeps single '%' wildcards while the MySQL text doubles
    # them ('%%').
    # NOTE(review): presumably the MySQL driver needs '%' escaped - confirm.
    hana_sql = f"""
        SELECT DISTINCT TEST_NAME,
            count(DISTINCT "EPIC_LAB"."MEDICAL_RECORD_NUMBER") AS patients
        FROM "MSDW_2018"."EPIC_LAB"
        WHERE upper("EPIC_LAB"."TEST_NAME") LIKE '%GLUCOSE%'
            AND "EPIC_LAB"."ID" < {limit!s}
        GROUP BY TEST_NAME
        ORDER BY patients DESC
    """

    mysql_sql = f"""
        SELECT DISTINCT TEST_NAME,
            count(DISTINCT `EPIC_LAB`.`MEDICAL_RECORD_NUMBER`) AS patients
        FROM `EPIC_LAB`
        WHERE upper(`EPIC_LAB`.`TEST_NAME`) LIKE '%%GLUCOSE%%' 
            AND `EPIC_LAB`.`ID` < {limit!s}
        GROUP BY `TEST_NAME`
        ORDER BY patients DESC
    """
    return hana_sql, mysql_sql

In [None]:
def execute_labvalue_benchmark(limits, labvalue_query_func):
    """Time the lab-value query on HANA and on MySQL for every limit.

    For each entry of ``limits`` the pair of queries returned by
    ``labvalue_query_func(limit)`` is executed - index 0 against
    ``hana_engine``, index 1 against ``mysql_engine`` - and the elapsed time
    reported by ``Timer`` is recorded. A Slack message is posted once both
    databases have been measured for a limit.

    Args:
        limits: Iterable of limits to benchmark.
        labvalue_query_func: Callable mapping a limit to a
            ``(hana_query, mysql_query)`` pair.

    Returns:
        ``(hana_results, mysql_results)``: two DataFrames with the columns
        ``'# LabTests'`` (the limit as a string) and ``'Runtime in s'``.
    """
    def timed_read(sql, engine):
        # Only the read_sql call sits inside the Timer; the rows are dropped.
        with Timer() as stopwatch:
            pd.read_sql(sql, engine)
        return stopwatch.elapsed

    header = ['# LabTests', 'Runtime in s']
    hana_rows, mysql_rows = [], []
    for cutoff in limits:
        sql_pair = labvalue_query_func(cutoff)
        label = str(cutoff)
        hana_rows.append((label, timed_read(sql_pair[0], hana_engine)))
        mysql_rows.append((label, timed_read(sql_pair[1], mysql_engine)))
        slack_notification(f'Sup! Done {label} lab values')

    hana_frame = pd.DataFrame(hana_rows, columns=header)
    mysql_frame = pd.DataFrame(mysql_rows, columns=header)
    return hana_frame, mysql_frame

In [None]:
%%capture cap
# Rebinds `limits`: here each value is the exclusive upper bound on EPIC_LAB.ID
# that build_lab_value_query puts into the WHERE clause.
limits = [10, 100, 1000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000]
# Runs every bound against both databases and posts a Slack message per bound.
hana_labvalue_results, mysql_labvalue_results = execute_labvalue_benchmark(limits, build_lab_value_query)

# Persist the timings as CSV, one file per database.
# NOTE(review): '/path/to/benchmarks' looks like a placeholder - point it at a real directory before running.
hana_labvalue_results.to_csv('/path/to/benchmarks/hana/labvalue.csv')
mysql_labvalue_results.to_csv('/path/to/benchmarks/mysql/labvalue.csv')

In [None]:
# HANA: lab-value query runtime plotted over the '# LabTests' column.
plot_line(hana_labvalue_results, '# LabTests')

In [None]:
# MySQL: lab-value query runtime plotted over the '# LabTests' column.
plot_line(mysql_labvalue_results, '# LabTests')