In [None]:
import json
import os
import warnings

import pandas as pd
import requests

from fiber.utils import Timer
from fiber.cohort import Cohort
from fiber.condition import Procedure, Diagnosis
from fiber.database import read_with_progress
from fiber.database.hana import engine as hana_engine
from fiber.database.mysql import engine as mysql_engine

In [None]:
def plot_line(df, x_col):
    """Draw a line chart of the ``'Runtime in s'`` column against ``x_col``.

    Args:
        df: Object exposing a pandas-style ``.plot.line`` accessor that has a
            ``'Runtime in s'`` column.
        x_col: Name of the column used for the x-axis.

    Returns:
        Whatever ``df.plot.line`` returns.
    """
    line_kwargs = {'x': x_col, 'y': 'Runtime in s'}
    return df.plot.line(**line_kwargs)

In [None]:
def slack_notification(text, webhook_url=None, timeout=10):
    """Post ``text`` as a message to a Slack incoming webhook.

    Args:
        text: Message body; sent as the ``text`` field of the JSON payload.
        webhook_url: Webhook to post to. When omitted, the
            ``SLACK_WEBHOOK_URL`` environment variable is used, falling back
            to the placeholder URL that used to be hard-coded here.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, a stalled connection would block the notebook
            indefinitely.

    Raises:
        ValueError: If the response status code is anything other than 200.

    Exceptions raised by ``requests.post`` itself (connection errors,
    timeouts) are not caught here and propagate to the caller.
    """
    if webhook_url is None:
        # Keep the real webhook out of the notebook: prefer the environment.
        webhook_url = os.environ.get(
            'SLACK_WEBHOOK_URL',
            'https://hooks.slack.com/services/xxxxxxxxxxxxxxxxxxx',
        )
    slack_data = {'text': text}

    response = requests.post(
        webhook_url, data=json.dumps(slack_data),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(
            'Request to slack returned an error %s, the response is:\n%s'
            % (response.status_code, response.text)
        )

In [None]:
# Cohort condition: ICD-9 procedure code '35.%' OR '36.1%'
# ('%' presumably acts as a wildcard in fiber conditions -- TODO confirm).
procedure_condition = Procedure('35.%', 'ICD-9') | Procedure('36.1%', 'ICD-9')
sample_cohort = Cohort(procedure_condition)

# MRNs of the cohort; the benchmark run further down samples its patients from these.
mrns_ = sample_cohort.mrns()

## `sample_cohort.values_for(Diagnosis('584.9', 'ICD-9'))`

In [None]:
def value_fetching_query(mrns, limit):
    """Build the HANA and MySQL queries that fetch diagnosis 584.9 (ICD-9).

    Both queries select the distinct medical record number, age in days,
    context name and context diagnosis code for the chosen patients. The HANA
    variant qualifies every table with the ``MSDW_2018`` schema; the MySQL
    variant uses unqualified, backtick-quoted names.

    Args:
        mrns: Iterable of medical record numbers. It is materialised with
            ``list(mrns)``, so for an unordered collection which MRNs land in
            the first ``limit`` positions depends on its iteration order.
        limit: Number of leading MRNs to include (applied as a slice, so
            fewer are used when ``mrns`` is shorter).

    Returns:
        tuple: ``(hana_query, mysql_query)`` as SQL strings.
    """
    selected_mrns = list(mrns)[0:limit]
    if selected_mrns:
        # Double embedded single quotes so an MRN can never terminate its SQL
        # string literal early; str() lets non-string MRNs through as well.
        mrn_query = '(' + ','.join(
            "'" + str(mrn).replace("'", "''") + "'" for mrn in selected_mrns
        ) + ')'
    else:
        # An empty selection used to yield the invalid fragment "IN )".
        # "IN (NULL)" is syntactically valid and matches no row.
        mrn_query = '(NULL)'

    hana_query = """
        SELECT DISTINCT D_PERSON.MEDICAL_RECORD_NUMBER, FACT.AGE_IN_DAYS, FD_DIAGNOSIS.CONTEXT_NAME, FD_DIAGNOSIS.CONTEXT_DIAGNOSIS_CODE 
        FROM "MSDW_2018"."FACT" 
            JOIN "MSDW_2018"."D_PERSON" ON "MSDW_2018"."FACT"."PERSON_KEY" = "MSDW_2018"."D_PERSON"."PERSON_KEY" 
            JOIN "MSDW_2018"."B_DIAGNOSIS" ON "MSDW_2018"."FACT"."DIAGNOSIS_GROUP_KEY" = "MSDW_2018"."B_DIAGNOSIS"."DIAGNOSIS_GROUP_KEY" 
            JOIN "MSDW_2018"."FD_DIAGNOSIS" ON "MSDW_2018"."FD_DIAGNOSIS"."DIAGNOSIS_KEY" = "MSDW_2018"."B_DIAGNOSIS"."DIAGNOSIS_KEY" 
        WHERE "MSDW_2018"."FD_DIAGNOSIS"."CONTEXT_NAME" LIKE 'ICD-9' 
            AND upper("MSDW_2018"."FD_DIAGNOSIS"."CONTEXT_DIAGNOSIS_CODE") LIKE '584.9' 
            AND "MSDW_2018"."D_PERSON"."MEDICAL_RECORD_NUMBER" IN 
        """ + mrn_query

    mysql_query = """
        SELECT DISTINCT `D_PERSON`.`MEDICAL_RECORD_NUMBER`, `FACT`.`AGE_IN_DAYS`, `FD_DIAGNOSIS`.`CONTEXT_NAME`, `FD_DIAGNOSIS`.`CONTEXT_DIAGNOSIS_CODE` 
        FROM `FACT` 
            INNER JOIN `D_PERSON` ON `FACT`.`PERSON_KEY` = `D_PERSON`.`PERSON_KEY` 
            INNER JOIN `B_DIAGNOSIS` ON `FACT`.`DIAGNOSIS_GROUP_KEY` = `B_DIAGNOSIS`.`DIAGNOSIS_GROUP_KEY` 
            INNER JOIN `FD_DIAGNOSIS` ON `FD_DIAGNOSIS`.`DIAGNOSIS_KEY` = `B_DIAGNOSIS`.`DIAGNOSIS_KEY`
        WHERE `FD_DIAGNOSIS`.`CONTEXT_NAME` LIKE 'ICD-9' 
            AND upper(`FD_DIAGNOSIS`.`CONTEXT_DIAGNOSIS_CODE`) LIKE '584.9' 
            AND `D_PERSON`.`MEDICAL_RECORD_NUMBER` IN 
        """ + mrn_query

    return hana_query, mysql_query

In [None]:
def execute_values_benchmark(mrns, limits, value_fetching_query_func):
    """Time the value-fetching query on HANA and MySQL for several sample sizes.

    For every entry in ``limits`` the query pair is built with
    ``value_fetching_query_func(mrns, limit)``; element 0 is run against
    ``hana_engine`` and element 1 against ``mysql_engine``, each inside its
    own ``Timer``. A Slack message is sent after each limit.

    Args:
        mrns: Medical record numbers handed through to
            ``value_fetching_query_func``.
        limits: Iterable of sample sizes to benchmark, in order.
        value_fetching_query_func: Callable ``(mrns, limit)`` returning a
            sequence whose first two items are the HANA and MySQL queries.

    Returns:
        tuple: Three DataFrames --
            HANA timings (``'# Patients'``, ``'Runtime in s'``),
            MySQL timings (``'# Patients'``, ``'Runtime in s'``) and
            result sizes (``'# Patients'``, ``'# Rows'``). ``'# Patients'``
            holds ``str(limit)``; ``'# Rows'`` is the length of the HANA
            result only.
    """
    hana_benchmark_results = []
    mysql_benchmark_results = []
    sizes = []
    for limit in limits:
        queries = value_fetching_query_func(mrns, limit)
        patients_label = str(limit)

        with Timer() as hana_timer:
            df = read_with_progress(queries[0], hana_engine, silent=True)
        sizes.append((patients_label, len(df)))
        hana_benchmark_results.append((patients_label, hana_timer.elapsed))

        # The MySQL result is only timed; its rows are not inspected.
        with Timer() as mysql_timer:
            read_with_progress(queries[1], mysql_engine, silent=True)
        mysql_benchmark_results.append((patients_label, mysql_timer.elapsed))

        # The notification is a convenience. A failing webhook must not abort
        # the run and throw away the timings gathered so far, so report the
        # problem as a warning and carry on.
        try:
            slack_notification(f'Sup! Done value fetching for {patients_label} MRNs')
        except Exception as error:
            warnings.warn(
                f'Slack notification for {patients_label} MRNs failed: {error}',
                RuntimeWarning,
            )

    return (
        pd.DataFrame(hana_benchmark_results, columns=['# Patients', 'Runtime in s']),
        pd.DataFrame(mysql_benchmark_results, columns=['# Patients', 'Runtime in s']),
        pd.DataFrame(sizes, columns=['# Patients', '# Rows'])
    )

In [None]:
# Sample sizes (number of MRNs) the benchmark is run for, in order.
limits = [10, 100, 500, 1000, 5000, 10000, 15000]

benchmark_frames = execute_values_benchmark(mrns_, limits, value_fetching_query)
hana_value_fetching_results, mysql_value_fetching_results, sizes = benchmark_frames

# Write each result frame to its CSV file.
for result_frame, csv_path in (
    (hana_value_fetching_results, '/path/to/benchmarks/hana/value_fetching.csv'),
    (mysql_value_fetching_results, '/path/to/benchmarks/mysql/value_fetching.csv'),
    (sizes, '/path/to/benchmarks/sizes_value_fetching.csv'),
):
    result_frame.to_csv(csv_path)

In [None]:
# Rows returned by the HANA query per sample size.
sizes.plot(kind='line', x='# Patients', y='# Rows')

In [None]:
# MySQL runtime per sample size, drawn with the notebook's plot_line helper.
plot_line(mysql_value_fetching_results, '# Patients')

In [None]:
# HANA runtime per sample size, drawn with the notebook's plot_line helper.
plot_line(hana_value_fetching_results, '# Patients')