In [1]:
import pandas as pd
import time
import pyprevent
import jupyter_black
import numpy as np

jupyter_black.load(lab=True)

## Data Setup

In [2]:
test_patient = ("female", 40, 200, 50, 120, True, True, 25, 70, True, True)
test_list = [test_patient for _ in range(10_000_000)]
df = pd.DataFrame(
    test_list,
    columns=[
        "sex",
        "age",
        "total_cholesterol",
        "hdl_cholesterol",
        "systolic_bp",
        "has_diabetes",
        "current_smoker",
        "bmi",
        "egfr",
        "on_htn_meds",
        "on_cholesterol_meds",
    ],
)

In [None]:
df.iloc[0].to_dict()

## Individual patient calculations

Taking the first row in the form of a dict, we have a test patient.

The keys of this dictionary are the input arguments for all of the functions.

Here, we can pass the dictionary as kwargs to return the risk. Of note this float represents the risk as a percentage.

In [None]:
pyprevent.calculate_10_yr_ascvd_risk(**df.iloc[0].to_dict())

(You can also pass in the arguments like a normal python function)

In [None]:
pyprevent.calculate_30_yr_ascvd_risk(
    sex="MALE",
    age=40,
    total_cholesterol=200,
    hdl_cholesterol=50,
    systolic_bp=120,
    has_diabetes=False,
    current_smoker=False,
    bmi=25,
    egfr=70,
    on_htn_meds=False,
    on_cholesterol_meds=False,
)

All of the inputs have constains, and will return an error if the values are outside of the acceptable range.

These ranges are set by the AHA PREVENT formulas to constain any extrapolation at extreme values.

In [None]:
pyprevent.calculate_30_yr_ascvd_risk(
    
    sex="unknown",

    
    age=40,
    total_cholesterol=200,
    hdl_cholesterol=50,
    systolic_bp=120,
    has_diabetes=False,
    current_smoker=False,
    bmi=25,
    egfr=70,
    on_htn_meds=False,
    on_cholesterol_meds=False,
)

In [None]:
pyprevent.calculate_30_yr_ascvd_risk(
    sex="male",
    
    age=10,
    
    total_cholesterol=200,
    hdl_cholesterol=50,
    systolic_bp=120,
    has_diabetes=False,
    current_smoker=False,
    bmi=25,
    egfr=70,
    on_htn_meds=False,
    on_cholesterol_meds=False,
)

## Batch Calculations

There are also functions to batch apply this function.

In our test data set, we have a million patients.

In [None]:
print(len(df))

In [None]:
pyprevent.batch_calculate_10_yr_ascvd_risk(df)

In [None]:
start = time.perf_counter()
pyprevent.batch_calculate_30_yr_ascvd_risk(df)
end = time.perf_counter()
duration = end - start
print(f"Time to run 1 million rows: {duration} seconds")

## Data validation

Again, if a value is invalid, the batch methods will return a ValueError.

In [None]:
# Setting age to 100.0 near the end of the dataframe
df.loc[999_998, "age"] = 100.0
print(df.iloc[-2])
pyprevent.batch_calculate_30_yr_ascvd_risk(df)

In [None]:
%%timeit
test_list = [tup for tup in df.itertuples(index=False)]
pyprevent._pyprevent.calculate_10_yr_ascvd_rust_parallel(data=test_list)

In [None]:
start = time.perf_counter()
test_df = df.copy()
end = time.perf_counter()
duration = end - start
print(f"Time to run 1 million rows: {duration} seconds")

In [None]:
start = time.perf_counter()
test_df["sex"] = test_df["sex"].str.lower().map({"female": 0, "male": 1})
end = time.perf_counter()
duration = end - start
print(f"Time to run 1 million rows: {duration} seconds")

In [None]:
start = time.perf_counter()
test_df[
    ["has_diabetes", "current_smoker", "on_htn_meds", "on_cholesterol_meds"]
] = test_df[
    ["has_diabetes", "current_smoker", "on_htn_meds", "on_cholesterol_meds"]
].astype(
    int
)
end = time.perf_counter()
duration = end - start
print(f"Time to run 1 million rows: {duration} seconds")

In [None]:
start = time.perf_counter()
data = test_df.values.astype(np.float64)
end = time.perf_counter()
duration = end - start
print(f"Time to run 1 million rows: {duration} seconds")

In [None]:
start = time.perf_counter()
r = pyprevent._pyprevent.calculate_10_yr_ascvd_rust_parallel_np(data=data)
end = time.perf_counter()
duration = end - start
print(f"Time to run 1 million rows: {duration} seconds")

In [None]:
%%timeit
test_df = df.copy()
test_df["sex"] = test_df["sex"].str.lower().map({"female": 0, "male": 1})
test_df[
    ["has_diabetes", "current_smoker", "on_htn_meds", "on_cholesterol_meds"]
] = test_df[
    ["has_diabetes", "current_smoker", "on_htn_meds", "on_cholesterol_meds"]
].astype(
    int
)
data = test_df.values.astype(np.float64)
r = pyprevent._pyprevent.calculate_10_yr_ascvd_rust_parallel_np(data=data)

In [None]:
%%timeit
test_df = df.copy()
test_df["sex"] = test_df["sex"].str.lower().map({"female": 0, "male": 1})
data = test_df.values.astype(np.float64)
r = pyprevent._pyprevent.calculate_10_yr_ascvd_rust_parallel_np(data=data)

In [3]:
import math


def calculate_10_yr_ascvd_risk_py(
    sex,
    age,
    total_cholesterol,
    hdl_cholesterol,
    systolic_bp,
    has_diabetes,
    current_smoker,
    bmi,
    egfr,
    on_htn_meds,
    on_cholesterol_meds,
):
    # Input validation
    if not 30.0 <= age <= 79.0:
        return "Age must be between 30 and 79"
    if not 130.0 <= total_cholesterol <= 320.0:
        return "Total cholesterol must be between 130 and 320"
    if not 20.0 <= hdl_cholesterol <= 100.0:
        return "HDL cholesterol must be between 20 and 100"
    if not 90.0 <= systolic_bp <= 200.0:
        return "Systolic blood pressure must be between 90 and 200"
    if not 18.5 <= bmi <= 39.9:
        return "BMI must be between 18.5 and 39.9"
    if not 15.0 <= egfr <= 140.0:
        return "eGFR must be between 15 and 140"

    cholesterol_diff = total_cholesterol - hdl_cholesterol
    age_adjusted = (age - 55.0) / 10.0

    if sex.lower() == "female":
        diabetes_factor = 0.8348585 if has_diabetes else 0.0
        smoker_factor = 0.4831078 if current_smoker else 0.0
        htn_meds_factor = 0.2265309 if on_htn_meds else 0.0
        htn_cholesterol_treatment_factor = 0.0592374 if on_cholesterol_meds else 0.0
        systolic_bp_adjusted_max = (max(systolic_bp, 110.0) - 130.0) / 20.0
        cholesterol_diff_factor = 0.02586 * cholesterol_diff - 3.5

        diabetes_age_factor = 0.2417542 * age_adjusted if has_diabetes else 0.0
        smoker_age_factor = 0.0791142 * age_adjusted if current_smoker else 0.0

        calculation = (
            0.719883 * age_adjusted
            - 3.819975
            + 0.1176967 * cholesterol_diff_factor
            - 0.151185 * (0.02586 * hdl_cholesterol - 1.3) / 0.3
            - 0.0835358 * (min(systolic_bp, 110.0) - 110.0) / 20.0
            + 0.3592852 * systolic_bp_adjusted_max
            + diabetes_factor
            + smoker_factor
            + 0.4864619 * (min(egfr, 60.0) - 60.0) / -15.0
            + 0.0397779 * (max(egfr, 60.0) - 90.0) / -15.0
            + htn_meds_factor
            - htn_cholesterol_treatment_factor
            - (0.0395762 * systolic_bp_adjusted_max if on_htn_meds else 0.0)
            + (0.0844423 * cholesterol_diff_factor if on_cholesterol_meds else 0.0)
            - 0.0567839 * age_adjusted * cholesterol_diff_factor
            + 0.0325692 * age_adjusted * (0.02586 * hdl_cholesterol - 1.3) / 0.3
            - 0.1035985 * age_adjusted * systolic_bp_adjusted_max
            - diabetes_age_factor
            - smoker_age_factor
            - 0.1671492 * age_adjusted * (min(egfr, 60.0) - 60.0) / -15.0
        )

    elif sex.lower() == "male":
        diabetes_factor = 0.7189597 if has_diabetes else 0.0
        smoker_factor = 0.3956973 if current_smoker else 0.0
        htn_meds_factor = 0.2036522 if on_htn_meds else 0.0
        htn_cholesterol_treatment_factor = 0.0865581 if on_cholesterol_meds else 0.0
        systolic_bp_adjusted_max = (max(systolic_bp, 110.0) - 130.0) / 20.0
        cholesterol_diff_factor = 0.02586 * cholesterol_diff - 3.5

        diabetes_age_factor = 0.2018525 * age_adjusted if has_diabetes else 0.0
        smoker_age_factor = 0.0970527 * age_adjusted if current_smoker else 0.0

        calculation = (
            0.7099847 * age_adjusted
            - 3.500655
            + 0.1658663 * cholesterol_diff_factor
            - 0.1144285 * (0.02586 * hdl_cholesterol - 1.3) / 0.3
            - 0.2837212 * (min(systolic_bp, 110.0) - 110.0) / 20.0
            + 0.3239977 * systolic_bp_adjusted_max
            + diabetes_factor
            + smoker_factor
            + 0.3690075 * (min(egfr, 60.0) - 60.0) / -15.0
            + 0.0203619 * (max(egfr, 60.0) - 90.0) / -15.0
            + htn_meds_factor
            - htn_cholesterol_treatment_factor
            - (0.0322916 * systolic_bp_adjusted_max if on_htn_meds else 0.0)
            + (0.114563 * cholesterol_diff_factor if on_cholesterol_meds else 0.0)
            - 0.0300005 * age_adjusted * cholesterol_diff_factor
            + 0.0232747 * age_adjusted * (0.02586 * hdl_cholesterol - 1.3) / 0.3
            - 0.0927024 * age_adjusted * systolic_bp_adjusted_max
            - diabetes_age_factor
            - smoker_age_factor
            - 0.1217081 * age_adjusted * (min(egfr, 60.0) - 60.0) / -15.0
        )

    else:
        return "Sex must be either 'male' or 'female'."

    risk_score = math.exp(calculation) / (1.0 + math.exp(calculation)) * 100.0
    return risk_score

In [4]:
start = time.perf_counter()
for row in df.itertuples(index=False):
    calculate_10_yr_ascvd_risk_py(*row)
end = time.perf_counter()
py_duration = end - start
print(f"Time to run Pure Python Implementation: {py_duration} seconds")

Time to run Pure Python Implementation: 18.628560791985365 seconds


In [5]:
start = time.perf_counter()
pyprevent.batch_calculate_30_yr_ascvd_risk(df)
end = time.perf_counter()
current_duration = end - start
print(f"Time to run Current Implementation: {current_duration} seconds")

Time to run Current Implementation: 8.709082334011327 seconds


In [6]:
start = time.perf_counter()
test_df = df.copy()
test_df["sex"] = test_df["sex"].str.lower().map({"female": 0, "male": 1})
test_df[
    ["has_diabetes", "current_smoker", "on_htn_meds", "on_cholesterol_meds"]
] = test_df[
    ["has_diabetes", "current_smoker", "on_htn_meds", "on_cholesterol_meds"]
].astype(
    int
)
data = test_df.values.astype(np.float64)
r = pyprevent._pyprevent.calculate_10_yr_ascvd_rust_parallel_np(data=data)
end = time.perf_counter()
rust_parallel = end - start
print(f"Time to run Rust Parallel Implementation: {rust_parallel} seconds")

Time to run Rust Parallel Implementation: 2.570569791016169 seconds


In [7]:
py_duration / rust_parallel

7.246860543172154