# Top Spending Cohorts

In [42]:
import polars as pl
from datetime import datetime, timedelta
from sys import exit

# Assuming claims_df is a Polars DataFrame with the following schema:
# - cohort_id: identifier for the cohort (string)
# - cohort_name: human-readable cohort name (string)
# - claim_date: the date of the claim (date/datetime)
# - claim_amount: the dollar amount of the claim (float)


def compute_top_spending_cohorts(claims_df: pl.DataFrame, reference_date: str, number_of_rows: int = 3) -> pl.LazyFrame:
    """
    Rank cohorts by their total spend over the historical (roughly 24-month) window.

    Parameters
    ----------
    claims_df : pl.DataFrame
        Claims data. Only the columns cohort_name, claim_date and claim_amount are read.
    reference_date : str
        A YYYY-MM-DD string marking the inclusive end of the window (e.g., '2025-04-30').
    number_of_rows : int
        The maximum number of cohorts to return (default is 3).

    Returns
    -------
    pl.LazyFrame
        A lazy (not yet collected) query producing the columns cohort_name and spend,
        sorted by spend in descending order with nulls last and limited to
        number_of_rows rows. Call ``.collect()`` on it to obtain a DataFrame.

    Raises
    ------
    ValueError
        If reference_date does not match the YYYY-MM-DD format.
    """

    window_end = datetime.strptime(reference_date, "%Y-%m-%d")
    # Step back 730 days, then snap to the first day of that month so the
    # earliest month of the window is included from its start.
    window_start = (window_end - timedelta(days=365 * 2)).replace(day=1)

    # Work lazily so the filter/aggregate/sort run as one optimized query.
    dated_claims = claims_df.lazy().with_columns(
        pl.col("claim_date").cast(pl.Date).alias("date")
    )

    # Both window bounds are inclusive.
    inside_window = (
        (pl.col("date") >= pl.lit(window_start)) &
        (pl.col("date") <= pl.lit(window_end))
    )

    top_spenders = (
        dated_claims
        .filter(inside_window)
        .group_by(pl.col("cohort_name"))
        .agg(
            pl.sum("claim_amount").alias("spend")
        )
        .sort(pl.col("spend"), descending=True, nulls_last=True)
        .limit(number_of_rows)
    )

    # The query is returned un-collected; callers materialize it with .collect().
    return top_spenders

In [48]:
from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Only the trigger-level-2 grouping is ranked here. The ICD level 1,
    # ICD level 2 and chronic groupings imported above can be ranked the same
    # way by swapping the getter and the label.
    trigger_level_2_claims = get_cohort_diseases_trigger_level_2('PS').collect()
    top_trigger_level_2 = compute_top_spending_cohorts(
        trigger_level_2_claims,
        '2025-04-30',
        number_of_rows=10,
    ).collect()
    print(f"Cohort Diseases Trigger Level 2: {top_trigger_level_2}")


Cohort Diseases Trigger Level 2: shape: (10, 2)
┌───────────────────────────────────────────────────────────────────────────────────────────┬───────────┐
│ cohort_name                                                                               ┆     spend │
│ ---                                                                                       ┆       --- │
│ str                                                                                       ┆       f64 │
╞═══════════════════════════════════════════════════════════════════════════════════════════╪═══════════╡
│ Malignant neoplasms                                                                       ┆ 835179.42 │
│ Osteoarthritis                                                                            ┆ 330809.30 │
│ Intracranial hemorrhage of newborn                                                        ┆ 296877.87 │
│ Other forms of heart disease                                                              ┆ 262405.77 

In [51]:
from cohorts_procedures import get_cohort_procedures

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Rank procedure cohorts by total spend and print the ten largest.
    procedure_claims = get_cohort_procedures('PS').collect()
    top_procedures = compute_top_spending_cohorts(
        procedure_claims,
        '2025-04-30',
        number_of_rows=10,
    ).collect()
    print(f"Cohort Procedures: {top_procedures}")


Cohort Procedures: shape: (10, 2)
┌──────────────────────────────────────────────────┬────────────┐
│ cohort_name                                      ┆      spend │
│ ---                                              ┆        --- │
│ str                                              ┆        f64 │
╞══════════════════════════════════════════════════╪════════════╡
│ Medical Care                                     ┆ 7003495.89 │
│ *                                                ┆ 1081121.21 │
│ Surgery                                          ┆  622344.20 │
│ Lump Sum Purchase of DME, Prosthetics, Orthotics ┆  334498.46 │
│ Ambulance                                        ┆  277621.42 │
│ Outpatient Mental Health Treatment Limitation    ┆  276381.25 │
│ Diagnostic Laboratory                            ┆  257553.67 │
│ Anesthesia                                       ┆  201888.25 │
│ Vision Items or Services                         ┆  157277.85 │
│ Diagnostic Radiology                    

In [52]:
from cohorts_facilities import get_cohort_facilities
# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Rank facility cohorts by total spend and print the ten largest.
    facility_claims = get_cohort_facilities('PS').collect()
    top_facilities = compute_top_spending_cohorts(
        facility_claims,
        '2025-04-30',
        number_of_rows=10,
    ).collect()
    print(f"Cohort Facilities: {top_facilities}")

Cohort Facilities: shape: (10, 2)
┌───────────────────────────────┬────────────┐
│ cohort_name                   ┆      spend │
│ ---                           ┆        --- │
│ str                           ┆        f64 │
╞═══════════════════════════════╪════════════╡
│ Inpatient Hospital            ┆ 2692243.47 │
│ Office                        ┆ 2192130.49 │
│ Ambulatory Surgical Center    ┆ 1975156.61 │
│ On Campus-Outpatient Hospital ┆ 1700201.71 │
│ Emergency Room  Hospital      ┆  896815.16 │
│ Home                          ┆  329923.65 │
│ Ambulance - Air or Water      ┆  276301.86 │
│ Independent Laboratory        ┆  131550.76 │
│ Urgent Care Facility          ┆  108035.00 │
│ ESRD Treatment Facility       ┆  104126.93 │
└───────────────────────────────┴────────────┘


In [54]:
from cohorts_drugs import get_cohort_drugs_usage
# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Rank drug-usage cohorts by total spend and print the ten largest.
    # The label names the drug cohorts; it previously read "Cohort Facilities",
    # copied from the facilities cell.
    drug_claims = get_cohort_drugs_usage('PS').collect()
    top_drugs = compute_top_spending_cohorts(
        drug_claims,
        '2025-04-30',
        number_of_rows=10,
    ).collect()
    print(f"Cohort Drugs: {top_drugs}")

Cohort Facilities: shape: (10, 2)
┌───────────────────────────────────────────────────┬────────────┐
│ cohort_name                                       ┆      spend │
│ ---                                               ┆        --- │
│ str                                               ┆        f64 │
╞═══════════════════════════════════════════════════╪════════════╡
│ Immunosuppressants                                ┆ 2312603.13 │
│ Drugs Used In Diabetes                            ┆  590375.15 │
│ Other Nervous System Drugs                        ┆  187774.15 │
│ Drugs For Obstructive Airway Diseases             ┆  121138.09 │
│ Analgesics                                        ┆   78816.68 │
│ Antithrombotic Agents                             ┆   69373.96 │
│ Psycholeptics                                     ┆   66200.28 │
│ Psychoanaleptics                                  ┆   52587.94 │
│ Sex Hormones And Modulators Of The Genital System ┆   44016.52 │
│ Antineoplastic Agents     

# Surge in Spending Cohorts

In [2]:
import polars as pl
from datetime import datetime, timedelta
from sys import exit
# Assuming claims_df is a Polars DataFrame with the following schema:
# - cohort_id: identifier for the cohort (string)
# - cohort_name: human-readable cohort name (string)
# - claim_date: the date of the claim (date/datetime)
# - claim_amount: the dollar amount of the claim (float)


def compute_outlier_cohorts(claims_df: pl.DataFrame, reference_date: str, number_of_rows: int = 3, include_infinity_percentage: bool = False) -> pl.LazyFrame:
    """
    Rank cohorts by how much their recent spend exceeds their 24-month average spend.

    "Current month" spend is the total over the 30 days up to and including
    reference_date. The historical average is the total spend over the
    historical window divided by a fixed 24, regardless of how many months
    actually contain claims. The historical window also ends on reference_date
    (inclusive), so it contains the current-month window; a cohort whose only
    spend is recent therefore shows an increase of exactly 2300%.

    Parameters
    ----------
    claims_df : pl.DataFrame
        Claims data. Only the columns cohort_name, claim_date and claim_amount are read.
    reference_date : str
        A YYYY-MM-DD string marking the inclusive end of both windows (e.g., '2025-04-30').
    number_of_rows : int
        The maximum number of cohorts to return (default is 3).
    include_infinity_percentage : bool
        Whether to keep cohorts whose percentage increase is infinite, which happens
        when the historical average spend is zero (default is False).

    Returns
    -------
    pl.LazyFrame
        A lazy (not yet collected) query producing the columns cohort_name,
        current_month_spend, avg_24m_spend and pct_increase, sorted by pct_increase
        in descending order with nulls last and limited to number_of_rows rows.
        Call ``.collect()`` on it to obtain a DataFrame.

    Raises
    ------
    ValueError
        If reference_date does not match the YYYY-MM-DD format.
    """

    window_end = datetime.strptime(reference_date, "%Y-%m-%d")
    recent_start = window_end - timedelta(days=30)
    # Step back 730 days, then snap to the first day of that month.
    history_start = (window_end - timedelta(days=365 * 2)).replace(day=1)

    # Work lazily; the date cast is shared by both aggregations.
    dated_claims = claims_df.lazy().with_columns(
        pl.col("claim_date").cast(pl.Date).alias("date")
    )
    on_or_before_reference = pl.col("date") <= pl.lit(window_end)

    # 1) Spend in the trailing 30-day window; cohorts without positive spend are dropped.
    current_month = (
        dated_claims
        .filter((pl.col("date") >= pl.lit(recent_start)) & on_or_before_reference)
        .group_by(pl.col("cohort_name"))
        .agg(
            pl.sum("claim_amount").alias("current_month_spend")
        )
        .filter(pl.col("current_month_spend") > 0)
    )

    # 2) Average monthly spend over the historical window (total / 24).
    # NOTE(review): this window includes the current-month window above even
    # though it is meant as the baseline - confirm that overlap is intended.
    historical_24m = (
        dated_claims
        .filter((pl.col("date") >= pl.lit(history_start)) & on_or_before_reference)
        .group_by(pl.col("cohort_name"))
        .agg(
            (pl.sum("claim_amount") / 24).alias("avg_24m_spend")
        )
        .with_columns(pl.col("avg_24m_spend").fill_null(0.0).fill_nan(0.0))
    )

    # 3) Percentage change against the average; a zero average would divide by
    # zero, so those cohorts are marked with +inf instead.
    pct_increase = (
        pl.when(pl.col("avg_24m_spend") != 0)
        .then(((pl.col("current_month_spend") - pl.col("avg_24m_spend")) /
            pl.col("avg_24m_spend")) * 100)
        .otherwise(float("inf"))
        .alias("pct_increase")
    )

    outlier_lf = (
        current_month
        .join(historical_24m, on=["cohort_name"], how="left")
        .with_columns(pct_increase)
    )

    # The two modes differ only in whether the +inf rows are kept.
    if not include_infinity_percentage:
        outlier_lf = outlier_lf.filter(pl.col("pct_increase") != float("inf"))

    # The query is returned un-collected; callers materialize it with .collect().
    return (
        outlier_lf
        .sort(["pct_increase"], descending=True, nulls_last=True)
        .limit(number_of_rows)
    )

In [21]:
from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Only the trigger-level-2 grouping is reported here. The ICD level 1,
    # ICD level 2 and chronic groupings imported above can be run the same way
    # by swapping the getter and the label.
    # The first table keeps infinite increases; the second drops them.
    for include_infinite in (True, False):
        trigger_level_2_outliers = compute_outlier_cohorts(
            get_cohort_diseases_trigger_level_2('PS').collect(),
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinite,
        ).collect()
        print(f"Cohort Diseases Trigger Level 2: {trigger_level_2_outliers}")


Cohort Diseases Trigger Level 2: shape: (10, 4)
┌───────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                                                                               ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                                       ┆                 --- ┆           --- ┆          --- │
│ str                                                                                       ┆                 f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════════════════════════════════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Labor and delivery complicated by umbilical cord complications                            ┆             3028.93 ┆        412.26 ┆       634.71 │
│ Certain disorders involving the immune mechanism                    

In [23]:
from cohorts_procedures import get_cohort_procedures

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # The first table keeps infinite increases; the second drops them.
    for include_infinite in (True, False):
        procedure_outliers = compute_outlier_cohorts(
            get_cohort_procedures('PS').collect(),
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinite,
        ).collect()
        print(f"Cohort Procedures: {procedure_outliers}")

Cohort Procedures: shape: (10, 4)
┌─────────────────────────────────────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                                                     ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                             ┆                 --- ┆           --- ┆          --- │
│ str                                                             ┆                 f64 ┆           f64 ┆          f64 │
╞═════════════════════════════════════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Other Medical Items or Services                                 ┆             2435.31 ┆       2432.07 ┆         0.13 │
│ Outpatient Mental Health Treatment Limitation                   ┆            11200.29 ┆      11515.89 ┆        -2.74 │
│ Medical Care | Lump Sum Purchase of DME, Prosthetics, Orthotics ┆              146.65 ┆        190.88 ┆       -23.17 

In [24]:
from cohorts_facilities import get_cohort_facilities
# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # The first table keeps infinite increases; the second drops them.
    for include_infinite in (True, False):
        facility_outliers = compute_outlier_cohorts(
            get_cohort_facilities('PS').collect(),
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinite,
        ).collect()
        print(f"Cohort Facilities: {facility_outliers}")

Cohort Facilities: shape: (10, 4)
┌────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                    ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                            ┆                 --- ┆           --- ┆          --- │
│ str                            ┆                 f64 ┆           f64 ┆          f64 │
╞════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Rural Health Clinic            ┆              546.06 ┆        117.47 ┆       364.86 │
│ Unassigned                     ┆             3807.56 ┆       2479.36 ┆        53.57 │
│ Telehealth                     ┆             1990.41 ┆       1508.20 ┆        31.97 │
│ Office                         ┆            51346.19 ┆      91444.81 ┆       -43.85 │
│ Urgent Care Facility           ┆             2340.16 ┆       4501.46 ┆       -48.01 │
│ ESRD Treatment Facility        ┆             2161.40 ┆       4338.62 ┆       -50.18 

In [25]:
from cohorts_drugs import get_cohort_drugs_usage
# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # The label names the drug cohorts; it previously read "Cohort Facilities",
    # copied from the facilities cell.
    # The first table keeps infinite increases; the second drops them.
    for include_infinite in (True, False):
        drug_outliers = compute_outlier_cohorts(
            get_cohort_drugs_usage('PS').collect(),
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinite,
        ).collect()
        print(f"Cohort Drugs: {drug_outliers}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────────────────────────────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                                                                           ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                                   ┆                 --- ┆           --- ┆          --- │
│ str                                                                                   ┆                 f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════════════════════════════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Antianemic PreparationsAntiinflammatory And Antirheumatic ProductsThroat Preparations ┆                2.37 ┆          0.20 ┆      1070.37 │
│ Drugs For Functional Gastrointestinal Disorders                                       ┆               54.9

In [3]:
from cohorts_demographics import get_cohort_demographics_ages, get_cohort_demographics_relationships, get_cohort_demographics_genders

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Each table is labelled with the demographic grouping it reports; all six
    # prints previously carried the copy-pasted "Cohort Facilities" label.
    demographic_cohorts = [
        ("Cohort Demographics Ages", get_cohort_demographics_ages),
        ("Cohort Demographics Genders", get_cohort_demographics_genders),
        ("Cohort Demographics Relationships", get_cohort_demographics_relationships),
    ]
    for label, get_cohort in demographic_cohorts:
        # The first table keeps infinite increases; the second drops them.
        for include_infinite in (True, False):
            demographic_outliers = compute_outlier_cohorts(
                get_cohort('PS').collect(),
                '2025-04-30',
                number_of_rows=10,
                include_infinity_percentage=include_infinite,
            ).collect()
            print(f"{label}: {demographic_outliers}")

Cohort Facilities: shape: (5, 4)
┌─────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆                 --- ┆           --- ┆          --- │
│ str         ┆                 f64 ┆           f64 ┆          f64 │
╞═════════════╪═════════════════════╪═══════════════╪══════════════╡
│ 27-40       ┆            39565.09 ┆     100778.49 ┆       -60.74 │
│ 19-26       ┆             8247.21 ┆      26932.11 ┆       -69.38 │
│ 65+         ┆             5330.14 ┆      19173.22 ┆       -72.20 │
│ 41-64       ┆            48007.41 ┆     194667.56 ┆       -75.34 │
│ 0-18        ┆            16112.44 ┆      95361.34 ┆       -83.10 │
└─────────────┴─────────────────────┴───────────────┴──────────────┘
Cohort Facilities: shape: (5, 4)
┌─────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆                 --- ┆

In [4]:
from cohorts_providers import get_cohort_individual_medical_provider, get_cohort_medical_provider_speciality, get_cohort_individual_rx_provider

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Each table is labelled with the provider grouping it reports; all six
    # prints previously carried the copy-pasted "Cohort Facilities" label.
    # NOTE(review): the individual-provider tables print provider names -
    # check the rendered output for PII before sharing the notebook.
    provider_cohorts = [
        ("Cohort Individual Medical Provider", get_cohort_individual_medical_provider),
        ("Cohort Individual Rx Provider", get_cohort_individual_rx_provider),
        ("Cohort Medical Provider Speciality", get_cohort_medical_provider_speciality),
    ]
    for label, get_cohort in provider_cohorts:
        # The first table keeps infinite increases; the second drops them.
        for include_infinite in (True, False):
            provider_outliers = compute_outlier_cohorts(
                get_cohort('PS').collect(),
                '2025-04-30',
                number_of_rows=10,
                include_infinity_percentage=include_infinite,
            ).collect()
            print(f"{label}: {provider_outliers}")

Cohort Facilities: shape: (10, 4)
┌────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name        ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                ┆                 --- ┆           --- ┆          --- │
│ str                ┆                 f64 ┆           f64 ┆          f64 │
╞════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ ANGELA IRELAND     ┆              117.07 ┆          4.88 ┆      2300.00 │
│ MARILOU OH         ┆              114.62 ┆          4.78 ┆      2300.00 │
│ GREGORY GOSEY      ┆              108.11 ┆          4.50 ┆      2300.00 │
│ CHARLES VON BOSE   ┆              628.27 ┆         26.18 ┆      2300.00 │
│ MARSHALL BROWN     ┆              198.32 ┆          8.26 ┆      2300.00 │
│ WILLIAM KOGLER     ┆              199.66 ┆          8.32 ┆      2300.00 │
│ KATHERINE HASTINGS ┆              120.13 ┆          5.01 ┆      2300.00 │
│ MATTHEW YOUNG      ┆              276.28 ┆         1

# Emerging Spending Categories

In [13]:
import polars as pl
from datetime import datetime, timedelta
from sys import exit
# Assuming claims_df is a Polars DataFrame with the following schema:
# - cohort_id: identifier for the cohort (string)
# - cohort_name: human-readable cohort name (string)
# - claim_date: the date of the claim (date/datetime)
# - claim_amount: the dollar amount of the claim (float)


def compute_emerging_cohorts(claims_df: pl.DataFrame, reference_date: str, number_of_rows: int = 3, include_infinity_percentage: bool = False) -> pl.LazyFrame:
    """
    Rank cohorts by how much their last-3-month average spend exceeds their 24-month average spend.

    The recent average is the total spend over the 90 days up to and including
    reference_date, divided by 3. The historical average is the total spend
    over the historical window divided by a fixed 24. The historical window
    ends strictly before reference_date, so it overlaps the recent window
    except for the reference day itself.

    Parameters
    ----------
    claims_df : pl.DataFrame
        Claims data. Only the columns cohort_name, claim_date and claim_amount are read.
    reference_date : str
        A YYYY-MM-DD string marking the end of both windows (e.g., '2025-04-30').
    number_of_rows : int
        The maximum number of cohorts to return (default is 3).
    include_infinity_percentage : bool
        Whether to keep cohorts whose percentage increase is infinite, which happens
        when the historical average spend is zero (default is False).

    Returns
    -------
    pl.LazyFrame
        A lazy (not yet collected) query producing the columns cohort_name,
        last_3m_avg_spend, avg_24m_spend and pct_increase, sorted by pct_increase
        and then last_3m_avg_spend, both descending with nulls last, and limited to
        number_of_rows rows. Call ``.collect()`` on it to obtain a DataFrame.

    Raises
    ------
    ValueError
        If reference_date does not match the YYYY-MM-DD format.
    """

    window_end = datetime.strptime(reference_date, "%Y-%m-%d")
    recent_start = window_end - timedelta(days=90)
    # Step back 730 days, then snap to the first day of that month.
    history_start = (window_end - timedelta(days=365 * 2)).replace(day=1)

    # Work lazily; the date cast is shared by both aggregations.
    dated_claims = claims_df.lazy().with_columns(
        pl.col("claim_date").cast(pl.Date).alias("date")
    )

    # 1) Average monthly spend over the trailing 90 days (total / 3);
    # cohorts without positive recent spend are dropped.
    last_3m_avg_spend = (
        dated_claims
        .filter(
            (pl.col("date") >= pl.lit(recent_start)) &
            (pl.col("date") <= pl.lit(window_end))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            (pl.sum("claim_amount") / 3).alias("last_3m_avg_spend")
        )
        .with_columns(pl.col("last_3m_avg_spend").fill_null(0.0).fill_nan(0.0))
        .filter(pl.col("last_3m_avg_spend") > 0)
    )

    # 2) Average monthly spend over the historical window (total / 24).
    # The upper bound is strict, so claims dated exactly on reference_date
    # count towards the recent average only.
    # NOTE(review): apart from that day, this window still contains the recent
    # 90 days - confirm that overlap is intended for a baseline.
    historical_24m = (
        dated_claims
        .filter(
            (pl.col("date") >= pl.lit(history_start)) &
            (pl.col("date") < pl.lit(window_end))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            (pl.sum("claim_amount") / 24).alias("avg_24m_spend")
        )
        .with_columns(pl.col("avg_24m_spend").fill_null(0.0).fill_nan(0.0))
    )

    # 3) Percentage change against the average; a zero average would divide by
    # zero, so those cohorts are marked with +inf instead.
    pct_increase = (
        pl.when(pl.col("avg_24m_spend") != 0)
        .then(((pl.col("last_3m_avg_spend") - pl.col("avg_24m_spend")) /
            pl.col("avg_24m_spend")) * 100)
        .otherwise(float("inf"))
        .alias("pct_increase")
    )

    outlier_lf = (
        last_3m_avg_spend
        .join(historical_24m, on=["cohort_name"], how="left")
        .with_columns(pct_increase)
    )

    # The two modes differ only in whether the +inf rows are kept.
    if not include_infinity_percentage:
        outlier_lf = outlier_lf.filter(pl.col("pct_increase") != float("inf"))

    # The query is returned un-collected; callers materialize it with .collect().
    return (
        outlier_lf
        .sort(["pct_increase", "last_3m_avg_spend"], descending=True, nulls_last=True)
        .limit(number_of_rows)
    )

In [14]:
from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

# Global display settings: very wide tables, right-aligned numeric cells.
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # One pair of tables per disease grouping, in this order.
    disease_cohorts = [
        ("Cohort Diseases ICD Level 1", get_cohort_diseases_icd_level_1),
        ("Cohort Diseases ICD Level 2", get_cohort_diseases_icd_level_2),
        ("Cohort Diseases Chronic", get_cohort_diseases_chronic),
        ("Cohort Diseases Trigger Level 2", get_cohort_diseases_trigger_level_2),
    ]
    for label, get_cohort in disease_cohorts:
        # The first table keeps infinite increases; the second drops them.
        for include_infinite in (True, False):
            emerging_diseases = compute_emerging_cohorts(
                get_cohort('PS').collect(),
                '2025-04-30',
                number_of_rows=10,
                include_infinity_percentage=include_infinite,
            ).collect()
            print(f"{label}: {emerging_diseases}")


Cohort Diseases ICD Level 1: shape: (10, 4)
┌────────────────────────────────────────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                                                        ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                ┆               --- ┆           --- ┆          --- │
│ str                                                                ┆               f64 ┆           f64 ┆          f64 │
╞════════════════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Diseases of the digestive system                                   ┆          39686.24 ┆      25867.59 ┆        53.42 │
│ Diseases of the circulatory system                                 ┆          41204.29 ┆      34675.20 ┆        18.83 │
│ Mental, Behavioral and Neurodevelopmental disorders                ┆          23949.76 ┆      24795.

In [15]:
from cohorts_procedures import get_cohort_procedures

# Notebook-wide table rendering options (these persist after this cell).
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")

# Formatting options scoped to this cell only.
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Materialize the procedures cohort once; the same frame feeds both runs
    # below instead of being re-collected for each one.
    procedures_claims = get_cohort_procedures('PS').collect()

    # Report emerging cohorts with and without include_infinity_percentage.
    for include_infinity in (True, False):
        emerging_procedures = compute_emerging_cohorts(
            procedures_claims,
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinity,
        ).collect()
        print(f"Cohort Procedures: {emerging_procedures}")

Cohort Procedures: shape: (10, 4)
┌─────────────────────────────────────────────────────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                                                                     ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                             ┆               --- ┆           --- ┆          --- │
│ str                                                                             ┆               f64 ┆           f64 ┆          f64 │
╞═════════════════════════════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Medical Care | Lump Sum Purchase of DME, Prosthetics, Orthotics | ESRD Supplies ┆              3.40 ┆          0.44 ┆       668.39 │
│ Surgical Dressings or Other Medical Supplies                                    ┆            112.89 ┆         19.73 ┆       472.18 │
│ Pneumococcal/Flu Va

In [16]:
from cohorts_facilities import get_cohort_facilities
# Notebook-wide table rendering options (these persist after this cell).
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")

# Formatting options scoped to this cell only.
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Materialize the facilities cohort once; the same frame feeds both runs
    # below instead of being re-collected for each one.
    facilities_claims = get_cohort_facilities('PS').collect()

    # Report emerging cohorts with and without include_infinity_percentage.
    for include_infinity in (True, False):
        emerging_facilities = compute_emerging_cohorts(
            facilities_claims,
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinity,
        ).collect()
        print(f"Cohort Facilities: {emerging_facilities}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                       ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                               ┆               --- ┆           --- ┆          --- │
│ str                               ┆               f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Unassigned                        ┆           4154.96 ┆       2479.36 ┆        67.58 │
│ Rural Health Clinic               ┆            182.02 ┆        117.47 ┆        54.95 │
│ Telehealth                        ┆           2225.29 ┆       1508.20 ┆        47.55 │
│ Ambulatory Surgical Center        ┆          87888.01 ┆      81060.79 ┆         8.42 │
│ Pharmacy                          ┆             52.81 ┆         49.98 ┆         5.67 │
│ ESRD Treatment Facility           ┆           4259.37 ┆       4338.62 ┆   

In [17]:
from cohorts_drugs import get_cohort_drugs_usage
# Notebook-wide table rendering options (these persist after this cell).
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")

# Formatting options scoped to this cell only.
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # Materialize the drugs-usage cohort once; the same frame feeds both runs
    # below instead of being re-collected for each one.
    drugs_usage_claims = get_cohort_drugs_usage('PS').collect()

    # Report emerging cohorts with and without include_infinity_percentage.
    # The label names the drugs-usage cohort (it previously read
    # "Cohort Facilities", a copy-paste leftover from the facilities cell).
    for include_infinity in (True, False):
        emerging_drugs_usage = compute_emerging_cohorts(
            drugs_usage_claims,
            '2025-04-30',
            number_of_rows=10,
            include_infinity_percentage=include_infinity,
        ).collect()
        print(f"Cohort Drugs Usage: {emerging_drugs_usage}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────────────────────────────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                                                                           ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                                   ┆               --- ┆           --- ┆          --- │
│ str                                                                                   ┆               f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Drugs For Functional Gastrointestinal Disorders                                       ┆             25.53 ┆          2.91 ┆       776.68 │
│ AnalgesicsCough And Cold Preparations                                                 ┆             16.49 ┆          3

In [18]:
from cohorts_demographics import get_cohort_demographics_ages, get_cohort_demographics_relationships, get_cohort_demographics_genders

# Notebook-wide table rendering options (these persist after this cell).
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")

# Formatting options scoped to this cell only.
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # (label, loader) pairs, in the order the tables are printed. Each label
    # names its own cohort; previously every table was printed as
    # "Cohort Facilities", which made the six outputs indistinguishable.
    demographic_cohorts = [
        ("Cohort Demographics Ages", get_cohort_demographics_ages),
        ("Cohort Demographics Genders", get_cohort_demographics_genders),
        ("Cohort Demographics Relationships", get_cohort_demographics_relationships),
    ]

    for cohort_label, load_cohort in demographic_cohorts:
        # Materialize each cohort once and reuse it for both runs.
        cohort_claims = load_cohort('PS').collect()

        # Report emerging cohorts with and without include_infinity_percentage.
        for include_infinity in (True, False):
            emerging_demographics = compute_emerging_cohorts(
                cohort_claims,
                '2025-04-30',
                number_of_rows=10,
                include_infinity_percentage=include_infinity,
            ).collect()
            print(f"{cohort_label}: {emerging_demographics}")

Cohort Facilities: shape: (5, 4)
┌─────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆               --- ┆           --- ┆          --- │
│ str         ┆               f64 ┆           f64 ┆          f64 │
╞═════════════╪═══════════════════╪═══════════════╪══════════════╡
│ 65+         ┆          25824.07 ┆      19173.22 ┆        34.69 │
│ 41-64       ┆         172037.90 ┆     194667.56 ┆       -11.62 │
│ 27-40       ┆          72701.39 ┆     100778.49 ┆       -27.86 │
│ 0-18        ┆          54507.11 ┆      95361.34 ┆       -42.84 │
│ 19-26       ┆          13704.56 ┆      26932.11 ┆       -49.11 │
└─────────────┴───────────────────┴───────────────┴──────────────┘
Cohort Facilities: shape: (5, 4)
┌─────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆               --- ┆           --- ┆          --

In [20]:
from cohorts_providers import get_cohort_individual_medical_provider, get_cohort_medical_provider_speciality, get_cohort_individual_rx_provider

# Notebook-wide table rendering options (these persist after this cell).
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")

# NOTE(review): the tables printed here list individual provider names;
# consider clearing or redacting the cell output before sharing the notebook.

# Formatting options scoped to this cell only.
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):
    # (label, loader) pairs, in the order the tables are printed. Each label
    # names its own cohort; previously every table was printed as
    # "Cohort Facilities", which made the six outputs indistinguishable.
    provider_cohorts = [
        ("Cohort Individual Medical Provider", get_cohort_individual_medical_provider),
        ("Cohort Individual Rx Provider", get_cohort_individual_rx_provider),
        ("Cohort Medical Provider Speciality", get_cohort_medical_provider_speciality),
    ]

    for cohort_label, load_cohort in provider_cohorts:
        # Materialize each cohort once and reuse it for both runs.
        cohort_claims = load_cohort('PS').collect()

        # Report emerging cohorts with and without include_infinity_percentage.
        for include_infinity in (True, False):
            emerging_providers = compute_emerging_cohorts(
                cohort_claims,
                '2025-04-30',
                number_of_rows=10,
                include_infinity_percentage=include_infinity,
            ).collect()
            print(f"{cohort_label}: {emerging_providers}")

Cohort Facilities: shape: (10, 4)
┌─────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name     ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---             ┆               --- ┆           --- ┆          --- │
│ str             ┆               f64 ┆           f64 ┆          f64 │
╞═════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ MICHAEL LEMMON  ┆            461.44 ┆         57.68 ┆       700.00 │
│ NATHAN HUNSAKER ┆            210.35 ┆         26.29 ┆       700.00 │
│ KYLE PRESCOTT   ┆            202.36 ┆         25.29 ┆       700.00 │
│ LISA MERRIMAN   ┆            109.55 ┆         13.69 ┆       700.00 │
│ KURTIS WOOLF    ┆             90.56 ┆         11.32 ┆       700.00 │
│ KENNETH DITTO   ┆             83.82 ┆         10.48 ┆       700.00 │
│ JACOB SMITH     ┆             82.68 ┆         10.33 ┆       700.00 │
│ SEAN STRINGHAM  ┆             59.78 ┆          7.47 ┆       700.00 │
│ SARAH KOHAL     ┆             54.66 ┆    