# Top Spending Cohorts

In [1]:
import polars as pl
from datetime import datetime, timedelta
from sys import exit

# Assuming claims_df is a Polars DataFrame with the following schema:
# - cohort_id: identifier for the cohort (string)
# - cohort_name: human-readable cohort name (string)
# - claim_date: the date of the claim (date/datetime)
# - claim_amount: the dollar amount of the claim (float)


def compute_top_spending_cohorts(claims_df: pl.DataFrame, reference_date: str, number_of_rows: int = 3) -> pl.DataFrame:
    """
    Compute the top 3 outlier cohorts based on historical (up to last 24-months) spend.

    Parameters
    ----------
    claims_df : pl.DataFrame
        A DataFrame containing claims with columns: cohort_id, cohort_name, claim_date, claim_amount.
    reference_date : str
        A YYYY-MM-DD string indicating the month for which to compute outliers (e.g., '2025-04-30').
    number_of_rows : int
        The number of rows to return (default is 3).

    Returns
    -------
    pl.DataFrame
        A DataFrame with top 3 outlier cohorts sorted by spend, containing columns:
        cohort_id, cohort_name, spend
    """

    # Parse reference_date into datetime object
    reference_date = datetime.strptime(reference_date, "%Y-%m-%d")
    twenty_four_months_ago_from_reference_date = (reference_date - timedelta(days=365*2)).replace(day=1)

    # Create a LazyFrame for performance
    lf = claims_df.lazy()

    # 1) Extract historical (up to last 24-months) data
    previous_24_months_to_current_date = (
        lf
        .with_columns(
            pl.col("claim_date").cast(pl.Date).alias("date")
        )
        .filter(
            (pl.col("date") >= pl.lit(twenty_four_months_ago_from_reference_date)) &
            (pl.col("date") <= pl.lit(reference_date))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            pl.sum("claim_amount").alias("spend")
        )
        .sort(pl.col("spend"), descending=True, nulls_last=True)
        .limit(number_of_rows)
    )

    # Collect to materialize results
    return previous_24_months_to_current_date

In [2]:
from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Diseases ICD Level 1: {compute_top_spending_cohorts(get_cohort_diseases_icd_level_1('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")
    print(f"Cohort Diseases ICD Level 2: {compute_top_spending_cohorts(get_cohort_diseases_icd_level_2('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")
    print(f"Cohort Diseases Chronic: {compute_top_spending_cohorts(get_cohort_diseases_chronic('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")
    print(f"Cohort Diseases Trigger Level 2: {compute_top_spending_cohorts(get_cohort_diseases_trigger_level_2('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")


Cohort Diseases ICD Level 1: shape: (10, 2)
┌─────────────────────────────────────────────────────────────────────────────────────────┬───────────┐
│ cohort_name                                                                             ┆     spend │
│ ---                                                                                     ┆       --- │
│ str                                                                                     ┆       f64 │
╞═════════════════════════════════════════════════════════════════════════════════════════╪═══════════╡
│ Factors influencing health status and contact with health services                      ┆ 960231.01 │
│ Diseases of the musculoskeletal system and connective tissue                            ┆ 622215.99 │
│ Diseases of the circulatory system                                                      ┆ 502801.34 │
│ Diseases of the digestive system                                                        ┆ 485175.11 │
│ Injury, poisoning 

In [3]:
from cohorts_procedures import get_cohort_procedures

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Procedures: {compute_top_spending_cohorts(get_cohort_procedures('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")


Cohort Procedures: shape: (10, 2)
┌──────────────────────────────────────────────────┬────────────┐
│ cohort_name                                      ┆      spend │
│ ---                                              ┆        --- │
│ str                                              ┆        f64 │
╞══════════════════════════════════════════════════╪════════════╡
│ Medical Care                                     ┆ 3402131.42 │
│ *                                                ┆  700273.54 │
│ Surgery                                          ┆  443115.53 │
│ Diagnostic Laboratory                            ┆  132759.28 │
│ Ambulance                                        ┆  100004.96 │
│ Outpatient Mental Health Treatment Limitation    ┆   98707.98 │
│ Anesthesia                                       ┆   96844.69 │
│ Diagnostic Radiology                             ┆   70014.20 │
│ Lump Sum Purchase of DME, Prosthetics, Orthotics ┆   42642.52 │
│ Other Medical Items or Services         

In [4]:
from cohorts_facilities import get_cohort_facilities
pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_top_spending_cohorts(get_cohort_facilities('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")

Cohort Facilities: shape: (10, 2)
┌───────────────────────────────┬────────────┐
│ cohort_name                   ┆      spend │
│ ---                           ┆        --- │
│ str                           ┆        f64 │
╞═══════════════════════════════╪════════════╡
│ Ambulatory Surgical Center    ┆ 1385555.15 │
│ On Campus-Outpatient Hospital ┆ 1185741.84 │
│ Office                        ┆ 1136300.99 │
│ Inpatient Hospital            ┆  712230.02 │
│ Emergency Room  Hospital      ┆  451335.23 │
│ Ambulance - Air or Water      ┆   88623.50 │
│ Home                          ┆   71591.96 │
│ Independent Laboratory        ┆   43799.67 │
│ Urgent Care Facility          ┆   41894.18 │
│ Unassigned                    ┆   20488.07 │
└───────────────────────────────┴────────────┘


In [5]:
from cohorts_drugs import get_cohort_drugs_usage
pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_top_spending_cohorts(get_cohort_drugs_usage('FT').collect(), '2025-04-30', number_of_rows=10).collect()}")

Cohort Facilities: shape: (10, 2)
┌───────────────────────────────────────────────────┬───────────┐
│ cohort_name                                       ┆     spend │
│ ---                                               ┆       --- │
│ str                                               ┆       f64 │
╞═══════════════════════════════════════════════════╪═══════════╡
│ Immunosuppressants                                ┆ 554816.28 │
│ Drugs Used In Diabetes                            ┆ 447198.01 │
│ Other Dermatological Preparations                 ┆ 243785.53 │
│ Drugs For Obstructive Airway Diseases             ┆ 129993.32 │
│ Antithrombotic Agents                             ┆  65000.72 │
│ Antineoplastic Agents                             ┆  62692.24 │
│ Analgesics                                        ┆  40968.96 │
│ Psychoanaleptics                                  ┆  39691.11 │
│ Sex Hormones And Modulators Of The Genital System ┆  37280.68 │
│ Lipid Modifying Agents                  

# Surge in Spending Cohorts

In [6]:
import polars as pl
from datetime import datetime, timedelta
from sys import exit
# Assuming claims_df is a Polars DataFrame with the following schema:
# - cohort_id: identifier for the cohort (string)
# - cohort_name: human-readable cohort name (string)
# - claim_date: the date of the claim (date/datetime)
# - claim_amount: the dollar amount of the claim (float)


def compute_outlier_cohorts(claims_df: pl.DataFrame, reference_date: str, number_of_rows: int = 3, include_infinity_percentage: bool = False) -> pl.DataFrame:
    """
    Compute the top 3 outlier cohorts based on current month spend vs. last 24-month average spend.

    Parameters
    ----------
    claims_df : pl.DataFrame
        A DataFrame containing claims with columns: cohort_id, cohort_name, claim_date, claim_amount.
    reference_date : str
        A YYYY-MM-DD string indicating the month for which to compute outliers (e.g., '2025-04-30').
    number_of_rows : int
        The number of rows to return (default is 3).
    include_infinity_percentage : bool
        Whether to include cohorts with infinite percentage increase, observed with the average of historical spend is zero(default is False).

    Returns
    -------
    pl.DataFrame
        A DataFrame with top 3 outlier cohorts sorted by percent increase, containing columns:
        cohort_id, cohort_name, current_month_spend, avg_24m_spend, pct_increase
    """
    
    # Parse reference_date into datetime object
    reference_date = datetime.strptime(reference_date, "%Y-%m-%d")
    last_30_days_from_reference_date = (reference_date - timedelta(days=30))
    twenty_four_months_ago_from_reference_date = (reference_date - timedelta(days=365*2)).replace(day=1)

    # Create a LazyFrame for performance
    lf = claims_df.lazy()

    # 1) Extract current month data
    current_month = (
        lf
        .with_columns(
            pl.col("claim_date").cast(pl.Date).alias("date")
        )
        .filter(
            (pl.col("date") >= pl.lit(last_30_days_from_reference_date)) &
            (pl.col("date") <= pl.lit(reference_date))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            pl.sum("claim_amount").alias("current_month_spend")
        )
        .filter(pl.col("current_month_spend") > 0)
    )

    # 2) Compute 24-month window prior to reference month
    historical_24m = (
        lf
        .with_columns(
            pl.col("claim_date").cast(pl.Date).alias("date")
        )
        .filter(
            (pl.col("date") >= pl.lit(twenty_four_months_ago_from_reference_date)) &
            (pl.col("date") <= pl.lit(reference_date))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            (pl.sum("claim_amount") / 24).alias("avg_24m_spend")
        )
        .with_columns(pl.col("avg_24m_spend").fill_null(0.0).fill_nan(0.0))
    )

    # 3) Join current month and historical averages
    if include_infinity_percentage:
        outlier_df = (
            current_month
            .join(historical_24m, on=["cohort_name"], how="left")
            .with_columns(
                pl.when(pl.col("avg_24m_spend") != 0)
                .then(((pl.col("current_month_spend") - pl.col("avg_24m_spend")) / 
                    pl.col("avg_24m_spend")) * 100)
                .otherwise(float("inf"))
                .alias("pct_increase")
            )
            .sort(["pct_increase"], descending=True, nulls_last=True)
            .limit(number_of_rows)
        )
    else:
        outlier_df = (
            current_month
            .join(historical_24m, on=["cohort_name"], how="left")
            .with_columns(
                pl.when(pl.col("avg_24m_spend") != 0)
                .then(((pl.col("current_month_spend") - pl.col("avg_24m_spend")) / 
                    pl.col("avg_24m_spend")) * 100)
                .otherwise(float("inf"))
                .alias("pct_increase")
            )
            .filter(pl.col("pct_increase") != float("inf"))
            .sort(["pct_increase"], descending=True, nulls_last=True)
            .limit(number_of_rows)
        )

    # Collect to materialize results
    return outlier_df

In [7]:
from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Diseases ICD Level 1: {compute_outlier_cohorts(get_cohort_diseases_icd_level_1('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases ICD Level 1: {compute_outlier_cohorts(get_cohort_diseases_icd_level_1('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Diseases ICD Level 2: {compute_outlier_cohorts(get_cohort_diseases_icd_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases ICD Level 2: {compute_outlier_cohorts(get_cohort_diseases_icd_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Diseases Chronic: {compute_outlier_cohorts(get_cohort_diseases_chronic('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases Chronic: {compute_outlier_cohorts(get_cohort_diseases_chronic('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Diseases Trigger Level 2: {compute_outlier_cohorts(get_cohort_diseases_trigger_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases Trigger Level 2: {compute_outlier_cohorts(get_cohort_diseases_trigger_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")


Cohort Diseases ICD Level 1: shape: (10, 4)
┌─────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                                                                                         ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                                                 ┆                 --- ┆           --- ┆          --- │
│ str                                                                                                 ┆                 f64 ┆           f64 ┆          f64 │
╞═════════════════════════════════════════════════════════════════════════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism ┆             2907.85 ┆        656.56 ┆       342.89 │
│ Diseases of 

In [8]:
from cohorts_procedures import get_cohort_procedures

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Procedures: {compute_outlier_cohorts(get_cohort_procedures('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Procedures: {compute_outlier_cohorts(get_cohort_procedures('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Procedures: shape: (10, 4)
┌───────────────────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                                   ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                           ┆                 --- ┆           --- ┆          --- │
│ str                                           ┆                 f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Occupational Therapy                          ┆               18.70 ┆         19.00 ┆        -1.59 │
│ Outpatient Mental Health Treatment Limitation ┆             3223.96 ┆       4112.83 ┆       -21.61 │
│ Other Medical Items or Services               ┆             1132.59 ┆       1451.72 ┆       -21.98 │
│ Diagnostic Radiology                          ┆             2015.44 ┆       2917.26 ┆       -30.91 │
│ Anesthesia                           

In [9]:
from cohorts_facilities import get_cohort_facilities
pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_facilities('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_facilities('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                       ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                               ┆                 --- ┆           --- ┆          --- │
│ str                               ┆                 f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Federally Qualified Health Center ┆              110.50 ┆         38.21 ┆       189.18 │
│ Off Campus-Outpatient Hospital    ┆              180.90 ┆         80.31 ┆       125.25 │
│ Telehealth                        ┆              728.68 ┆        715.59 ┆         1.83 │
│ On Campus-Outpatient Hospital     ┆            26133.08 ┆      49405.91 ┆       -47.11 │
│ Office                            ┆            22702.75 ┆      47345.87 ┆       -52.05 │
│ Home                              ┆             1298.8

In [10]:
from cohorts_drugs import get_cohort_drugs_usage
pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_drugs_usage('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_drugs_usage('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────────────────────────────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name                                                           ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                   ┆                 --- ┆           --- ┆          --- │
│ str                                                                   ┆                 f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════════════════════════════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ Antivirals For Systemic Use                                           ┆             8273.31 ┆        385.71 ┆      2044.98 │
│ Antihemorrhagics                                                      ┆               32.95 ┆          2.10 ┆      1467.80 │
│ Diuretics                                                             ┆    

In [11]:
from cohorts_demographics import get_cohort_demographics_ages, get_cohort_demographics_relationships, get_cohort_demographics_genders

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_demographics_ages('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_demographics_ages('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_demographics_genders('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_demographics_genders('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_demographics_relationships('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_demographics_relationships('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (5, 4)
┌─────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆                 --- ┆           --- ┆          --- │
│ str         ┆                 f64 ┆           f64 ┆          f64 │
╞═════════════╪═════════════════════╪═══════════════╪══════════════╡
│ 27-40       ┆            11763.84 ┆       6224.83 ┆        88.98 │
│ 0-18        ┆             4540.18 ┆       3998.86 ┆        13.54 │
│ 41-64       ┆            48047.24 ┆      43218.98 ┆        11.17 │
│ 65+         ┆             9950.81 ┆      18320.89 ┆       -45.69 │
│ 19-26       ┆             1566.21 ┆       9283.81 ┆       -83.13 │
└─────────────┴─────────────────────┴───────────────┴──────────────┘
Cohort Facilities: shape: (5, 4)
┌─────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆                 --- ┆

In [12]:
from cohorts_providers import get_cohort_individual_medical_provider, get_cohort_medical_provider_speciality, get_cohort_individual_rx_provider

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_individual_medical_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_individual_medical_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_individual_rx_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_individual_rx_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_medical_provider_speciality('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_outlier_cohorts(get_cohort_medical_provider_speciality('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (10, 4)
┌──────────────────┬─────────────────────┬───────────────┬──────────────┐
│ cohort_name      ┆ current_month_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---              ┆                 --- ┆           --- ┆          --- │
│ str              ┆                 f64 ┆           f64 ┆          f64 │
╞══════════════════╪═════════════════════╪═══════════════╪══════════════╡
│ BRENDA OFFERDAHL ┆              165.20 ┆          6.88 ┆      2300.00 │
│ TYLER THORSON    ┆              110.50 ┆          4.60 ┆      2300.00 │
│ JULIE JOHNSON    ┆              210.03 ┆          8.75 ┆      2300.00 │
│ STEVE MALAND     ┆              880.00 ┆         36.67 ┆      2300.00 │
│ KRYSTAL RAY      ┆              169.00 ┆          7.04 ┆      2300.00 │
│ MICHELLE BENDER  ┆              267.64 ┆         11.15 ┆      2300.00 │
│ DESTINI NAPIER   ┆              123.08 ┆          5.13 ┆      2300.00 │
│ ANDREA KOSTY     ┆              231.92 ┆          9.66 ┆      2300.00 │
│ KE

# Emerging Spending Categories

In [13]:
import polars as pl
from datetime import datetime, timedelta
from sys import exit
# Assuming claims_df is a Polars DataFrame with the following schema:
# - cohort_id: identifier for the cohort (string)
# - cohort_name: human-readable cohort name (string)
# - claim_date: the date of the claim (date/datetime)
# - claim_amount: the dollar amount of the claim (float)


def compute_emerging_cohorts(claims_df: pl.DataFrame, reference_date: str, number_of_rows: int = 3, include_infinity_percentage: bool = False) -> pl.DataFrame:
    """
    Compute the top 3 outlier cohorts based on last 3 months average spend vs. last 24 months average spend.

    Parameters
    ----------
    claims_df : pl.DataFrame
        A DataFrame containing claims with columns: cohort_id, cohort_name, claim_date, claim_amount.
    reference_date : str
        A YYYY-MM-DD string indicating the month for which to compute outliers (e.g., '2025-04-30').
    number_of_rows : int
        The number of rows to return (default is 3).
    include_infinity_percentage : bool
        Whether to include cohorts with infinite percentage increase, observed with the average of historical spend is zero(default is False).

    Returns
    -------
    pl.DataFrame
        A DataFrame with top 3 outlier cohorts sorted by percent increase, containing columns:
        cohort_id, cohort_name, last_3m_avg_spend, historical_24m, pct_increase
    """

    # Parse reference_date into datetime object
    reference_date = datetime.strptime(reference_date, "%Y-%m-%d")
    last_90_days_from_reference_date = (reference_date - timedelta(days=90))
    twenty_four_months_ago_from_reference_date = (reference_date - timedelta(days=365*2)).replace(day=1)

    # Create a LazyFrame for performance
    lf = claims_df.lazy()

    # 1) Extract last 3 months average data
    last_3m_avg_spend = (
        lf
        .with_columns(
            pl.col("claim_date").cast(pl.Date).alias("date")
        )
        .filter(
            (pl.col("date") >= pl.lit(last_90_days_from_reference_date)) &
            (pl.col("date") <= pl.lit(reference_date))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            (pl.sum("claim_amount") / 3).alias("last_3m_avg_spend")
        )
        .with_columns(pl.col("last_3m_avg_spend").fill_null(0.0).fill_nan(0.0))
        .filter(pl.col("last_3m_avg_spend") > 0)
    )

    # 2) Compute 24-month window prior to reference month
    historical_24m = (
        lf
        .with_columns(
            pl.col("claim_date").cast(pl.Date).alias("date")
        )
        .filter(
            (pl.col("date") >= pl.lit(twenty_four_months_ago_from_reference_date)) &
            (pl.col("date") < pl.lit(reference_date))
        )
        .group_by(pl.col("cohort_name"))
        .agg(
            (pl.sum("claim_amount") / 24).alias("avg_24m_spend")
        )
        .with_columns(pl.col("avg_24m_spend").fill_null(0.0).fill_nan(0.0))
    )

    # 3) Join last 3 months average and historical averages
    if include_infinity_percentage:
        outlier_df = (
        last_3m_avg_spend
        .join(historical_24m, on=["cohort_name"], how="left")
        .with_columns(
            pl.when(pl.col("avg_24m_spend") != 0)
            .then(((pl.col("last_3m_avg_spend") - pl.col("avg_24m_spend")) / 
                pl.col("avg_24m_spend")) * 100)
            .otherwise(float("inf"))
            .alias("pct_increase")
        )
        .sort(["pct_increase", "last_3m_avg_spend"], descending=True, nulls_last=True)
        .limit(number_of_rows)
    )
    else:
        outlier_df = (
        last_3m_avg_spend
        .join(historical_24m, on=["cohort_name"], how="left")
        .with_columns(
            pl.when(pl.col("avg_24m_spend") != 0)
            .then(((pl.col("last_3m_avg_spend") - pl.col("avg_24m_spend")) / 
                pl.col("avg_24m_spend")) * 100)
            .otherwise(float("inf"))
            .alias("pct_increase")
        )
        .filter(pl.col("pct_increase") != float("inf"))
        .sort(["pct_increase", "last_3m_avg_spend"], descending=True, nulls_last=True)
        .limit(number_of_rows)
    )
    

    # Collect to materialize results
    return outlier_df

In [14]:
from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Diseases ICD Level 1: {compute_emerging_cohorts(get_cohort_diseases_icd_level_1('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases ICD Level 1: {compute_emerging_cohorts(get_cohort_diseases_icd_level_1('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Diseases ICD Level 2: {compute_emerging_cohorts(get_cohort_diseases_icd_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases ICD Level 2: {compute_emerging_cohorts(get_cohort_diseases_icd_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Diseases Chronic: {compute_emerging_cohorts(get_cohort_diseases_chronic('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases Chronic: {compute_emerging_cohorts(get_cohort_diseases_chronic('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Diseases Trigger Level 2: {compute_emerging_cohorts(get_cohort_diseases_trigger_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Diseases Trigger Level 2: {compute_emerging_cohorts(get_cohort_diseases_trigger_level_2('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")


Cohort Diseases ICD Level 1: shape: (10, 4)
┌─────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                                                                                         ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                                                 ┆               --- ┆           --- ┆          --- │
│ str                                                                                                 ┆               f64 ┆           f64 ┆          f64 │
╞═════════════════════════════════════════════════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Certain infectious and parasitic diseases                                                           ┆           7327.43 ┆       3229.73 ┆       126.87 │
│ Diseases of the blood an

In [15]:
from cohorts_procedures import get_cohort_procedures

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Procedures: {compute_emerging_cohorts(get_cohort_procedures('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Procedures: {compute_emerging_cohorts(get_cohort_procedures('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Procedures: shape: (10, 4)
┌────────────────────────────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                                            ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                    ┆               --- ┆           --- ┆          --- │
│ str                                                    ┆               f64 ┆           f64 ┆          f64 │
╞════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Surgical Dressings or Other Medical Supplies           ┆             72.00 ┆         13.24 ┆       443.84 │
│ Medical Care | Occupational Therapy | Physical Therapy ┆             64.87 ┆         30.81 ┆       110.56 │
│ Lump Sum Purchase of DME, Prosthetics, Orthotics       ┆           2578.44 ┆       1776.77 ┆        45.12 │
│ Surgery | Occupational Therapy | Physical Therapy      ┆             44.08 ┆        

In [16]:
from cohorts_facilities import get_cohort_facilities
pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_facilities('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_facilities('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                       ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                               ┆               --- ┆           --- ┆          --- │
│ str                               ┆               f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Federally Qualified Health Center ┆             73.20 ┆         38.21 ┆        91.56 │
│ Emergency Room  Hospital          ┆          26585.90 ┆      18805.63 ┆        41.37 │
│ Off Campus-Outpatient Hospital    ┆            113.27 ┆         80.31 ┆        41.03 │
│ Telehealth                        ┆            947.80 ┆        715.59 ┆        32.45 │
│ Rural Health Clinic               ┆            662.62 ┆        561.94 ┆        17.92 │
│ Urgent Care Facility              ┆           1813.78 ┆       1745.59 ┆   

In [17]:
from cohorts_drugs import get_cohort_drugs_usage
pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_drugs_usage('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_drugs_usage('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (10, 4)
┌───────────────────────────────────────────────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                                                           ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                                                                   ┆               --- ┆           --- ┆          --- │
│ str                                                                   ┆               f64 ┆           f64 ┆          f64 │
╞═══════════════════════════════════════════════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ Antivirals For Systemic Use                                           ┆           2801.05 ┆        385.71 ┆       626.21 │
│ Antihistamines For Systemic Use                                       ┆             13.75 ┆          2.15 ┆       540.65 │
│ Antihemorrhagics                                                      ┆             10.98

In [18]:
from cohorts_demographics import get_cohort_demographics_ages, get_cohort_demographics_relationships, get_cohort_demographics_genders

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_demographics_ages('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_demographics_ages('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_demographics_genders('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_demographics_genders('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_demographics_relationships('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_demographics_relationships('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (5, 4)
┌─────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆               --- ┆           --- ┆          --- │
│ str         ┆               f64 ┆           f64 ┆          f64 │
╞═════════════╪═══════════════════╪═══════════════╪══════════════╡
│ 0-18        ┆          11974.68 ┆       3998.86 ┆       199.45 │
│ 19-26       ┆          22845.95 ┆       9283.81 ┆       146.08 │
│ 27-40       ┆          15092.64 ┆       6224.83 ┆       142.46 │
│ 41-64       ┆          90464.96 ┆      43218.98 ┆       109.32 │
│ 65+         ┆          15403.78 ┆      18320.89 ┆       -15.92 │
└─────────────┴───────────────────┴───────────────┴──────────────┘
Cohort Facilities: shape: (5, 4)
┌─────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---         ┆               --- ┆           --- ┆          --

In [19]:
from cohorts_providers import get_cohort_individual_medical_provider, get_cohort_medical_provider_speciality, get_cohort_individual_rx_provider

pl.Config.set_tbl_width_chars(8000) 
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
with pl.Config(fmt_str_lengths=200, set_fmt_float="full", float_precision=2):

    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_individual_medical_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_individual_medical_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_individual_rx_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_individual_rx_provider('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_medical_provider_speciality('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=True).collect()}")
    print(f"Cohort Facilities: {compute_emerging_cohorts(get_cohort_medical_provider_speciality('FT').collect(), '2025-04-30', number_of_rows=10, include_infinity_percentage=False).collect()}")

Cohort Facilities: shape: (10, 4)
┌─────────────────────────────┬───────────────────┬───────────────┬──────────────┐
│ cohort_name                 ┆ last_3m_avg_spend ┆ avg_24m_spend ┆ pct_increase │
│ ---                         ┆               --- ┆           --- ┆          --- │
│ str                         ┆               f64 ┆           f64 ┆          f64 │
╞═════════════════════════════╪═══════════════════╪═══════════════╪══════════════╡
│ SIERRA MILLER               ┆            227.62 ┆         28.45 ┆       700.00 │
│ ERIN STORCK                 ┆            101.81 ┆         12.73 ┆       700.00 │
│ ANDREA KOSTY                ┆             77.31 ┆          9.66 ┆       700.00 │
│ DESTINI NAPIER              ┆             41.03 ┆          5.13 ┆       700.00 │
│ MICHAEL WALKER              ┆             30.83 ┆          3.85 ┆       700.00 │
│ RAYMOND EDWARDS             ┆             10.13 ┆          1.27 ┆       700.00 │
│ NATHAN SEEDALL              ┆           1079.49 ┆  