In [190]:
# Imports

import polars as pl

from compute_top_spending_cohorts import compute_top_spending_cohorts
from compute_emerging_cohorts import compute_emerging_cohorts
from compute_outlier_cohorts import compute_outlier_cohorts

from cohorts_diseases import get_cohort_diseases_icd_level_1, get_cohort_diseases_icd_level_2, get_cohort_diseases_chronic, get_cohort_diseases_trigger_level_2

from cohorts_procedures import get_cohort_procedures

from cohorts_facilities import get_cohort_facilities

from cohorts_drugs import get_cohort_drugs_usage

from cohorts_demographics import get_cohort_demographics_ages, get_cohort_demographics_relationships, get_cohort_demographics_genders

from cohorts_providers import get_cohort_individual_medical_provider, get_cohort_medical_provider_speciality, get_cohort_individual_rx_provider

from top_n_rows import new_top_n_rows

from get_mongodb_connection import get_mongo_collection_from_env

from insert_into_mongodb import insert_spending_analysis

In [191]:
# Polars display configurations

pl.Config.set_float_precision(2)
pl.Config.set_fmt_float("full")
pl.Config.set_fmt_str_lengths(200)
pl.Config.set_tbl_cell_numeric_alignment("RIGHT")
pl.Config.set_tbl_hide_dataframe_shape(False)
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_width_chars(8000)
pl.Config.set_thousands_separator(",")

polars.config.Config

In [192]:
# Constants

eg_nid = "PS" 
reference_date = "2025-05-05"

mongodb_collection_obj, mongodb_client = get_mongo_collection_from_env()
print(mongodb_collection_obj, mongodb_client)

Successfully connected to MongoDB: admin/cohort_analysis_insights
Collection(Database(MongoClient(host=['10.1.0.4:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', readpreference='primary', tls=False), 'admin'), 'cohort_analysis_insights') MongoClient(host=['10.1.0.4:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', readpreference='primary', tls=False)


# Top Spending Cohorts - Diseases (ICD Level 2)

In [193]:
top_spending_diseases_icd_level_2 = compute_top_spending_cohorts(
    claims_df=get_cohort_diseases_icd_level_2(eg_nid=eg_nid), reference_date=reference_date,
    number_of_rows=5
).collect()

##### Top 5

In [194]:

print(top_spending_diseases_icd_level_2)

shape: (5, 2)
┌────────────────────────────────────────────────────────────────────┬────────────┐
│ cohort_name                                                        ┆      spend │
│ ---                                                                ┆        --- │
│ str                                                                ┆        f64 │
╞════════════════════════════════════════════════════════════════════╪════════════╡
│  Malignant neoplasms of lymphoid, hematopoietic and related tissue ┆ 506,320.65 │
│  Osteoarthritis                                                    ┆ 330,809.30 │
│  Encounters for other specific health care                         ┆ 311,173.55 │
│  Hemorrhagic and hematological disorders of newborn                ┆ 297,557.61 │
│  Acute kidney failure and chronic kidney disease                   ┆ 291,109.99 │
└────────────────────────────────────────────────────────────────────┴────────────┘


---

# Top Spending Cohorts - Procedures (Type of Service)



In [195]:
top_spending_procedures = compute_top_spending_cohorts(
    claims_df=get_cohort_procedures(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=6
).collect()

##### Top 5

In [196]:

print(top_spending_procedures.filter(pl.col("cohort_name") != "*"))

shape: (5, 2)
┌────────────────────────────────────────────────────┬──────────────┐
│ cohort_name                                        ┆        spend │
│ ---                                                ┆          --- │
│ str                                                ┆          f64 │
╞════════════════════════════════════════════════════╪══════════════╡
│ E&M (Office/Outpatient Services)                   ┆ 1,058,813.02 │
│ E&M (Behavioral Health Services)                   ┆   304,546.09 │
│ Other (Ambulance)                                  ┆   277,621.42 │
│ Treatment (Injections and Infusions (nononcolog... ┆   248,133.55 │
│ Anesthesia (Anesthesia)                            ┆   205,667.55 │
└────────────────────────────────────────────────────┴──────────────┘


---

# Top Spending Cohorts - Drug Classes

In [197]:
top_spending_drugs = compute_top_spending_cohorts(
    claims_df=get_cohort_drugs_usage(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=5
).collect()

##### Top 5

In [198]:

print(top_spending_drugs)

shape: (5, 2)
┌───────────────────────────────────────┬──────────────┐
│ cohort_name                           ┆        spend │
│ ---                                   ┆          --- │
│ str                                   ┆          f64 │
╞═══════════════════════════════════════╪══════════════╡
│ Immunosuppressants                    ┆ 2,312,603.13 │
│ Drugs Used In Diabetes                ┆   590,144.06 │
│ Other Nervous System Drugs            ┆   187,774.15 │
│ Drugs For Obstructive Airway Diseases ┆   121,098.38 │
│ Analgesics                            ┆    78,816.68 │
└───────────────────────────────────────┴──────────────┘


---

# Top Sepnding Cohorts - Providers (Medical Speciality)

In [199]:
top_spending_providers = compute_top_spending_cohorts(
    claims_df=get_cohort_medical_provider_speciality(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=6
).collect()

##### Top 5

In [200]:

print(top_spending_providers.filter(pl.col("cohort_name") != "None"))

shape: (5, 2)
┌─────────────────────────────────────────────┬──────────────┐
│ cohort_name                                 ┆        spend │
│ ---                                         ┆          --- │
│ str                                         ┆          f64 │
╞═════════════════════════════════════════════╪══════════════╡
│ Hospital Short Term  General and Specialty  ┆ 1,706,251.37 │
│ Hospital Psychiatric Unit                   ┆   931,914.47 │
│ Ambulance Service Provider                  ┆   531,101.09 │
│ Hospital Rehabilitation Unit                ┆   413,872.97 │
│ Physician Assistant                         ┆   314,240.19 │
└─────────────────────────────────────────────┴──────────────┘


In [201]:
# Concatenation of all 'top spending' dataframes

concatenated_top_spending = pl.concat([top_spending_diseases_icd_level_2, top_spending_procedures, top_spending_drugs, top_spending_providers])
concatenated_top_spending = concatenated_top_spending.filter(pl.col("cohort_name") != "*", pl.col("cohort_name") != "None")

# TOP SPENDING COHORTS - TOP 3 EXPENSES

In [202]:
# Call the function to get top 'n' rows from (concatenated dataframe of top spending analysis type)
top_spending_results = new_top_n_rows(concatenated_top_spending, n=3, analysis_type="top spending")
print(top_spending_results)

shape: (3, 2)
┌────────────────────────────────────────────┬──────────────┐
│ cohort_name                                ┆        value │
│ ---                                        ┆          --- │
│ str                                        ┆          f64 │
╞════════════════════════════════════════════╪══════════════╡
│ Immunosuppressants                         ┆ 2,312,603.13 │
│ Hospital Short Term  General and Specialty ┆ 1,706,251.37 │
│ E&M (Office/Outpatient Services)           ┆ 1,058,813.02 │
└────────────────────────────────────────────┴──────────────┘


In [203]:
# Save the results in mongoDB
insert_spending_analysis(df=top_spending_results, analysis_type = "top spending", collection=mongodb_collection_obj, eg_nid=eg_nid, reference_date=reference_date)

Successfully inserted document with ID: 6839b6da3c38cd7b533c7af8 for analysis type 'top spending' and eg_nid 'PS'.


ObjectId('6839b6da3c38cd7b533c7af8')

---

# Surge in Spending Cohorts - Diseases (ICD Level 2)



In [110]:
surge_in_spending_diseases_icd_level_2 = compute_outlier_cohorts(
    claims_df=get_cohort_diseases_icd_level_2(eg_nid=eg_nid), reference_date=reference_date,
    number_of_rows=10   
).collect()

##### Top 10

In [111]:
surge_in_spending_diseases_icd_level_2 = surge_in_spending_diseases_icd_level_2.sort("pct_increase", descending=True)
print(surge_in_spending_diseases_icd_level_2)

shape: (10, 4)
┌────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                                                                                                    ┆ avg_3m_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                                                                                                            ┆          --- ┆          --- ┆          --- │
│ str                                                                                                                            ┆          f64 ┆          f64 ┆          f64 │
╞════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│  Diseases of veins, lymphatic vessels and lymph nodes, not elsewhere classified                        

---

# Surge in Spending Cohorts - Procedures (Type of Service)



In [112]:
surge_in_spending_procedures = compute_outlier_cohorts(
    claims_df=get_cohort_procedures(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=11
).collect()

##### Top 10

In [113]:
surge_in_spending_procedures = surge_in_spending_procedures.filter(pl.col("cohort_name") != "*")
surge_in_spending_procedures = surge_in_spending_procedures.sort("pct_increase", descending=True)
print(surge_in_spending_procedures.head(10))

shape: (10, 4)
┌────────────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                        ┆ avg_3m_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                                ┆          --- ┆          --- ┆          --- │
│ str                                                ┆          f64 ┆          f64 ┆          f64 │
╞════════════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│ E&M (Behavioral Health Services)                   ┆    12,959.77 ┆    11,864.55 ┆         9.23 │
│ Other (Vision, Hearing, and Speech Services)       ┆     5,002.19 ┆     4,913.24 ┆         1.81 │
│ Procedure (Digestive/Gastrointestinal)             ┆     3,492.60 ┆     3,679.91 ┆        -5.09 │
│ Treatment (Physical, Occupational, and Speech T... ┆     5,497.41 ┆     5,898.43 ┆        -6.80 │
│ E&M (Ophthalmological Services)                    ┆     5,198.05 ┆     5,875.41 ┆ 

---

# Surge in Spending Cohorts - Drug Classes



In [114]:
surge_in_spending_drugs = compute_outlier_cohorts(
    claims_df=get_cohort_drugs_usage(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=10
).collect()

##### Top 10

In [115]:
surge_in_spending_drugs = surge_in_spending_drugs.filter(pl.col("cohort_name") != "*")
surge_in_spending_drugs = surge_in_spending_drugs.sort("pct_increase", descending=True)
print(surge_in_spending_drugs.head(10))

shape: (10, 4)
┌───────────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                       ┆ avg_3m_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                               ┆          --- ┆          --- ┆          --- │
│ str                                               ┆          f64 ┆          f64 ┆          f64 │
╞═══════════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│ Antivirals For Systemic Use                       ┆     1,461.05 ┆       419.00 ┆       248.70 │
│ Psycholeptics                                     ┆     5,570.42 ┆     1,955.02 ┆       184.93 │
│ Drugs For Obstructive Airway Diseases             ┆     5,542.43 ┆     3,697.98 ┆        49.88 │
│ Analgesics                                        ┆     3,274.94 ┆     2,314.52 ┆        41.50 │
│ Antithrombotic Agents                             ┆     2,758.91 ┆     2,076.99 ┆        32.

---

# Surge in Spending Cohorts - Providers (Medical Speciality)



In [116]:
surge_in_spending_providers_medical_speciality = compute_outlier_cohorts(
    claims_df=get_cohort_medical_provider_speciality(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=10
).collect()

##### Top 10

In [117]:
surge_in_spending_providers_medical_speciality = surge_in_spending_providers_medical_speciality.filter(pl.col("cohort_name") != "None")
surge_in_spending_providers_medical_speciality = surge_in_spending_providers_medical_speciality.filter(pl.col("cohort_name")!= "*")
surge_in_spending_providers_medical_speciality = surge_in_spending_providers_medical_speciality.sort("pct_increase", descending=True)
print(surge_in_spending_providers_medical_speciality.head(10))

shape: (9, 4)
┌─────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                 ┆ avg_3m_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                         ┆          --- ┆          --- ┆          --- │
│ str                                         ┆          f64 ┆          f64 ┆          f64 │
╞═════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│ Hospitals                                   ┆     5,250.49 ┆     1,568.93 ┆       234.65 │
│ Hospital Rehabilitation Unit                ┆    25,269.13 ┆    14,265.73 ┆        77.13 │
│ Ambulance Service Provider                  ┆    25,099.33 ┆    17,279.29 ┆        45.26 │
│ Optometry                                   ┆     8,537.59 ┆     8,820.38 ┆        -3.21 │
│ Nurse Practitioner                          ┆     6,930.87 ┆     9,343.63 ┆       -25.82 │
│ Hospital Short Term  General and Specialty  ┆    59,39

In [118]:
# Concatenation of all 'surge in spending' dataframes
concatenated_surge_in_spending = pl.concat([surge_in_spending_diseases_icd_level_2,surge_in_spending_procedures,surge_in_spending_drugs,surge_in_spending_providers_medical_speciality])
concatenated_surge_in_spending = concatenated_surge_in_spending.filter(pl.col("cohort_name") != "*", pl.col("cohort_name") != "None")


# SURGE IN SPENDING - TOP 3

In [119]:
# Call the function to get top 'n' rows from (concatenated dataframe of surge in spending analysis type)
new_top_n_rows_surge = new_top_n_rows(concatenated_surge_in_spending, n=3, analysis_type = "surge in spending")
print(new_top_n_rows_surge)

shape: (3, 2)
┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐
│ cohort_name                                                                                                                   ┆  value │
│ ---                                                                                                                           ┆    --- │
│ str                                                                                                                           ┆    f64 │
╞═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪════════╡
│ Diseases of veins, lymphatic vessels and lymph nodes, not elsewhere classified                                                ┆ 698.23 │
│ Other osteopathies                                                                                                            ┆ 570.46 │
│ Persons wit

In [None]:
# Save the results in mongoDB
# insert_spending_analysis(df=new_top_n_rows_surge, analysis_type = "surge in spending", collection=mongodb_collection_obj, eg_nid=eg_nid, reference_date=reference_date)

Successfully inserted document with ID: 6839b40d3c38cd7b533c7aed for analysis type 'surge in spending' and eg_nid 'PS'.


ObjectId('6839b40d3c38cd7b533c7aed')

---

# Emerging Spending Cohorts - Diseases (ICD Level 2)


In [24]:
emerging_spending_diseases_icd_level_2 = compute_emerging_cohorts(
    claims_df=get_cohort_diseases_icd_level_2(eg_nid=eg_nid), reference_date=reference_date,
    number_of_rows=10
).collect()

##### Top 10

In [25]:
emerging_spending_diseases_icd_level_2 = emerging_spending_diseases_icd_level_2.sort("pct_increase", descending=True)
emerging_spending_diseases_icd_level_2 = emerging_spending_diseases_icd_level_2.filter(pl.col("cohort_name") != "*")
emerging_spending_diseases_icd_level_2 = emerging_spending_diseases_icd_level_2.filter(pl.col("cohort_name") != "None")
print(emerging_spending_diseases_icd_level_2.head(10))

shape: (10, 4)
┌────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                                                                ┆ avg_1y_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                                                                        ┆          --- ┆          --- ┆          --- │
│ str                                                                                        ┆          f64 ┆          f64 ┆          f64 │
╞════════════════════════════════════════════════════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│  Edema, proteinuria and hypertensive disorders in pregnancy, childbirth and the puerperium ┆    10,722.29 ┆     3,664.82 ┆       192.57 │
│  Malignant neoplasms of lymphoid, hematopoietic and related tissue                         ┆    39,685.13 ┆    14,522.33 ┆       173.27 │
│  Is

---

# Emerging Spending Cohorts - Procedures (Type of Service)



In [26]:
emerging_spending_procedures = compute_emerging_cohorts(
    claims_df=get_cohort_procedures(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=10
).collect()

##### Top 10

In [27]:
emerging_spending_procedures = emerging_spending_procedures.filter(pl.col("cohort_name") != "*")
emerging_spending_procedures = emerging_spending_procedures.filter(pl.col("cohort_name") != "None")
emerging_spending_procedures = emerging_spending_procedures.sort("pct_increase", descending=True)
print(emerging_spending_procedures.head(10))

shape: (9, 4)
┌────────────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                        ┆ avg_1y_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                                ┆          --- ┆          --- ┆          --- │
│ str                                                ┆          f64 ┆          f64 ┆          f64 │
╞════════════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│ Treatment (Chemotherapy)                           ┆    15,565.31 ┆     5,737.62 ┆       171.29 │
│ E&M (Behavioral Health Services)                   ┆    12,623.85 ┆    11,864.55 ┆         6.40 │
│ Treatment (Physical, Occupational, and Speech T... ┆     6,248.87 ┆     5,898.43 ┆         5.94 │
│ Procedure (Other Organ Systems)                    ┆     6,292.20 ┆     6,032.22 ┆         4.31 │
│ E&M (Office/Outpatient Services)                   ┆    44,294.36 ┆    43,205.03 ┆  

---

# Emerging Spending Cohorts - Drug Classes



In [28]:
emerging_spending_drugs = compute_emerging_cohorts(
    claims_df=get_cohort_drugs_usage(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=10
).collect()

##### Top 10

In [29]:
emerging_spending_drugs = emerging_spending_drugs.filter(pl.col("cohort_name") != "*")
emerging_spending_drugs = emerging_spending_drugs.filter(pl.col("cohort_name") != "None")
emerging_spending_drugs = emerging_spending_drugs.sort("pct_increase", descending=True)
print(emerging_spending_drugs.head(10))

shape: (10, 4)
┌───────────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                       ┆ avg_1y_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                               ┆          --- ┆          --- ┆          --- │
│ str                                               ┆          f64 ┆          f64 ┆          f64 │
╞═══════════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│ Antivirals For Systemic Use                       ┆       980.90 ┆       419.00 ┆       134.10 │
│ Psycholeptics                                     ┆     3,693.63 ┆     1,955.02 ┆        88.93 │
│ Antithrombotic Agents                             ┆     3,571.85 ┆     2,076.99 ┆        71.97 │
│ Drugs For Obstructive Airway Diseases             ┆     5,689.00 ┆     3,699.08 ┆        53.80 │
│ Immunosuppressants                                ┆    97,502.14 ┆    72,594.27 ┆        34.

---

# Emerging Spending Cohorts - Providers (Medical Speciality)



In [30]:
emerging_spending_providers_medical_speciality = compute_emerging_cohorts(
    claims_df=get_cohort_medical_provider_speciality(eg_nid=eg_nid),
    reference_date=reference_date,
    number_of_rows=11
).collect()

##### Top 10

In [31]:
emerging_spending_providers_medical_speciality = emerging_spending_providers_medical_speciality.filter(pl.col("cohort_name") != "*")
emerging_spending_providers_medical_speciality = emerging_spending_providers_medical_speciality.filter(pl.col("cohort_name") != "None")
emerging_spending_providers_medical_speciality = emerging_spending_providers_medical_speciality.sort("pct_increase", descending=True)
print(emerging_spending_providers_medical_speciality.head(10))

shape: (10, 4)
┌─────────────────────────────────────────────┬──────────────┬──────────────┬──────────────┐
│ cohort_name                                 ┆ avg_1y_spend ┆ avg_3y_spend ┆ pct_increase │
│ ---                                         ┆          --- ┆          --- ┆          --- │
│ str                                         ┆          f64 ┆          f64 ┆          f64 │
╞═════════════════════════════════════════════╪══════════════╪══════════════╪══════════════╡
│ Pharmacy                                    ┆    18,566.90 ┆    11,508.85 ┆        61.33 │
│ Hospital Rehabilitation Unit                ┆    18,913.67 ┆    14,265.73 ┆        32.58 │
│ Hospital Psychiatric Unit                   ┆    37,336.91 ┆    28,212.85 ┆        32.34 │
│ Physician General Surgery                   ┆     9,574.85 ┆     7,241.05 ┆        32.23 │
│ Critical Access Hospital                    ┆    10,057.78 ┆     9,471.80 ┆         6.19 │
│ Ambulance Service Provider                  ┆    17,1

In [None]:
# Concatenation of all 'emerging spending' dataframes
concatenated_emerging_spending = pl.concat([emerging_spending_diseases_icd_level_2, emerging_spending_procedures, emerging_spending_drugs, emerging_spending_providers_medical_speciality])


# EMERGING SPENDING - TOP 3

In [33]:
# Call the function to get top 'n' rows from (concatenated dataframe of top spending analysis type)
new_top_n_rows_emerging = new_top_n_rows(concatenated_emerging_spending, n=3, analysis_type = "emerging spending")
print(new_top_n_rows_emerging)




shape: (3, 2)
┌───────────────────────────────────────────────────────────────────────────────────────────┬────────┐
│ cohort_name                                                                               ┆  value │
│ ---                                                                                       ┆    --- │
│ str                                                                                       ┆    f64 │
╞═══════════════════════════════════════════════════════════════════════════════════════════╪════════╡
│ Edema, proteinuria and hypertensive disorders in pregnancy, childbirth and the puerperium ┆ 192.57 │
│ Malignant neoplasms of lymphoid, hematopoietic and related tissue                         ┆ 173.27 │
│ Treatment (Chemotherapy)                                                                  ┆ 171.29 │
└───────────────────────────────────────────────────────────────────────────────────────────┴────────┘


In [None]:
# Save the results in mongoDB
# insert_spending_analysis(df=new_top_n_rows_emerging, analysis_type = "emerging spending", collection=mongodb_collection_obj, eg_nid=eg_nid, reference_date=reference_date)

Successfully inserted document with ID: 6839932f5e509880d3cc16b3 for analysis type 'emerging spending' and eg_nid 'PS'.


ObjectId('6839932f5e509880d3cc16b3')

---