### Load libraries

In [849]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import geopandas as gpd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error


### Read data

In [850]:
# Tabular inputs: the observed cost table and the random-forest case predictions.
cost = pd.read_csv("../data/processed/cost_observed.csv")
rf_preds = pd.read_csv("../data/processed/rf_preds.csv")

# Spatial inputs: `low` is the province-level layer, `high` the district-level layer.
low = gpd.read_file("../data/processed/jkt_prov.shp")
high = gpd.read_file("../data/processed/jkt_dist.shp")

In [851]:
# Count the distinct predicted values among rows of rf_preds that contain no missing values.
rf_preds.dropna().loc[:, "Predicted_Cases"].nunique()

6

### Process data

In [852]:
# Benchmark 1 (equal split) divides the provincial case total evenly among the districts.
# Start by keeping only the DKI Jakarta rows of the province-level layer (`low`).
df_jakarta = low.loc[low['Province'].eq('DKI Jakarta')]

df_jakarta.head(5)

Unnamed: 0,Province,Year,Cases,Population,HDI,Area_sq_km,Pop_den,urbanicity,geometry
0,DKI Jakarta,2020,1217,801232,0.82088,653.83,1.225444,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."
1,DKI Jakarta,2021,1048,796729,0.8234,653.83,1.218557,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."
2,DKI Jakarta,2022,2113,792228,0.82818,650.5,1.217875,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."
3,DKI Jakarta,2023,1562,787726,0.83372,650.25,1.21142,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."


In [853]:
# Total DKI Jakarta dengue cases per year, as a one-column ('Cases') frame indexed by Year.
df_jakarta_yearly_total_cases = df_jakarta.groupby('Year')['Cases'].sum().to_frame()

In [854]:
# Preview the yearly provincial totals.
df_jakarta_yearly_total_cases.head(5)

Unnamed: 0_level_0,Cases
Year,Unnamed: 1_level_1
2020,1217
2021,1048
2022,2113
2023,1562


### Disaggregation with equal proportions and cost calculations

In [855]:
# Number of distinct districts in DKI Jakarta according to the district-level layer (`high`).
# Despite its name, `df_jakarta_districts` is an integer count, not a DataFrame.
df_jakarta_districts = high.loc[high['Province'] == 'DKI Jakarta', 'District'].nunique(dropna=False)

In [856]:
# Show the number of distinct DKI Jakarta districts used as the divisor for the equal split.
print(df_jakarta_districts)

5


In [857]:
# Equal-split benchmark: each district gets (provincial total for the year) / (number of districts).
# The 'Cases' cell is selected explicitly so that int() receives a scalar; calling int() on a
# one-element Series is deprecated in pandas and is slated to raise a TypeError.
# int() truncates toward zero, so the district estimates can sum to slightly less than the
# provincial total.
year_2020_jkt = int(df_jakarta_yearly_total_cases.loc[2020, 'Cases'] / df_jakarta_districts)
year_2021_jkt = int(df_jakarta_yearly_total_cases.loc[2021, 'Cases'] / df_jakarta_districts)
year_2022_jkt = int(df_jakarta_yearly_total_cases.loc[2022, 'Cases'] / df_jakarta_districts)
year_2023_jkt = int(df_jakarta_yearly_total_cases.loc[2023, 'Cases'] / df_jakarta_districts)

  year_2020_jkt = int(df_jakarta_yearly_total_cases.loc[2020] / df_jakarta_districts)
  year_2021_jkt = int(df_jakarta_yearly_total_cases.loc[2021] / df_jakarta_districts)
  year_2022_jkt = int(df_jakarta_yearly_total_cases.loc[2022] / df_jakarta_districts)
  year_2023_jkt = int(df_jakarta_yearly_total_cases.loc[2023] / df_jakarta_districts)


In [858]:
# Write the equal-split estimate into a new `cases_bm_equal` column of the district-level frame,
# touching only the DKI Jakarta rows of the matching year.
for yr, per_district in (
    (2020, year_2020_jkt),
    (2021, year_2021_jkt),
    (2022, year_2022_jkt),
    (2023, year_2023_jkt),
):
    high.loc[(high['Province'] == 'DKI Jakarta') & (high['Year'] == yr), 'cases_bm_equal'] = per_district

In [859]:
# Preview the district-level frame with the new cases_bm_equal column.
high.head(5)

Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,cases_bm_equal
0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",243.0
1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",209.0
2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",422.0
3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",312.0
4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,"POLYGON ((106.83247 -6.14138, 106.83282 -6.141...",243.0


In [860]:
# Bring the equal-split case estimates into the cost table, matching on province, district and year.
cost = pd.merge(
    cost,
    high[["Province", "District", "Year", "cases_bm_equal"]],
    how="left",
    on=["Province", "District", "Year"],
)

In [861]:
# Number of cost rows with no missing values after the merge.
cost.dropna().shape[0]

20

In [862]:
# Prevalence implied by the equal-split benchmark: estimated cases divided by district population.
cost['equal_bm_prevalence'] = cost['cases_bm_equal'].div(cost['Population'])

# Scale each row's prevalence by the mean over all rows, so a factor of 1 means average prevalence.
# `mean_obs_prev` and `prev_factor` are overwritten by each later method's section.
mean_obs_prev = cost['equal_bm_prevalence'].mean()
cost['prev_factor'] = cost['equal_bm_prevalence'].div(mean_obs_prev)


In [863]:
# Incremental cost (vaccine minus non-vaccine) under the equal-split benchmark, scaled in turn by
# the prevalence, urbanicity and population-density factors.
cost["equal_bm_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"]
    .sub(cost["healthcare_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

cost["equal_bm_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"]
    .sub(cost["payer_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

### Population-proportion disaggregation and cost calculations

In [864]:
# Total DKI Jakarta population per year, as a one-column ('Population') frame indexed by Year.
df_jakarta_yearly_total_pop = df_jakarta.groupby('Year')['Population'].sum().to_frame()
df_jakarta_yearly_total_pop.head(5)

Unnamed: 0_level_0,Population
Year,Unnamed: 1_level_1
2020,801232
2021,796729
2022,792228
2023,787726


In [865]:
# Combine the yearly case and population totals, tag them with the province name (needed as a
# merge key later) and suffix the two totals with _Prov so they cannot clash with the
# district-level 'Cases' and 'Population' columns.
df_jakarta_final = (
    df_jakarta_yearly_total_cases
    .merge(df_jakarta_yearly_total_pop, on='Year')
    .assign(Province='DKI Jakarta')
    .rename(columns={'Cases': 'Cases_Prov', 'Population': 'Population_Prov'})
)

In [866]:
# Move 'Year' out of the index into an ordinary column so it can be used as a merge key.
df_jakarta_final = df_jakarta_final.reset_index(drop=False)

# Preview the result.
df_jakarta_final.head(5)

Unnamed: 0,Year,Cases_Prov,Population_Prov,Province
0,2020,1217,801232,DKI Jakarta
1,2021,1048,796729,DKI Jakarta
2,2022,2113,792228,DKI Jakarta
3,2023,1562,787726,DKI Jakarta


In [867]:
# Attach the provincial totals (Population_Prov, Cases_Prov) to every district row of the same
# year and province; the next step uses them for
# cases_prop_bm = Population / Population_Prov * Cases_Prov.
province_totals = df_jakarta_final[['Year', 'Province', 'Population_Prov', 'Cases_Prov']]
high = pd.merge(high, province_totals, how='left', on=['Year', 'Province'])


In [868]:
# Preview the district-level frame with the provincial totals attached.
high.head(5)

Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,cases_bm_equal,Population_Prov,Cases_Prov
0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",243.0,801232,1217
1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",209.0,796729,1048
2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",422.0,792228,2113
3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",312.0,787726,1562
4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,"POLYGON ((106.83247 -6.14138, 106.83282 -6.141...",243.0,801232,1217


In [869]:
# Population-proportional benchmark: the district's share of the provincial population
# multiplied by the provincial case total.
high['cases_prop_bm'] = high['Population'].div(high['Population_Prov']).mul(high['Cases_Prov'])

In [870]:
# Bring the population-proportional case estimates into the cost table.
cost = cost.merge(
    high[['Province', 'District', 'Year', 'cases_prop_bm']],
    how='left',
    on=['Province', 'District', 'Year'],
)

In [871]:
# Number of cost rows with no missing values after the merge.
cost.dropna().shape[0]

20

In [872]:
# Prevalence implied by the population-proportional benchmark. Because cases_prop_bm is
# Population / Population_Prov * Cases_Prov, dividing by Population gives Cases_Prov / Population_Prov.
cost['prop_bm_prevalence'] = cost['cases_prop_bm'].div(cost['Population'])

# Rescale by the mean over all rows; this overwrites the prev_factor of the previous method.
mean_obs_prev = cost['prop_bm_prevalence'].mean()
cost['prev_factor'] = cost['prop_bm_prevalence'].div(mean_obs_prev)

# Incremental cost (vaccine minus non-vaccine), scaled by the prevalence, urbanicity and
# population-density factors. These two columns carry no method prefix; later cells refer to
# them as the 'BM Proportional' costs.
cost["incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"]
    .sub(cost["healthcare_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

cost["incremental_payer_cost"] = (
    cost["payer_cost_vaccine"]
    .sub(cost["payer_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

In [873]:
# Preview the cost table with both benchmark methods' columns.
cost.head(5)

Unnamed: 0.1,Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,...,obs_incremental_healthcare_cost,obs_incremental_payer_cost,cases_bm_equal,equal_bm_prevalence,equal_bm_incremental_healthcare_cost,equal_bm_incremental_payer_cost,cases_prop_bm,prop_bm_prevalence,incremental_healthcare_cost,incremental_payer_cost
0,0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,...,141.14232,141.124821,243.0,0.001315,183.574348,183.551589,280.734222,0.001519,246.085096,246.054587
1,1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,...,120.979131,120.964133,209.0,0.00114,157.889048,157.869474,241.069372,0.001315,211.31581,211.289612
2,2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,...,366.547831,366.502388,422.0,0.002322,317.371673,317.332327,484.663247,0.002667,422.941237,422.888803
3,3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,...,313.311122,313.272279,312.0,0.001732,234.64446,234.61537,357.241424,0.001983,311.746621,311.707973
4,4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,...,276.060659,276.026434,243.0,0.003322,455.178164,455.121733,111.093142,0.001519,241.460796,241.43086


### Random-forest-based (hdi and pop den) disaggregation and cost calculations

In [874]:
# Attach the random-forest predictions to the district-level frame.
# The prediction column is renamed to `rf_preds` first (same name as the frame it comes from).
rf_preds = rf_preds.rename(columns={'Predicted_Cases': 'rf_preds'})

# NOTE(review): the join keys are the float-valued HDI and urbanicity columns, so a row only
# matches on exact float equality — confirm this is intended, or join on District/Year if
# rf_preds carries those columns.
rf_join_keys = ["HDI", "urbanicity"]
high = pd.merge(high, rf_preds[rf_join_keys + ["rf_preds"]], how='left', on=rf_join_keys)

In [875]:
# Number of distinct values in the prediction column (a missing value counts as one value).
rf_preds['rf_preds'].nunique(dropna=False)

6

In [876]:
# Count the district-year rows that received a random-forest prediction from the merge.
int(high["rf_preds"].notna().sum())

6

In [877]:
# Preview the district-level frame with the rf_preds column.
high.head(5)

Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,cases_bm_equal,Population_Prov,Cases_Prov,cases_prop_bm,rf_preds
0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",243.0,801232,1217,280.734222,183.108
1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",209.0,796729,1048,241.069372,
2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",422.0,792228,2113,484.663247,
3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",312.0,787726,1562,357.241424,
4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,"POLYGON ((106.83247 -6.14138, 106.83282 -6.141...",243.0,801232,1217,111.093142,


In [878]:
# Bring the random-forest predictions into the cost table, matching on year and district.
cost = pd.merge(
    cost,
    high[["Year", "District", "rf_preds"]],
    how="left",
    on=["Year", "District"],
)

In [879]:
# Missing-value count per column of the cost table.
cost.isnull().sum()

Unnamed: 0                               0
District                                 0
Year                                     0
Area_sq_km                               0
HDI                                      0
Province                                 0
Cases                                    0
Population                               0
Pop_den                                  0
urbanicity                               0
geometry                                 0
le_factor                                0
burden_non_vaccine                       0
burden_vaccine                           0
healthcare_cost_non_vaccine              0
healthcare_cost_vaccine                  0
payer_cost_non_vaccine                   0
payer_cost_vaccine                       0
mean_urbanicity                          0
mean_pop_den                             0
urbanicity_factor                        0
pop_den_factor                           0
obs_prevalence                           0
prev_factor

In [880]:
# Prevalence implied by the random-forest predictions: predicted cases divided by population.
# Rows without an RF prediction stay NaN here and in every column derived below.
cost['rf_prevalence'] = cost['rf_preds'].div(cost["Population"])

# Rescale by the mean over rows that have a prediction (Series.mean skips NaN); this overwrites
# the prev_factor of the previous method.
mean_obs_prev = cost["rf_prevalence"].mean()
cost["prev_factor"] = cost["rf_prevalence"].div(mean_obs_prev)

# Incremental cost (vaccine minus non-vaccine), scaled by the prevalence, urbanicity and
# population-density factors.
cost["rf_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"]
    .sub(cost["healthcare_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

cost["rf_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"]
    .sub(cost["payer_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

In [881]:
# Row count of the cost table; the merges above should not have added or removed rows.
print(len(cost))


20


### aggGP-based disaggregation and cost calculations

In [882]:
# Load the aggGP prediction files, one per year (2020-2023).
gp_2020, gp_2021, gp_2022, gp_2023 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/df_hi_jkt_w_pred_2020.csv",
        "../data/processed/df_hi_jkt_w_pred_2021.csv",
        "../data/processed/df_hi_jkt_w_pred_2022.csv",
        "../data/processed/df_hi_jkt_w_pred_2023.csv",
    )
)

In [883]:
# Preview the 2020 aggGP predictions.
gp_2020.head(5)

Unnamed: 0.1,Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,incidence,pred_cases
0,0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,POLYGON ((106.70503370000006 -6.09559559999996...,0.000871,0.001673
1,4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,POLYGON ((106.83246630000008 -6.14137699999997...,0.001736,0.002774
2,8,KOTA JAKARTA SELATAN,2020,154.32,0.8472,DKI Jakarta,288,160968,1.043079,0.972088,POLYGON ((106.83399450000007 -6.20492189999993...,0.001789,0.002233
3,12,KOTA JAKARTA TIMUR,2020,182.7,0.8266,DKI Jakarta,396,238497,1.305402,0.930291,POLYGON ((106.94010940000004 -6.15492309999996...,0.00166,0.001884
4,16,KOTA JAKARTA UTARA,2020,139.99,0.8029,DKI Jakarta,245,143801,1.027223,0.850775,MULTIPOLYGON (((106.87676710000005 -6.10030039...,0.001704,0.002771


In [884]:
def clean_df(df):
    """Return `df` without its "Unnamed: 0" column (if present) and with a fresh 0..n-1 index."""
    without_saved_index = df.drop(columns=["Unnamed: 0"], errors="ignore")
    return without_saved_index.reset_index(drop=True)

# Clean each yearly frame.
gp_2020, gp_2021, gp_2022, gp_2023 = map(clean_df, (gp_2020, gp_2021, gp_2022, gp_2023))

# Stack the four years into one frame with a continuous index.
gp_all_years = pd.concat([gp_2020, gp_2021, gp_2022, gp_2023], axis=0, ignore_index=True)


In [885]:
# The files' `pred_cases` column is used as a prevalence here, so rename it accordingly ...
gp_all_years = gp_all_years.rename(columns={"pred_cases": "gp_prevalence"})

# ... and convert it to a predicted case count by multiplying with the district population.
gp_all_years["gp_preds"] = gp_all_years["gp_prevalence"] * gp_all_years["Population"]

# Keep only the merge keys and the two GP columns.
gp_subset = gp_all_years[["District", "Year", "gp_prevalence", "gp_preds"]]

# Attach them to the cost table by district and year.
cost = cost.merge(gp_subset, how="left", on=["District", "Year"])


In [886]:
# Rescale the aggGP prevalence by its mean over all rows; this overwrites the prev_factor of the
# previous method.
mean_obs_prev = cost["gp_prevalence"].mean()
cost["prev_factor"] = cost["gp_prevalence"].div(mean_obs_prev)

# Incremental cost (vaccine minus non-vaccine), scaled by the prevalence, urbanicity and
# population-density factors.
cost["gp_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"]
    .sub(cost["healthcare_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

cost["gp_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"]
    .sub(cost["payer_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

### aggVAE-based disaggregation and cost calculations

In [887]:
# Load the aggVAE prediction files, one per year (2020-2023).
vae_2020, vae_2021, vae_2022, vae_2023 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/df_hi_jkt_2020_aggVAE_preds.csv",
        "../data/processed/df_hi_jkt_2021_aggVAE_preds.csv",
        "../data/processed/df_hi_jkt_2022_aggVAE_preds.csv",
        "../data/processed/df_hi_jkt_2023_aggVAE_preds.csv",
    )
)

In [888]:
# NOTE(review): this helper repeats the clean_df defined in the aggGP section; the later
# definition shadows the earlier one. Consider keeping a single definition.
def clean_df(df):
    """Return `df` without its "Unnamed: 0" column (if present) and with a fresh 0..n-1 index."""
    without_saved_index = df.drop(columns=["Unnamed: 0"], errors="ignore")
    return without_saved_index.reset_index(drop=True)

# Clean each yearly frame.
vae_2020, vae_2021, vae_2022, vae_2023 = map(clean_df, (vae_2020, vae_2021, vae_2022, vae_2023))

# Stack the four years into one frame with a continuous index.
vae_all_years = pd.concat([vae_2020, vae_2021, vae_2022, vae_2023], axis=0, ignore_index=True)


In [889]:
# The files' `pred_cases` column is used as a prevalence here, so rename it accordingly ...
vae_all_years = vae_all_years.rename(columns={"pred_cases": "vae_prevalence"})

# ... and convert it to a predicted case count by multiplying with the district population.
vae_all_years["vae_preds"] = vae_all_years["vae_prevalence"] * vae_all_years["Population"]

# Keep only the merge keys and the two aggVAE columns.
vae_subset = vae_all_years[["District", "Year", "vae_prevalence", "vae_preds"]]

# Attach them to the cost table by district and year.
cost = cost.merge(vae_subset, how="left", on=["District", "Year"])

In [890]:
# Rescale the aggVAE prevalence by its mean over all rows; this overwrites the prev_factor of the
# previous method.
mean_obs_prev = cost["vae_prevalence"].mean()
cost["prev_factor"] = cost["vae_prevalence"].div(mean_obs_prev)

# Incremental cost (vaccine minus non-vaccine), scaled by the prevalence, urbanicity and
# population-density factors.
cost["vae_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"]
    .sub(cost["healthcare_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

cost["vae_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"]
    .sub(cost["payer_cost_non_vaccine"])
    .mul(cost["prev_factor"])
    .mul(cost["urbanicity_factor"])
    .mul(cost["pop_den_factor"])
)

In [891]:
# Replace the current index with a fresh 0..n-1 RangeIndex, discarding the old index values.
cost = cost.reset_index(drop=True)


In [892]:
# Drop the leftover saved-index column by name rather than by position. Dropping
# `cost.columns[0]` removed a different (real) column every time the cell was re-run;
# errors="ignore" makes a re-run a no-op once the column is gone.
cost = cost.drop(columns=["Unnamed: 0"], errors="ignore")


In [893]:
# Sanity check after the merges and the index reset: the row count and index of the cost table.
print(len(cost))          # Should be 20
print(cost.shape[0])      # Also 20
print(cost.index)         # Will show RangeIndex(start=0, stop=20, step=1)


20
20
RangeIndex(start=0, stop=20, step=1)


In [894]:
# Missing-value count for every column whose name contains "rf".
cost.filter(like="rf").isnull().sum()

rf_preds                          14
rf_prevalence                     14
rf_incremental_healthcare_cost    14
rf_incremental_payer_cost         14
dtype: int64

In [895]:
# Observed incremental healthcare cost per district-year, as a plain list.
print(cost["obs_incremental_healthcare_cost"].tolist())


[141.14231950806925, 120.9791310069165, 366.5478310869733, 313.3111222862463, 276.06065872985715, 117.38012260954554, 474.012328603644, 363.88825226138346, 207.90667211272472, 202.85338494331825, 410.4413938524525, 229.0478190412564, 231.08288330736423, 226.99808486506237, 321.2072911142632, 278.1115007143171, 170.63826141352365, 129.54578213434866, 265.8025666328673, 178.1649679386273]


In [896]:
# aggGP-based incremental healthcare cost per district-year, as a plain list.
print(cost["gp_incremental_healthcare_cost"].tolist())

[222.9350696212487, 216.5700444259876, 228.5470232545651, 222.96527752516732, 362.6934773920263, 311.67093654155764, 416.80267614053645, 427.4571449325198, 213.4444384987851, 212.25711463462196, 233.3962040616091, 222.4957959091688, 215.68181607201967, 218.39611798453157, 214.18672162572585, 206.3105959606751, 228.20606243901977, 228.8663652472419, 217.82155415761343, 212.89039015862713]


In [897]:
# define concordance correlation coefficient taken from: https://github.com/stylianos-kampakis/supervisedPCA-Python/blob/master/Untitled.py
# mentioned in the github post: https://rowannicholls.github.io/python/statistics/agreement/concordance_correlation_coefficient.html; referencing these papers:Giavarina D. Understanding Bland Altman analysis. Biochemia Medica. 2015;25(2):141–51. DOI: 10.11613/BM.2015.015.
# Lin LI-K. A Concordance Correlation Coefficient to Evaluate Reproducibility. Biometrics. 1989;45(1):255-268. DOI: 10.2307/2532051. PMID: 2720055.

def concordance_correlation_coefficient(y_true, y_pred,
                                        sample_weight=None,
                                        multioutput='uniform_average'):
    """Lin's concordance correlation coefficient (CCC) between two series.

    The CCC measures agreement: how closely the (y_true, y_pred) pairs follow
    the 45-degree identity line. It is computed as

        2 * r * sd_true * sd_pred / (var_true + var_pred + (mean_true - mean_pred)**2)

    where r is the Pearson correlation and the variances and standard
    deviations are the population (ddof=0) versions.

    Reference: Lin, L. I-K. "A concordance correlation coefficient to evaluate
    reproducibility." Biometrics (1989): 255-268.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,)
        Estimated target values.

    sample_weight : ignored
        Accepted for signature compatibility only; the body does not use it.

    multioutput : ignored
        Accepted for signature compatibility only; the body does not use it.

    Returns
    -------
    float
        Value in [-1, 1]; 1 indicates perfect agreement between the true and
        the predicted values.

    Examples
    --------
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> concordance_correlation_coefficient(y_true, y_pred)
    0.97678916827853024
    """
    # Pearson correlation is the off-diagonal entry of the 2x2 correlation matrix.
    pearson_r = np.corrcoef(y_true, y_pred)[0, 1]

    mu_true, mu_pred = np.mean(y_true), np.mean(y_pred)

    # r * sd_true * sd_pred equals the covariance, so this term is 2 * cov(y_true, y_pred).
    agreement_term = 2 * pearson_r * np.std(y_true) * np.std(y_pred)

    # The squared mean difference penalises a systematic shift away from the identity line.
    total_spread = np.var(y_true) + np.var(y_pred) + (mu_true - mu_pred) ** 2

    return agreement_term / total_spread

## Evaluation with RF included

### Evaluate the raw case counts

In [898]:
# Restrict the evaluation to the district-years that have a random-forest prediction, so all
# models are scored on the same rows.
cost = cost[cost["rf_preds"].notna()]

In [899]:
# Define model predictions for dengue case counts
model_preds = {
    'BM Equal': 'cases_bm_equal',
    'BM Proportional': 'cases_prop_bm',
    'Random Forest': 'rf_preds',
    'GP': 'gp_preds',
    'aggVAE': 'vae_preds'
}

# Store results
results = []

# Loop over each district
for district, group in cost.groupby('District'):
    avg_cases = group['Cases'].mean()

    for model_name, pred_col in model_preds.items():
        y_true = group['Cases']
        y_pred = group[pred_col]

        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mape = np.mean(np.abs((y_true - y_pred) / y_true.replace(0, np.nan))) * 100  # Handle divide-by-zero
        mae_pct_avg = (mae / avg_cases) * 100 if avg_cases != 0 else np.nan
        
        # Calculate bias (MFE)
        bias = np.mean(y_pred - y_true)

        results.append({
            'District': district,
            'Model': model_name,
            'RMSE': rmse,
            'MAE (% of avg cases)': mae_pct_avg,
            'Bias': bias
        })

# Convert to DataFrame
metrics_df = pd.DataFrame(results)

# Optional: view overall average for each model
summary = metrics_df.groupby('Model')[['RMSE', 'MAE (% of avg cases)', 'Bias']].mean().reset_index()

print(summary)

             Model        RMSE  MAE (% of avg cases)        Bias
0         BM Equal   93.303968             53.311624    2.500000
1  BM Proportional   59.383942             28.661034   -1.697782
2               GP  107.192547             57.303251  106.387488
3    Random Forest   45.759249             34.678141   44.449900
4           aggVAE   55.972948             28.050106   22.875448


### Evaluate disease prevalence using the same metrics, plus CCC as a rank- and accuracy-preservation metric

In [900]:
# Define prevalence prediction columns
prevalence_preds = {
    'BM Equal': 'equal_bm_prevalence',
    'BM Proportional': 'prop_bm_prevalence',
    'Random Forest': 'rf_prevalence',
    'GP': 'gp_prevalence',
    'aggVAE': 'vae_prevalence'
}

# Store prevalence evaluation results
prevalence_results = []

# Loop over each district
for district, group in cost.groupby('District'):
    avg_obs_prev = group['obs_prevalence'].mean()

    for model_name, pred_col in prevalence_preds.items():
        y_true = group['obs_prevalence']
        y_pred = group[pred_col]

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        # CCC (Concordance Correlation Coefficient)
        if y_true.nunique() > 1 and y_pred.nunique() > 1:
            ccc_value = concordance_correlation_coefficient(y_true, y_pred)
        else:
            ccc_value = np.nan  # not computable if constant series
        
        # Calculate bias (MFE)
        bias = np.mean(y_pred - y_true)

        #calculate mae as a percentage of average prevalence per district
        mae_pct_avg = (mae / avg_cases) * 100 if avg_cases != 0 else np.nan

        prevalence_results.append({
            'District': district,
            'Model': model_name,
            'RMSE': rmse,
            'MAE (%)': mae_pct_avg,
            'CCC': ccc_value,
            "Bias": bias
        })

# Convert to DataFrame
prevalence_metrics_df = pd.DataFrame(prevalence_results)

# Optional: view model-level average
prevalence_summary = prevalence_metrics_df.groupby('Model')[['RMSE', 'MAE (%)', 'CCC', "Bias"]].mean().reset_index()

print(prevalence_summary)

             Model      RMSE   MAE (%)       CCC      Bias
0         BM Equal  0.000727  0.000297  0.165273  0.000306
1  BM Proportional  0.000368  0.000143  0.720643  0.000025
2               GP  0.000781  0.000314  0.175971  0.000770
3    Random Forest  0.000492  0.000197  0.260160  0.000479
4           aggVAE  0.000394  0.000158  0.341941  0.000189


### Evaluate the incremental cost (healthcare + payer's perspectives separately)

In [902]:
# Define incremental cost prediction columns (for healthcare and payer)
cost_preds = {
    'BM Equal': 'equal_bm_incremental_healthcare_cost',
    'BM Proportional': 'incremental_healthcare_cost',
    'Random Forest': 'rf_incremental_healthcare_cost',
    'GP': 'gp_incremental_healthcare_cost',
    'aggVAE': 'vae_incremental_healthcare_cost'
}

payer_preds = {
    'BM Equal': 'equal_bm_incremental_payer_cost',
    'BM Proportional': 'incremental_payer_cost',
    'Random Forest': 'rf_incremental_payer_cost',
    'GP': 'gp_incremental_payer_cost',
    'aggVAE': 'vae_incremental_payer_cost'
}

# Store incremental cost evaluation results
cost_results = []
payer_results = []

# Loop over each district for healthcare cost
for district, group in cost.groupby('District'):
    avg_healthcare_cost = group['healthcare_cost_vaccine'].mean()  # Or non-vaccine, depending on focus

    for model_name, pred_col in cost_preds.items():
        y_true = group['healthcare_cost_vaccine']
        y_pred = group[pred_col]

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        # Calculate bias (MFE)
        bias = np.mean(y_pred - y_true)

        #calculate mae as a percentage of average prevalence per district
        mae_pct_avg = (mae / avg_healthcare_cost) * 100 if avg_healthcare_cost != 0 else np.nan

        cost_results.append({
            'District': district,
            'Model': model_name,
            'RMSE': rmse,
            'MAE (%)': mae_pct_avg,
            "Bias": bias
        })

# Loop over each district for payer cost
for district, group in cost.groupby('District'):
    avg_payer_cost = group['payer_cost_vaccine'].mean()  # Or non-vaccine, depending on focus

    for model_name, pred_col in payer_preds.items():
        y_true = group['payer_cost_vaccine']
        y_pred = group[pred_col]

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        
        # Calculate bias (MFE)
        bias = np.mean(y_pred - y_true)

        #calculate mae as a percentage of average prevalence per district
        mae_pct_avg = (mae / avg_healthcare_cost) * 100 if avg_healthcare_cost != 0 else np.nan


        payer_results.append({
            'District': district,
            'Model': model_name,
            'RMSE': rmse,
            'MAE (%)': mae_pct_avg,
            'Bias': bias
        })

# Convert to DataFrame
cost_metrics_df = pd.DataFrame(cost_results)
payer_metrics_df = pd.DataFrame(payer_results)

# Optional: view overall average for each model (healthcare cost)
cost_summary = cost_metrics_df.groupby('Model')[['RMSE', "MAE (%)", "Bias"]].mean().reset_index()
payer_summary = payer_metrics_df.groupby('Model')[['RMSE', "MAE (%)", "Bias"]].mean().reset_index()

print(cost_summary)
print(payer_summary)

             Model        RMSE    MAE (%)       Bias
0         BM Equal  152.611121  54.899511 -46.423766
1  BM Proportional   76.662557  28.609650 -59.600167
2               GP   60.605707  21.552565 -16.565964
3    Random Forest  134.225593  45.490384 -31.722836
4           aggVAE   65.935626  24.355213 -62.029754
             Model        RMSE    MAE (%)       Bias
0         BM Equal  149.166488  53.653327 -40.901228
1  BM Proportional   72.486075  26.953599 -54.075996
2               GP   57.151459  20.310515 -11.047128
3    Random Forest  130.659159  44.245366 -26.202121
4           aggVAE   61.058678  22.699690 -56.505281
