### Description: This script is used to implement section 5.3 and produces values for tables 12 - 15.

### Load libraries

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import geopandas as gpd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error


### Read data

In [2]:
cost = pd.read_csv('../data/processed/cost_observed.csv')
low = gpd.read_file('../data/processed/jkt_prov.shp')
high = gpd.read_file('../data/processed/jkt_dist.shp')
rf_preds = pd.read_csv('../data/processed/rf_preds.csv')

In [3]:
rf_preds.dropna()["Predicted_Cases"].nunique()

6

### Process data

In [4]:
#first method of disaggregation, divide the cases evenly among all the districts
#firstly, extract the total number of cases in DKI Jakarta, so high['Cases'] when high['Province'] == 'DKI Jakarta', for every year
df_jakarta = low[low['Province'] == 'DKI Jakarta']

df_jakarta.head()

Unnamed: 0,Province,Year,Cases,Population,HDI,Area_sq_km,Pop_den,urbanicity,geometry
0,DKI Jakarta,2020,1217,801232,0.82088,653.83,1.225444,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."
1,DKI Jakarta,2021,1048,796729,0.8234,653.83,1.218557,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."
2,DKI Jakarta,2022,2113,792228,0.82818,650.5,1.217875,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."
3,DKI Jakarta,2023,1562,787726,0.83372,650.25,1.21142,0.931345,"MULTIPOLYGON (((106.84094 -6.34198, 106.8407 -..."


In [5]:
#create a dataframe containing total number of cases in dengue in each year from dki jakarta n west java df
df_jakarta_yearly_total_cases = pd.DataFrame(df_jakarta.groupby('Year')['Cases'].sum())

In [6]:
#check head
df_jakarta_yearly_total_cases.head()

Unnamed: 0_level_0,Cases
Year,Unnamed: 1_level_1
2020,1217
2021,1048
2022,2113
2023,1562


### Disaggregation with equal proportions and cost calculations

In [7]:
#from this, extract the total number of districts in DKI Jakarta from high df, so count the number of unique values of district when province column = dki jakarta
df_jakarta_districts = len(pd.DataFrame(high[high['Province'] == 'DKI Jakarta']['District'].unique()))

In [8]:
print(df_jakarta_districts)

5


In [9]:
#list out all the district - level dengue cases estimates each year
year_2020_jkt = int(df_jakarta_yearly_total_cases.loc[2020] / df_jakarta_districts)
year_2021_jkt = int(df_jakarta_yearly_total_cases.loc[2021] / df_jakarta_districts)
year_2022_jkt = int(df_jakarta_yearly_total_cases.loc[2022] / df_jakarta_districts)
year_2023_jkt = int(df_jakarta_yearly_total_cases.loc[2023] / df_jakarta_districts)

  year_2020_jkt = int(df_jakarta_yearly_total_cases.loc[2020] / df_jakarta_districts)
  year_2021_jkt = int(df_jakarta_yearly_total_cases.loc[2021] / df_jakarta_districts)
  year_2022_jkt = int(df_jakarta_yearly_total_cases.loc[2022] / df_jakarta_districts)
  year_2023_jkt = int(df_jakarta_yearly_total_cases.loc[2023] / df_jakarta_districts)


In [10]:
#append the disaggregated counts to the high_res file ina  new column called cases_bm_equal, only if the Province column is dki jakarta
high.loc[(high['Province'] == 'DKI Jakarta') & (high['Year'] == 2020), 'cases_bm_equal'] = year_2020_jkt
high.loc[(high['Province'] == 'DKI Jakarta') & (high['Year'] == 2021), 'cases_bm_equal'] = year_2021_jkt
high.loc[(high['Province'] == 'DKI Jakarta') & (high['Year'] == 2022), 'cases_bm_equal'] = year_2022_jkt
high.loc[(high['Province'] == 'DKI Jakarta') & (high['Year'] == 2023), 'cases_bm_equal'] = year_2023_jkt

In [11]:
high.head()

Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,cases_bm_equal
0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",243.0
1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",209.0
2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",422.0
3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",312.0
4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,"POLYGON ((106.83247 -6.14138, 106.83282 -6.141...",243.0


In [12]:
#append the cases_bm_equal column to NBM table
cost = cost.merge(high[["Province", "District", "Year", "cases_bm_equal"]], on = ["Province", "District", "Year"], how = "left")

In [13]:
len(cost.dropna())

20

In [14]:
#calculate incremental cost for disaggregation with equal proportions
#aggregate again, calculate sum_cases based on cases_bm_equal predictions

cost['equal_bm_prevalence'] = cost['cases_bm_equal'] / cost['Population']

mean_obs_prev = cost["equal_bm_prevalence"].mean()
cost["prev_factor"] = cost["equal_bm_prevalence"] / mean_obs_prev


In [15]:
# Scale costs
# Incremental cost per district, yearly (scaled by no of dengue cases in each district)
cost["equal_bm_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"] - cost["healthcare_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

cost["equal_bm_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"] - cost["payer_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

### Population-proportion disaggregation and cost calculations

In [16]:
#get df containing yearly total populations
df_jakarta_yearly_total_pop = pd.DataFrame(df_jakarta.groupby('Year')['Population'].sum())
df_jakarta_yearly_total_pop.head()

Unnamed: 0_level_0,Population
Year,Unnamed: 1_level_1
2020,801232
2021,796729
2022,792228
2023,787726


In [17]:
#merge the two dfs by Year
df_jakarta_final = df_jakarta_yearly_total_cases.merge(df_jakarta_yearly_total_pop, on='Year')

#get another column called df_jakarta_final and df_java_final called Province, for merging
df_jakarta_final['Province'] = 'DKI Jakarta'

#append the suffix _Prov to both "Cases" and "Population" c0olumns in df_jakarta_final and df_java_final:
df_jakarta_final.columns = [col + '_Prov' if col in ['Cases', 'Population'] else col for col in df_jakarta_final.columns]

In [18]:
# Reset index of df_jakarta_final and df_java_final to make 'Year' a regular column
df_jakarta_final = df_jakarta_final.reset_index()

#check
df_jakarta_final.head()

Unnamed: 0,Year,Cases_Prov,Population_Prov,Province
0,2020,1217,801232,DKI Jakarta
1,2021,1048,796729,DKI Jakarta
2,2022,2113,792228,DKI Jakarta
3,2023,1562,787726,DKI Jakarta


In [19]:
#now append this to the high-res dataset: so it means that if the year is 2020 cases_prop_bm = high_res["Population"]/df_jakarta_final["Population"] * df_jakarta_final["Cases"]

#merge to the high_res df by Year and Province
high = pd.merge(high, df_jakarta_final[['Year', 'Province', 'Population_Prov', 'Cases_Prov']], on=['Year', 'Province'], how='left')


In [20]:
high.head()

Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,cases_bm_equal,Population_Prov,Cases_Prov
0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",243.0,801232,1217
1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",209.0,796729,1048
2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",422.0,792228,2113
3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",312.0,787726,1562
4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,"POLYGON ((106.83247 -6.14138, 106.83282 -6.141...",243.0,801232,1217


In [21]:
#calculate the disaggregation by multiplying the total num of cases per province with the pop in the district / pop in the province
high['cases_prop_bm'] = (high['Population'] / high['Population_Prov']) * high['Cases_Prov']

In [22]:
#cost calculations
#merge this with cost df
cost = pd.merge(cost, high[['Province', 'District', 'Year', 'cases_prop_bm']], on=['Province', 'District', 'Year'], how='left')

In [23]:
len(cost.dropna())

20

In [24]:
#calculate nbm for disaggregation with population-based proportions
# Scale total cost to produce district-level cost per year based on cases per district per year / total num of cases over 2020-2024 nationwide, assuming that healthcare costs for dengue treatment is linearly related to the number of dengue cases per year
# this cost factor measures the scale of the district-level, yearly dengue cases to the national-level dengue cases in total from 2020-2024.
cost['prop_bm_prevalence'] = cost['cases_prop_bm'] / cost['Population']

mean_obs_prev = cost["prop_bm_prevalence"].mean()
cost["prev_factor"] = cost["prop_bm_prevalence"] / mean_obs_prev

# Scale costs
# Incremental cost per district, yearly (scaled by no of dengue cases in each district)
cost["incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"] - cost["healthcare_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

cost["incremental_payer_cost"] = (
    cost["payer_cost_vaccine"] - cost["payer_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

In [25]:
cost.head()

Unnamed: 0.1,Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,...,obs_incremental_healthcare_cost,obs_incremental_payer_cost,cases_bm_equal,equal_bm_prevalence,equal_bm_incremental_healthcare_cost,equal_bm_incremental_payer_cost,cases_prop_bm,prop_bm_prevalence,incremental_healthcare_cost,incremental_payer_cost
0,0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,...,141.14232,141.124821,243.0,0.001315,183.574348,183.551589,280.734222,0.001519,246.085096,246.054587
1,1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,...,120.979131,120.964133,209.0,0.00114,157.889048,157.869474,241.069372,0.001315,211.31581,211.289612
2,2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,...,366.547831,366.502388,422.0,0.002322,317.371673,317.332327,484.663247,0.002667,422.941237,422.888803
3,3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,...,313.311122,313.272279,312.0,0.001732,234.64446,234.61537,357.241424,0.001983,311.746621,311.707973
4,4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,...,276.060659,276.026434,243.0,0.003322,455.178164,455.121733,111.093142,0.001519,241.460796,241.43086


### Random-forest-based (hdi and pop den) disaggregation and cost calculations

In [26]:
#merge the rf_preds cases to the high and nbm dfs:
#rename Predicted_Cases to rf_preds
rf_preds = rf_preds.rename(columns={'Predicted_Cases': 'rf_preds'})
high = pd.merge(high, rf_preds[["HDI","urbanicity", "rf_preds"]], on = ["HDI","urbanicity"], how='left')

In [27]:
len(rf_preds['rf_preds'].unique())

6

In [28]:
#check the columns for high
high.dropna(subset=['rf_preds'])["rf_preds"].shape[0]

6

In [29]:
high.head()

Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,cases_bm_equal,Population_Prov,Cases_Prov,cases_prop_bm,rf_preds
0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",243.0,801232,1217,280.734222,183.108
1,KOTA JAKARTA BARAT,2021,124.44,0.8176,DKI Jakarta,138,183270,1.472758,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",209.0,796729,1048,241.069372,
2,KOTA JAKARTA BARAT,2022,125.0,0.8251,DKI Jakarta,420,181715,1.45372,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",422.0,792228,2113,484.663247,
3,KOTA JAKARTA BARAT,2023,125.0,0.8302,DKI Jakarta,359,180159,1.441272,0.951917,"POLYGON ((106.70503 -6.0956, 106.70526 -6.0956...",312.0,787726,1562,357.241424,
4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,"POLYGON ((106.83247 -6.14138, 106.83282 -6.141...",243.0,801232,1217,111.093142,


In [30]:
cost = cost.merge(high[["Year", "District", "rf_preds"]], on = ["Year", "District"], how = "left")

In [31]:
cost.isna().sum()

Unnamed: 0                               0
District                                 0
Year                                     0
Area_sq_km                               0
HDI                                      0
Province                                 0
Cases                                    0
Population                               0
Pop_den                                  0
urbanicity                               0
geometry                                 0
le_factor                                0
burden_non_vaccine                       0
burden_vaccine                           0
healthcare_cost_non_vaccine              0
healthcare_cost_vaccine                  0
payer_cost_non_vaccine                   0
payer_cost_vaccine                       0
mean_urbanicity                          0
mean_pop_den                             0
urbanicity_factor                        0
pop_den_factor                           0
obs_prevalence                           0
prev_factor

In [32]:
# calculate nbm for rf
# Scale total cost to produce district-level cost per year based on cases per district per year / total num of cases over 2020-2024 nationwide, assuming that healthcare costs for dengue treatment is linearly related to the number of dengue cases per year
# this cost factor measures the scale of the district-level, yearly dengue cases to the national-level dengue cases in total from 2020-2024.
cost['rf_prevalence'] = cost['rf_preds'] / cost["Population"]


mean_obs_prev = cost["rf_prevalence"].mean()
cost["prev_factor"] = cost["rf_prevalence"] / mean_obs_prev

# Scale costs
# Incremental cost per district, yearly (scaled by no of dengue cases in each district)
cost["rf_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"] - cost["healthcare_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

cost["rf_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"] - cost["payer_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

In [33]:
print(cost.shape[0])


20


### aggGP-based disaggregation and cost calculations

In [34]:
#read in the gp-preds data for all years and append them
gp_2020 = pd.read_csv("../data/processed/df_hi_jkt_w_pred_2020.csv")
gp_2021 = pd.read_csv("../data/processed/df_hi_jkt_w_pred_2021.csv")
gp_2022 = pd.read_csv("../data/processed/df_hi_jkt_w_pred_2022.csv")
gp_2023 = pd.read_csv("../data/processed/df_hi_jkt_w_pred_2023.csv")

In [35]:
gp_2020.head()

Unnamed: 0.1,Unnamed: 0,District,Year,Area_sq_km,HDI,Province,Cases,Population,Pop_den,urbanicity,geometry,incidence,pred_cases_num,pred_cases
0,0,KOTA JAKARTA BARAT,2020,124.44,0.8138,DKI Jakarta,161,184826,1.485262,0.951917,POLYGON ((106.70503370000006 -6.09559559999996...,0.000871,331.46472,0.001793
1,4,KOTA JAKARTA PUSAT,2020,52.38,0.8139,DKI Jakarta,127,73140,1.396334,0.993514,POLYGON ((106.83246630000008 -6.14137699999997...,0.001736,219.14929,0.002996
2,8,KOTA JAKARTA SELATAN,2020,154.32,0.8472,DKI Jakarta,288,160968,1.043079,0.972088,POLYGON ((106.83399450000007 -6.20492189999993...,0.001789,379.68277,0.002359
3,12,KOTA JAKARTA TIMUR,2020,182.7,0.8266,DKI Jakarta,396,238497,1.305402,0.930291,POLYGON ((106.94010940000004 -6.15492309999996...,0.00166,468.19147,0.001963
4,16,KOTA JAKARTA UTARA,2020,139.99,0.8029,DKI Jakarta,245,143801,1.027223,0.850775,MULTIPOLYGON (((106.87676710000005 -6.10030039...,0.001704,420.9327,0.002927


In [36]:
#delete the first column for all the datasets and reset index, also combine all the predictons into one df
# Define function to clean each DataFrame
def clean_df(df):
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    return df.reset_index(drop=True)

# Clean all
gp_2020 = clean_df(gp_2020)
gp_2021 = clean_df(gp_2021)
gp_2022 = clean_df(gp_2022)
gp_2023 = clean_df(gp_2023)

# Combine
gp_all_years = pd.concat([gp_2020, gp_2021, gp_2022, gp_2023], axis=0).reset_index(drop=True)


In [37]:
# Calculate the gp_pred_cases from pred_cases * Population
# Rename 'pred_cases' to 'gp_prevalence'
# Rename "pred_cases_num" to "gp_preds"
gp_all_years = gp_all_years.rename(columns={"pred_cases": "gp_prevalence", "pred_cases_num": "gp_preds"})

# Subset the relevant columns
gp_subset = gp_all_years[["District", "Year", "gp_prevalence", "gp_preds"]]

# Merge by District and Year
cost = pd.merge(cost, gp_subset, on=["District", "Year"], how="left")


In [38]:
# Scale costs
mean_obs_prev = cost["gp_prevalence"].mean()
cost["prev_factor"] = cost["gp_prevalence"] / mean_obs_prev

# Incremental cost per district, yearly (scaled by no of dengue cases in each district)
cost["gp_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"] - cost["healthcare_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

cost["gp_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"] - cost["payer_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

### aggVAE-based disaggregation and cost calculations

In [39]:
#read in the gp-preds data for all years and append them
vae_2020 = pd.read_csv("../data/processed/df_hi_jkt_2020_aggVAE_preds.csv")
vae_2021 = pd.read_csv("../data/processed/df_hi_jkt_2021_aggVAE_preds.csv")
vae_2022 = pd.read_csv("../data/processed/df_hi_jkt_2022_aggVAE_preds.csv")
vae_2023 = pd.read_csv("../data/processed/df_hi_jkt_2023_aggVAE_preds.csv")

In [40]:
#delete the first column for all the datasets and reset index, also combine all the predictons into one df
# Define function to clean each DataFrame
def clean_df(df):
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    return df.reset_index(drop=True)

# Clean all
vae_2020 = clean_df(vae_2020)
vae_2021 = clean_df(vae_2021)
vae_2022 = clean_df(vae_2022)
vae_2023 = clean_df(vae_2023)

# Combine
vae_all_years = pd.concat([vae_2020, vae_2021, vae_2022, vae_2023], axis=0).reset_index(drop=True)


In [41]:
# Rename 'pred_cases' to 'vae_prevalence'
# Rename "pred_cases_num" to "gp_preds"
vae_all_years = vae_all_years.rename(columns={"pred_cases": "vae_prevalence", "pred_cases_num": "vae_preds"})

# Subset the relevant columns
vae_subset = vae_all_years[["District", "Year", "vae_prevalence", "vae_preds"]]

# Merge by District and Year
cost = pd.merge(cost, vae_subset, on=["District", "Year"], how="left")

In [42]:
# Scale costs
# Scale costs
mean_obs_prev = cost["vae_prevalence"].mean()
cost["prev_factor"] = cost["vae_prevalence"] / mean_obs_prev

# Incremental cost per district, yearly (scaled by no of dengue cases in each district)
cost["vae_incremental_healthcare_cost"] = (
    cost["healthcare_cost_vaccine"] - cost["healthcare_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

cost["vae_incremental_payer_cost"] = (
    cost["payer_cost_vaccine"] - cost["payer_cost_non_vaccine"]) * cost["prev_factor"] * cost["urbanicity_factor"] * cost["pop_den_factor"]

In [43]:
cost = cost.reset_index(drop=True)


In [44]:
cost = cost.drop(cost.columns[0], axis=1)


In [45]:
print(len(cost))          # Should be 20
print(cost.shape[0])      # Also 20
print(cost.index)         # Will show RangeIndex(start=0, stop=20, step=1)


20
20
RangeIndex(start=0, stop=20, step=1)


In [46]:
cost.filter(like="rf").isna().sum()

rf_preds                          14
rf_prevalence                     14
rf_incremental_healthcare_cost    14
rf_incremental_payer_cost         14
dtype: int64

In [47]:
print(cost.obs_incremental_healthcare_cost.tolist())


[141.14231950806925, 120.9791310069165, 366.5478310869733, 313.3111222862463, 276.06065872985715, 117.38012260954554, 474.012328603644, 363.88825226138346, 207.90667211272472, 202.85338494331825, 410.4413938524525, 229.0478190412564, 231.08288330736423, 226.99808486506237, 321.2072911142632, 278.1115007143171, 170.63826141352365, 129.54578213434866, 265.8025666328673, 178.1649679386273]


In [48]:
print(cost.gp_incremental_healthcare_cost.tolist())

[222.03425021898738, 229.41368474768205, 217.37389961515322, 229.7735346617627, 363.9922664074901, 369.7656656787158, 393.55382185045914, 403.6125815203996, 209.43444669505286, 217.5618151374742, 219.6877906724488, 230.56446642609845, 208.76004023156148, 217.4102675463805, 203.58235898141723, 211.99602906178, 224.01355224755167, 232.40806571566873, 209.21931202246716, 219.64301725496597]


## Evaluation without RF included

### Evaluate the raw case counts --> table 12 values

In [49]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Define model predictions for dengue case counts
model_preds = {
    'BM Equal': 'cases_bm_equal',
    'BM Proportional': 'cases_prop_bm',
    'GP': 'gp_preds',
    'aggVAE': 'vae_preds'
}

# Store results
results = []

for model_name, pred_col in model_preds.items():
    y_true = cost['Cases'].values  # ensure it's a numpy array
    y_pred = cost[pred_col].values

    mae = np.abs(y_pred - y_true)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    bias = y_pred - y_true

    # Compute MAE (%) and Bias (%) per point, then average
    mae_pct = np.mean((mae / y_true) * 100)
    bias_pct = np.mean((bias / y_true) * 100)

    results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed cases)': mae_pct,
        'Bias (% of observed cases)': bias_pct
    })

# Convert to DataFrame
metrics_df = pd.DataFrame(results)

# View overall averages for each model (though here it's already model-level)
summary = metrics_df[['Model', 'RMSE', 'MAE (% of observed cases)', 'Bias (% of observed cases)']]

print(summary)


             Model        RMSE  MAE (% of observed cases)  \
0         BM Equal  112.722669                  47.936749   
1  BM Proportional   57.060756                  19.666118   
2               GP  127.052619                  59.240684   
3           aggVAE   91.233406                  37.097943   

   Bias (% of observed cases)  
0                   25.828358  
1                    6.965937  
2                   51.875352  
3                   30.108515  


### Evaluate the disease prevalence using the same metrics but + Spearman's for ranking preservation metric! --> table 13 values

In [50]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Define prevalence prediction columns
prevalence_preds = {
    'BM Equal': 'equal_bm_prevalence',
    'BM Proportional': 'prop_bm_prevalence',
    'GP': 'gp_prevalence',
    'aggVAE': 'vae_prevalence'
}

# Store prevalence evaluation results
prevalence_results = []

for model_name, pred_col in prevalence_preds.items():
    y_true = cost['obs_prevalence'].values
    y_pred = cost[pred_col].values

    # RMSE: compute normally
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Spearman correlation
    if np.unique(y_true).size > 1 and np.unique(y_pred).size > 1:
        spearman_corr, _ = spearmanr(y_true, y_pred)
    else:
        spearman_corr = np.nan  # Not computable if constant series

    # MAE and Bias (as percentage of *observed prevalence* at each point)
    mae = np.abs(y_pred - y_true)
    bias = y_pred - y_true

    # Handle division safely in case y_true = 0
    valid_idx = y_true != 0
    mae_pct = np.mean((mae[valid_idx] / y_true[valid_idx]) * 100)
    bias_pct = np.mean((bias[valid_idx] / y_true[valid_idx]) * 100)

    prevalence_results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed prevalence)': mae_pct,
        'Spearman': spearman_corr,
        'Bias (% of observed prevalence)': bias_pct
    })

# Convert to DataFrame
prevalence_metrics_df = pd.DataFrame(prevalence_results)

# View model-level averages (already one row per model now)
prevalence_summary = prevalence_metrics_df[['Model', 'RMSE', 'MAE (% of observed prevalence)', 'Spearman', 'Bias (% of observed prevalence)']]

print(prevalence_summary)

             Model      RMSE  MAE (% of observed prevalence)  Spearman  \
0         BM Equal  0.001098                       47.936749  0.551880   
1  BM Proportional  0.000342                       19.666118  0.887126   
2               GP  0.000967                       59.240684 -0.063158   
3           aggVAE  0.000679                       37.097944  0.485714   

   Bias (% of observed prevalence)  
0                        25.828358  
1                         6.965937  
2                        51.875352  
3                        30.108515  


### Evaluate the incremental cost (healthcare + payer's perspectives separately) --> tables 15 & 14 values

In [51]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Define incremental cost prediction columns (for healthcare and payer)
cost_preds = {
    'BM Equal': 'equal_bm_incremental_healthcare_cost',
    'BM Proportional': 'incremental_healthcare_cost',
    'GP': 'gp_incremental_healthcare_cost',
    'aggVAE': 'vae_incremental_healthcare_cost'
}

payer_preds = {
    'BM Equal': 'equal_bm_incremental_payer_cost',
    'BM Proportional': 'incremental_payer_cost',
    'GP': 'gp_incremental_payer_cost',
    'aggVAE': 'vae_incremental_payer_cost'
}

# Store evaluation results
cost_results = []
payer_results = []

# Healthcare cost evaluation
for model_name, pred_col in cost_preds.items():
    y_true = cost['healthcare_cost_vaccine'].values
    y_pred = cost[pred_col].values

    # RMSE computed normally
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # MAE and Bias
    mae = np.abs(y_pred - y_true)
    bias = y_pred - y_true

    # Handle division safely (exclude where observed cost = 0)
    valid_idx = y_true != 0
    mae_pct = np.mean((mae[valid_idx] / y_true[valid_idx]) * 100)
    bias_pct = np.mean((bias[valid_idx] / y_true[valid_idx]) * 100)

    cost_results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed healthcare cost)': mae_pct,
        'Bias (% of observed healthcare cost)': bias_pct
    })

# Payer cost evaluation
for model_name, pred_col in payer_preds.items():
    y_true = cost['payer_cost_vaccine'].values
    y_pred = cost[pred_col].values

    # RMSE computed normally
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # MAE and Bias
    mae = np.abs(y_pred - y_true)
    bias = y_pred - y_true

    # Handle division safely
    valid_idx = y_true != 0
    mae_pct = np.mean((mae[valid_idx] / y_true[valid_idx]) * 100)
    bias_pct = np.mean((bias[valid_idx] / y_true[valid_idx]) * 100)

    payer_results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed payer cost)': mae_pct,
        'Bias (% of observed payer cost)': bias_pct
    })

# Convert to DataFrames
cost_metrics_df = pd.DataFrame(cost_results)
payer_metrics_df = pd.DataFrame(payer_results)

# Output
print(cost_metrics_df)
print(payer_metrics_df)


             Model        RMSE  MAE (% of observed healthcare cost)  \
0         BM Equal  190.008921                            51.481831   
1  BM Proportional   90.919910                            28.469961   
2               GP   68.126495                            23.082442   
3           aggVAE   58.601883                            20.166011   

   Bias (% of observed healthcare cost)  
0                             -0.734312  
1                             -4.528811  
2                             -5.760577  
3                             -7.730157  
             Model        RMSE  MAE (% of observed payer cost)  \
0         BM Equal  190.008520                       51.724036   
1  BM Proportional   90.340342                       28.437685   
2               GP   67.087796                       22.303679   
3           aggVAE   56.885857                       20.169513   

   Bias (% of observed payer cost)  
0                         1.359663  
1                        -2.5

## Evaluation with RF included

In [52]:
cost = cost.dropna(subset = ['rf_preds'])

### Evaluate the raw case counts --> table 12

In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Define model predictions for dengue case counts
model_preds = {
    'BM Equal': 'cases_bm_equal',
    'BM Proportional': 'cases_prop_bm',
    "RF": "rf_preds",
    'GP': 'gp_preds',
    'aggVAE': 'vae_preds'
}

# Store results
results = []

for model_name, pred_col in model_preds.items():
    y_true = cost['Cases'].values  # ensure it's a numpy array
    y_pred = cost[pred_col].values

    mae = np.abs(y_pred - y_true)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    bias = y_pred - y_true

    # Compute MAE (%) and Bias (%) per point, then average
    mae_pct = np.mean((mae / y_true) * 100)
    bias_pct = np.mean((bias / y_true) * 100)

    results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed cases)': mae_pct,
        'Bias (% of observed cases)': bias_pct
    })

# Convert to DataFrame
metrics_df = pd.DataFrame(results)

# View overall averages for each model (though here it's already model-level)
summary = metrics_df[['Model', 'RMSE', 'MAE (% of observed cases)', 'Bias (% of observed cases)']]

print(summary)


             Model        RMSE  MAE (% of observed cases)  \
0         BM Equal  123.002710                  84.324282   
1  BM Proportional   64.120908                  33.732056   
2               RF   88.062370                  59.861857   
3               GP  136.187877                  98.715945   
4           aggVAE   86.858821                  57.511871   

   Bias (% of observed cases)  
0                   63.419675  
1                   17.336110  
2                   59.649204  
3                   98.715945  
4                   55.286047  


### Evaluate the disease prevalence using the same metrics but + Spearman's for ranking preservation metric! --> table 13

In [54]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Define prevalence prediction columns
prevalence_preds = {
    'BM Equal': 'equal_bm_prevalence',
    'BM Proportional': 'prop_bm_prevalence',
    "RF": "rf_prevalence",
    'GP': 'gp_prevalence',
    'aggVAE': 'vae_prevalence'
}

# Store prevalence evaluation results
prevalence_results = []

for model_name, pred_col in prevalence_preds.items():
    y_true = cost['obs_prevalence'].values
    y_pred = cost[pred_col].values

    # RMSE: compute normally
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Spearman correlation
    if np.unique(y_true).size > 1 and np.unique(y_pred).size > 1:
        spearman_corr, _ = spearmanr(y_true, y_pred)
    else:
        spearman_corr = np.nan  # Not computable if constant series

    # MAE and Bias (as percentage of *observed prevalence* at each point)
    mae = np.abs(y_pred - y_true)
    bias = y_pred - y_true

    # Handle division safely in case y_true = 0
    valid_idx = y_true != 0
    mae_pct = np.mean((mae[valid_idx] / y_true[valid_idx]) * 100)
    bias_pct = np.mean((bias[valid_idx] / y_true[valid_idx]) * 100)

    prevalence_results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed prevalence)': mae_pct,
        'Spearman': spearman_corr,
        'Bias (% of observed prevalence)': bias_pct
    })

# Convert to DataFrame
prevalence_metrics_df = pd.DataFrame(prevalence_results)

# View model-level averages (already one row per model now)
prevalence_summary = prevalence_metrics_df[['Model', 'RMSE', 'MAE (% of observed prevalence)', 'Spearman', 'Bias (% of observed prevalence)']]

print(prevalence_summary)

             Model      RMSE  MAE (% of observed prevalence)  Spearman  \
0         BM Equal  0.001290                       84.324282  0.314286   
1  BM Proportional  0.000403                       33.732056  0.588490   
2               RF  0.001169                       59.861857  0.371429   
3               GP  0.001223                       98.715945  0.085714   
4           aggVAE  0.000780                       57.511873  0.142857   

   Bias (% of observed prevalence)  
0                        63.419675  
1                        17.336110  
2                        59.649204  
3                        98.715945  
4                        55.286048  


### Evaluate the incremental cost (healthcare + payer's perspectives separately) --> tables 14 & 15

In [55]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Define incremental cost prediction columns (for healthcare and payer)
cost_preds = {
    'BM Equal': 'equal_bm_incremental_healthcare_cost',
    'BM Proportional': 'incremental_healthcare_cost',
    "RF": "rf_incremental_healthcare_cost",
    'GP': 'gp_incremental_healthcare_cost',
    'aggVAE': 'vae_incremental_healthcare_cost'
}

payer_preds = {
    'BM Equal': 'equal_bm_incremental_payer_cost',
    'BM Proportional': 'incremental_payer_cost',
    "RF": "rf_incremental_payer_cost",
    'GP': 'gp_incremental_payer_cost',
    'aggVAE': 'vae_incremental_payer_cost'
}

# Store evaluation results
cost_results = []
payer_results = []

# Healthcare cost evaluation
for model_name, pred_col in cost_preds.items():
    y_true = cost['healthcare_cost_vaccine'].values
    y_pred = cost[pred_col].values

    # RMSE computed normally
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # MAE and Bias
    mae = np.abs(y_pred - y_true)
    bias = y_pred - y_true

    # Handle division safely (exclude where observed cost = 0)
    valid_idx = y_true != 0
    mae_pct = np.mean((mae[valid_idx] / y_true[valid_idx]) * 100)
    bias_pct = np.mean((bias[valid_idx] / y_true[valid_idx]) * 100)

    cost_results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed healthcare cost)': mae_pct,
        'Bias (% of observed healthcare cost)': bias_pct
    })

# Payer cost evaluation
for model_name, pred_col in payer_preds.items():
    y_true = cost['payer_cost_vaccine'].values
    y_pred = cost[pred_col].values

    # RMSE computed normally
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # MAE and Bias
    mae = np.abs(y_pred - y_true)
    bias = y_pred - y_true

    # Handle division safely
    valid_idx = y_true != 0
    mae_pct = np.mean((mae[valid_idx] / y_true[valid_idx]) * 100)
    bias_pct = np.mean((bias[valid_idx] / y_true[valid_idx]) * 100)

    payer_results.append({
        'Model': model_name,
        'RMSE': rmse,
        'MAE (% of observed payer cost)': mae_pct,
        'Bias (% of observed payer cost)': bias_pct
    })

# Convert to DataFrames
cost_metrics_df = pd.DataFrame(cost_results)
payer_metrics_df = pd.DataFrame(payer_results)

# Output
print(cost_metrics_df)
print(payer_metrics_df)


             Model        RMSE  MAE (% of observed healthcare cost)  \
0         BM Equal  190.889361                            61.381769   
1  BM Proportional   81.038232                            28.247855   
2               RF  185.481929                            51.913867   
3               GP   80.477586                            27.123203   
4           aggVAE   61.658997                            21.799482   

   Bias (% of observed healthcare cost)  
0                              1.146936  
1                            -17.758242  
2                              4.106988  
3                              2.734568  
4                            -12.108182  
             Model        RMSE  MAE (% of observed payer cost)  \
0         BM Equal  191.033877                       61.973441   
1  BM Proportional   77.927969                       27.437423   
2               RF  185.866781                       52.305817   
3               GP   81.154439                       26.9