In [123]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from rra_tools.shell_tools import mkdir  # type: ignore
from idd_forecast_mbp import constants as rfc
from idd_forecast_mbp.helper_functions import load_yaml_dictionary, parse_yaml_dictionary

In [124]:
# Name of the LSAE location hierarchy; every hierarchy-specific path below is derived from it.
A2_HIERARCHY = "lsae_1209"

# Input / output directories under the shared model root.
GBD_INPUT_PATH = rfc.MODEL_ROOT / "01-raw_data" / "gbd"
A2_INPUT_PATH = rfc.MODEL_ROOT / "02-processed_data" / A2_HIERARCHY
OUTPUT_PATH = rfc.MODEL_ROOT / "02-processed_data" / "gbd"
# Template for hierarchy parquet files; `{hierarchy}` is filled in with a hierarchy name.
HIERARCHY_PATH = "/mnt/team/rapidresponse/pub/population-model/admin-inputs/raking/gbd-inputs/hierarchy_{hierarchy}.parquet"

# Population file for the LSAE hierarchy. The results sub-folder is built from
# A2_HIERARCHY (instead of repeating the literal) so that switching hierarchies
# cannot leave the population file pointing at a different one.
LSAE_population_path = f"/mnt/team/rapidresponse/pub/climate-aggregates/2025_03_20/results/{A2_HIERARCHY}/population.parquet"
lsae_hierarchy_path = HIERARCHY_PATH.format(hierarchy=A2_HIERARCHY)
gbd_hierarchy_path = HIERARCHY_PATH.format(hierarchy="gbd_2023")

# GBD 2023 CSV inputs for each disease.
malaria_gbd_df_path = GBD_INPUT_PATH / "gbd_2023_malaria_aa.csv"
dengue_gbd_df_path = GBD_INPUT_PATH / "gbd_2023_dengue_aa.csv"

# Per-disease parquet inputs, keyed by a short variable name.
malaria_variables = {
    "pfpr": f"{A2_INPUT_PATH}/malaria_pfpr_mean_cc_insensitive.parquet",
    "incidence": f"{A2_INPUT_PATH}/malaria_pf_inc_rate_mean_cc_insensitive.parquet",
    "mortality": f"{A2_INPUT_PATH}/malaria_pf_mort_rate_mean_cc_insensitive.parquet",
}
dengue_variables = {
    "denv_suit": f"{A2_INPUT_PATH}/dengue_suitability_mean_cc_insensitive.parquet"
}

In [125]:
def split_path_to_top_parent(path):
    """
    Parse a comma-separated ``path_to_top_parent`` string into a list of ints.

    Pieces that are not made up purely of digits (including empty pieces) are
    skipped rather than raising.
    """
    location_ids = []
    for piece in path.split(","):
        if piece.isdigit():
            location_ids.append(int(piece))
    return location_ids

In [126]:
# Read both location hierarchies from disk.
lsae_hierarchy_df = pd.read_parquet(lsae_hierarchy_path)
gbd_hierarchy_df = pd.read_parquet(gbd_hierarchy_path)

# Keep only the most-detailed rows of each hierarchy. The LSAE subset gets a
# fresh 0..n-1 index; the GBD subset keeps its original index labels.
lsae_most_detailed_hierarchy_df = lsae_hierarchy_df.loc[
    lambda frame: frame["most_detailed"].eq(1)
].reset_index(drop=True)
gbd_most_detailed_hierarchy_df = gbd_hierarchy_df.loc[
    lambda frame: frame["most_detailed"].eq(1)
]

# One list of integer location ids per LSAE most-detailed row.
lsae_path_to_hierarchy = lsae_most_detailed_hierarchy_df["path_to_top_parent"].apply(
    split_path_to_top_parent
)

In [127]:
# Goal: for each row of gbd_most_detailed_hierarchy_df, find the rows of
# lsae_most_detailed_hierarchy_df whose path contains that GBD location_id.
gbd_to_lsae_mapping = {}

# Lookup: LSAE location_id -> list of ids parsed from its path_to_top_parent.
# lsae_path_to_hierarchy is derived row-for-row from
# lsae_most_detailed_hierarchy_df, so the two are paired by position. This
# avoids feeding index *labels* from iterrows() into positional .iloc, which
# was only correct while the frame carried a fresh 0..n-1 index.
lsae_paths_dict = dict(
    zip(lsae_most_detailed_hierarchy_df["location_id"], lsae_path_to_hierarchy)
)

In [128]:
# For each GBD most-detailed location_id, collect the LSAE locations whose
# path contains it. Keys are inserted in GBD row order so the resulting frame
# keeps the same row order as a GBD-by-GBD scan would give.
gbd_to_lsae_mapping = {
    gbd_location_id: []
    for gbd_location_id in gbd_most_detailed_hierarchy_df["location_id"].tolist()
}

# Walk every LSAE path once and file the LSAE location under each GBD id that
# appears in it, instead of rescanning all LSAE paths for every GBD location.
for lsae_location_id, path in lsae_paths_dict.items():
    # dict.fromkeys de-duplicates while keeping order, so an id repeated in a
    # path still adds the LSAE location only once.
    for path_location_id in dict.fromkeys(path):
        if path_location_id in gbd_to_lsae_mapping:
            gbd_to_lsae_mapping[path_location_id].append(lsae_location_id)

# Convert to a long DataFrame (one row per GBD/LSAE pair). The LSAE id is
# stored as `location_id` so it can be merged onto LSAE-level data directly.
# Columns are given explicitly so the frame keeps both columns even when no
# pair matched.
gbd_to_lsae_df = pd.DataFrame(
    [
        (gbd_id, lsae_id)
        for gbd_id, lsae_ids in gbd_to_lsae_mapping.items()
        for lsae_id in lsae_ids
    ],
    columns=["gbd_location_id", "location_id"],
)

In [129]:
# Load the LSAE population table; its location_id / year_id / population
# columns are merged onto the dengue suitability data further down.
lsae_population_df = pd.read_parquet(LSAE_population_path)

In [130]:
def drop_scenario_population_mean(df):
    """
    Return a copy of ``df`` without the "scenario" and "population" columns
    and without any column whose name contains "mean".

    Raises KeyError if "scenario" or "population" is missing from ``df``.
    """
    trimmed = df.drop(columns=["scenario", "population"])
    has_mean_in_name = trimmed.columns.str.contains("mean")
    return trimmed.loc[:, ~has_mean_in_name]

def rename_columns(df):
    """
    Return ``df`` with its aggregate columns renamed.

    * A column containing "_mean_per_capita" has that suffix removed
      (e.g. "x_rate_mean_per_capita" -> "x_rate").
    * Otherwise, a column containing "rate_mean" has it replaced by "count"
      (e.g. "x_rate_mean" -> "x_count").

    The input frame is not modified. All renames are collected first and
    applied in a single ``rename`` call, so the frame is copied at most once.
    """
    new_names = {}
    for col in df.columns:
        if "_mean_per_capita" in col:
            new_names[col] = col.replace("_mean_per_capita", "")
        elif "rate_mean" in col:
            # Only reached for columns the first rule did not already rename.
            new_names[col] = col.replace("rate_mean", "count")
    if not new_names:
        # Nothing to rename: hand back the frame as-is.
        return df
    return df.rename(columns=new_names)

In [131]:
## For malaria
# PfPR: read the parquet, shorten malaria_pfpr_mean_per_capita to malaria_pfpr,
# then drop scenario / population / remaining "mean" columns.
malaria_pfpr = drop_scenario_population_mean(
    pd.read_parquet(malaria_variables["pfpr"]).rename(
        columns={"malaria_pfpr_mean_per_capita": "malaria_pfpr"}
    )
)

# Incidence and mortality: same clean-up, using the generic column renamer.
malaria_incidence = pd.read_parquet(malaria_variables["incidence"])
malaria_incidence = drop_scenario_population_mean(rename_columns(malaria_incidence))
malaria_mortality = pd.read_parquet(malaria_variables["mortality"])
malaria_mortality = drop_scenario_population_mean(rename_columns(malaria_mortality))

# Inner-join the three tables on location and year.
malaria_df = malaria_pfpr.merge(
    malaria_incidence, on=["location_id", "year_id"]
).merge(
    malaria_mortality, on=["location_id", "year_id"]
)

# Write to parquet
malaria_df.to_parquet(OUTPUT_PATH / "raked_malaria_aa.parquet", index=False)

In [132]:
## For dengue
# Read the dengue suitability parquet, shorten the per-capita column name to
# dengue_suit, then drop scenario / population / remaining "mean" columns.
dengue_suit = drop_scenario_population_mean(
    pd.read_parquet(dengue_variables["denv_suit"]).rename(
        columns={"dengue_suitability_mean_per_capita": "dengue_suit"}
    )
)

In [133]:

# Restrict dengue_suit to the location ids in lsae_most_detailed_hierarchy_df,
# attach population, compute suitability * population, and tag every row with
# the GBD location it maps to (left joins: unmatched rows are kept with NaN).
dengue_suit = (
    dengue_suit.loc[
        lambda frame: frame["location_id"].isin(lsae_most_detailed_hierarchy_df["location_id"])
    ]
    .merge(
        lsae_population_df[["location_id", "year_id", "population"]],
        on=["location_id", "year_id"],
        how="left",
    )
    .assign(dengue_suit_population=lambda frame: frame["dengue_suit"] * frame["population"])
    .merge(gbd_to_lsae_df, on="location_id", how="left")
)

In [134]:
# Total suitability * population per year and GBD location; the GBD id is
# exposed as `location_id` so the GBD estimates can be merged on it.
dengue_suit_by_gbd = (
    dengue_suit.groupby(['year_id', 'gbd_location_id'])['dengue_suit_population']
    .sum()
    .reset_index()
    .rename(columns={'gbd_location_id': 'location_id'})
)

In [135]:
# Load the GBD dengue estimates (a CSV file). Only the columns used below are
# parsed, which avoids reading and type-inferring columns that are never used.
gbd_columns = ["location_id", "year_id", "most_detailed", "measure_id", "metric_id", "val"]
dengue_gbd_df = pd.read_csv(dengue_gbd_df_path, usecols=gbd_columns)[gbd_columns]
# Remove rows with a missing value in any of those columns
dengue_gbd_df = dengue_gbd_df.dropna(subset=gbd_columns)

# Keep most-detailed locations, metric_id 1, and measure_id 1 (mortality) or
# 6 (incidence).
# NOTE(review): metric_id == 1 is presumably the count ("number") metric - confirm.
rows_to_keep = (
    (dengue_gbd_df["most_detailed"] == 1)
    & (dengue_gbd_df["metric_id"] == 1)
    & (dengue_gbd_df["measure_id"].isin([1, 6]))
)
# The filter columns are constant after subsetting, so drop them; reset the
# index directly rather than materialising and then dropping an "index" column.
dengue_gbd_df = (
    dengue_gbd_df.loc[rows_to_keep]
    .drop(columns=["most_detailed", "metric_id"])
    .reset_index(drop=True)
)

# Attach each measure's value as its own column on the GBD-level table
# (left joins: location-years without a GBD value get NaN).
for gbd_measure_id, value_column in ((1, "denv_mort"), (6, "denv_inc")):
    measure_values = (
        dengue_gbd_df.loc[dengue_gbd_df["measure_id"] == gbd_measure_id]
        .rename(columns={"val": value_column})
        .drop(columns=["measure_id"])
    )
    dengue_suit_by_gbd = dengue_suit_by_gbd.merge(
        measure_values,
        on=["location_id", "year_id"],
        how="left",
    )

  dengue_gbd_df = pd.read_csv(dengue_gbd_df_path)


In [136]:
# Ratio of the GBD value to total suitability * population, per GBD
# location-year. (The next cell recomputes the same two columns.)
dengue_suit_by_gbd['mort_rf'] = dengue_suit_by_gbd['denv_mort'].div(
    dengue_suit_by_gbd['dengue_suit_population']
)
dengue_suit_by_gbd['inc_rf'] = dengue_suit_by_gbd['denv_inc'].div(
    dengue_suit_by_gbd['dengue_suit_population']
)

In [137]:
# Ratio of each GBD value to total suitability * population.
for ratio_column, gbd_value_column in (('mort_rf', 'denv_mort'), ('inc_rf', 'denv_inc')):
    dengue_suit_by_gbd[ratio_column] = (
        dengue_suit_by_gbd[gbd_value_column] / dengue_suit_by_gbd['dengue_suit_population']
    )

# Where dengue_suit_population is 0 the division is undefined; force both
# ratios to 0 there.
zero_suit_population = dengue_suit_by_gbd['dengue_suit_population'].eq(0)
dengue_suit_by_gbd.loc[zero_suit_population, ['mort_rf', 'inc_rf']] = 0

# Keep only the keys and the two ratios, and rename location_id back to
# gbd_location_id so the table can be merged onto the LSAE-level rows.
dengue_suit_by_gbd = dengue_suit_by_gbd.drop(
    columns=['dengue_suit_population', 'denv_mort', 'denv_inc']
).rename(columns={'location_id': 'gbd_location_id'})

In [138]:
# Attach the GBD-level mort_rf / inc_rf ratios to every LSAE row that shares
# the same gbd_location_id and year_id (left join keeps all LSAE rows).
dengue_df = dengue_suit.merge(dengue_suit_by_gbd, on=["gbd_location_id", "year_id"], how="left")

In [139]:
# Counts are suitability * population scaled by the GBD-level ratio; rates are
# those counts divided by population. The intermediate suitability and ratio
# columns are dropped afterwards.
dengue_df = dengue_df.assign(
    dengue_mort_count=lambda frame: frame['dengue_suit_population'] * frame['mort_rf'],
    dengue_inc_count=lambda frame: frame['dengue_suit_population'] * frame['inc_rf'],
    dengue_mort_rate=lambda frame: frame['dengue_mort_count'] / frame['population'],
    dengue_inc_rate=lambda frame: frame['dengue_inc_count'] / frame['population'],
).drop(columns=['dengue_suit', 'dengue_suit_population', 'mort_rf', 'inc_rf'])

In [140]:
# Write the LSAE-level dengue counts and rates to parquet (row index not stored).
dengue_df.to_parquet(OUTPUT_PATH / "raked_dengue_aa.parquet", index=False)