In [1]:
import pandas as pd
import numpy as np
import os
import sys
import itertools
from idd_forecast_mbp import constants as rfc
from idd_forecast_mbp.helper_functions import read_parquet_with_integer_ids

PROCESSED_DATA_PATH = rfc.MODEL_ROOT / "02-processed_data"
MODELING_DATA_PATH = rfc.MODEL_ROOT / "03-modeling_data"
FHS_DATA_PATH = f"{PROCESSED_DATA_PATH}/age_specific_fhs"

cause = "dengue"
cause_id = rfc.cause_map[cause]["cause_id"]

aa_fhs_data_path_template = "{FHS_DATA_PATH}/aa_cause_id_{cause_id}_measure_id_{measure_id}_metric_id_{metric_id}_fhs.parquet"
as_fhs_data_path_template = "{FHS_DATA_PATH}/as_cause_id_{cause_id}_measure_id_{measure_id}_metric_id_{metric_id}_fhs.parquet"

In [20]:
as_lsae_population_path = f"{MODELING_DATA_PATH}/as_lsae_population_df.parquet"

age_metadata_path = f"{FHS_DATA_PATH}/age_metadata.parquet"

dengue_stage_2_df_path = f"{MODELING_DATA_PATH}/dengue_stage_2_modeling_df.parquet"
df = read_parquet_with_integer_ids(dengue_stage_2_df_path)

In [25]:
df[(df["location_id"] == 50558) & (df["year_id"] == 2002)]

Unnamed: 0,location_id,year_id,most_detailed_lsae,gbd_location_id,population,aa_dengue_mort_count,aa_dengue_inc_count,aa_dengue_mort_rate,aa_dengue_inc_rate,aa_dengue_cfr,...,dengue_suitability,people_flood_days,people_flood_days_per_capita,log_gdppc_mean,logit_urban_1km_threshold_300,logit_urban_100m_threshold_300,logit_urban_1km_threshold_1500,logit_urban_100m_threshold_1500,yn,A0_af
5131,50558,2002,1,4756,2217502.75,2.945136,36925.029872,1e-06,0.016652,8e-05,...,82.908051,948543.8125,0.427753,9.769827,2.722538,4.493297,1.252636,1.923876,1,A0_135


In [27]:


year_ids = df.year_id.unique()
lsae_location_ids = df.location_id.unique()
fhs_location_ids = df.fhs_location_id.unique()

year_filter = ('year_id', 'in', year_ids)
lsae_location_filter = ('location_id', 'in', lsae_location_ids)
fhs_location_filter = ('location_id', 'in', fhs_location_ids)

as_lsae_population = pd.read_parquet(as_lsae_population_path,
                                     filters=[[year_filter, lsae_location_filter]])

hierarchy_df_path = f'{PROCESSED_DATA_PATH}/full_hierarchy_lsae_1209.parquet'
hierarchy_df = read_parquet_with_integer_ids(hierarchy_df_path)

age_metadata_df = read_parquet_with_integer_ids(age_metadata_path)
age_group_ids = age_metadata_df["age_group_id"].unique()
sex_ids = [1, 2]  # 1
combinations = list(itertools.product(age_group_ids, sex_ids))
as_df = pd.DataFrame(combinations, columns=['age_group_id', 'sex_id'])

In [None]:
df.rename(
    columns={
        "population": "aa_population",
    },
    inplace=True,
)

df = df.merge(
    as_df,
    how = "cross")

df = df.merge(
    as_lsae_population[["location_id", "year_id", "age_group_id", "sex_id", "population"]],
    how = "left",
    on = ["location_id", "year_id", "age_group_id", "sex_id"])

In [35]:
as_lsae_population[(as_lsae_population["location_id"] == 50558) & (as_lsae_population["year_id"] == 2002) & (as_lsae_population["age_group_id"] == 235)]

Unnamed: 0,location_id,year_id,population_aa,age_group_id,sex_id,fhs_location_id,pop_fraction_aa,population
117442,50558,2002,2217502.75,235,1,4756,1.170022e-07,0.259453
117443,50558,2002,2217502.75,235,2,4756,1.164687e-06,2.582697


In [None]:


# Get dengue mortality counts for age specific
as_fhs_df_path = as_fhs_data_path_template.format(
    FHS_DATA_PATH=FHS_DATA_PATH,
    cause_id=cause_id,
    measure_id=1,
    metric_id=1
)
as_fhs_df = read_parquet_with_integer_ids(as_fhs_df_path,
                                     filters=[[year_filter, fhs_location_filter]])
# Rename location_id to fhs_location_id

as_fhs_df = as_fhs_df.rename(columns={
    "location_id": "fhs_location_id",
    "val": "as_fhs_dengue_mort_count"})

df = df.merge(
    as_fhs_df[["fhs_location_id", "year_id",
               "age_group_id","sex_id",
               "as_fhs_dengue_mort_count"]],
    how = "left",
    on = ["fhs_location_id", "year_id", 
          "age_group_id", "sex_id"])


# Get dengue mortality counts for age specific
aa_fhs_df_path = aa_fhs_data_path_template.format(
    FHS_DATA_PATH=FHS_DATA_PATH,
    cause_id=cause_id,
    measure_id=1,
    metric_id=1
)
aa_fhs_df = read_parquet_with_integer_ids(aa_fhs_df_path,
                                     filters=[[year_filter, fhs_location_filter]])
# Rename location_id to fhs_location_id
aa_fhs_df = aa_fhs_df.rename(columns={
    "location_id": "fhs_location_id",
    "val": "aa_fhs_dengue_mort_count"})

df = df.merge(
    aa_fhs_df[["fhs_location_id", "year_id",
                       "aa_fhs_dengue_mort_count"]],
    how = "left",
    on = ["fhs_location_id", "year_id"])



In [14]:
df[(df["location_id"] == 50558) & (df["year_id"] == 2002) & (df["age_group_id"] == 235) & (df["sex_id"] == 1)]["fhs_location_id"]

256592    4756
Name: fhs_location_id, dtype: int64

In [29]:
# Get dengue incidence counts for age specific
as_fhs_df_path = as_fhs_data_path_template.format(
    FHS_DATA_PATH=FHS_DATA_PATH,
    cause_id=cause_id,
    measure_id=6,
    metric_id=1
)
as_fhs_df = read_parquet_with_integer_ids(as_fhs_df_path,
                                     filters=[[year_filter, fhs_location_filter]])
# Rename location_id to fhs_location_id
as_fhs_df = as_fhs_df.rename(columns={
    "location_id": "fhs_location_id",
    "val": "as_fhs_dengue_inc_count"})



# Get dengue incidence counts for age specific
aa_fhs_df_path = aa_fhs_data_path_template.format(
    FHS_DATA_PATH=FHS_DATA_PATH,
    cause_id=cause_id,
    measure_id=6,
    metric_id=1
)
aa_fhs_df = read_parquet_with_integer_ids(aa_fhs_df_path,
                                     filters=[[year_filter, fhs_location_filter]])
# Rename location_id to fhs_location_id
aa_fhs_df = aa_fhs_df.rename(columns={
    "location_id": "fhs_location_id",
    "val": "aa_fhs_dengue_inc_count"})

In [30]:
as_fhs_df[(as_fhs_df["fhs_location_id"] == 4756) & (as_fhs_df["year_id"] == 2002) & (as_fhs_df["age_group_id"] == 235) & (as_fhs_df["sex_id"] == 1)]

Unnamed: 0,age_group_id,fhs_location_id,cause_id,measure_id,metric_id,sex_id,year_id,as_fhs_dengue_inc_count,upper,lower,parent_id,path_to_top_parent,level,most_detailed,location_name,super_region_id,super_region_name,region_id,region_name,age_group_name
169423,235,4756,357,6,1,1,2002,9.718489,13.534016,6.732197,135,11031341354756,4,1,Distrito Federal,103,Latin America and Caribbean,134,Tropical Latin America,95 plus


In [31]:
aa_fhs_df[(aa_fhs_df["fhs_location_id"] == 4756) & (aa_fhs_df["year_id"] == 2002)]

Unnamed: 0,fhs_location_id,age_group_id,cause_id,measure_id,metric_id,sex_id,year_id,aa_fhs_dengue_inc_count,upper,lower,parent_id,path_to_top_parent,level,most_detailed,location_name,super_region_id,super_region_name,region_id,region_name,age_group_name
2348,4756,22,357,6,1,3,2002,36925.029872,51799.229946,25651.94047,135,11031341354756,4,1,Distrito Federal,103,Latin America and Caribbean,134,Tropical Latin America,All Age


In [32]:


df = df.merge(
    as_fhs_df[["fhs_location_id", "year_id",
               "age_group_id","sex_id",
               "as_fhs_dengue_inc_count"]],
    how = "left",
    on = ["fhs_location_id", "year_id", 
          "age_group_id", "sex_id"])

df = df.merge(
    aa_fhs_df[["fhs_location_id", "year_id",
                       "aa_fhs_dengue_inc_count"]],
    how = "left",
    on = ["fhs_location_id", "year_id"])


df["as_dengue_inc_fraction"] = df["as_fhs_dengue_inc_count"] / df["aa_fhs_dengue_inc_count"]
df["as_dengue_mort_fraction"] = df["as_fhs_dengue_mort_count"] / df["aa_fhs_dengue_mort_count"]

df["as_dengue_mort_count"] = df["as_dengue_mort_fraction"] * df["aa_dengue_mort_count"]
df["as_dengue_inc_count"] = df["as_dengue_inc_fraction"] * df["aa_dengue_inc_count"]


#
df["as_dengue_inc_rate"] = df["as_dengue_inc_count"] / df["population"]
df["as_dengue_mort_rate"] = df["as_dengue_mort_count"] / df["population"] 

In [33]:
df[(df["location_id"] == 50558) & (df["year_id"] == 2002) & (df["age_group_id"] == 235) & (df["sex_id"] == 1)][["aa_dengue_inc_count", "as_dengue_inc_count", "as_dengue_inc_fraction", "population", "as_dengue_inc_rate", "aa_population", "population"]]

Unnamed: 0,aa_dengue_inc_count,as_dengue_inc_count,as_dengue_inc_fraction,population,as_dengue_inc_rate,aa_population,population.1
256592,36925.029872,9.718489,0.000263,0.259453,37.457643,2217502.75,0.259453


In [None]:






# Drop all columns that have fhs_dengue in them
drop_columns = [col for col in df.columns if "fhs_dengue" in col]
df = df.drop(columns=drop_columns)

df["as_dengue_cfr"] = df["as_dengue_mort_count"] / df["as_dengue_inc_count"]

# Set the CFR to 0 if dengue_inc_count is 0
df.loc[df["as_dengue_inc_count"] == 0, "as_dengue_cfr"] = 0


In [8]:
df[(df["location_id"] == 50558) & (df["year_id"] == 2002) & (df["age_group_id"] == 235) & (df["sex_id"] == 1)][["aa_dengue_inc_count", "as_dengue_inc_count", "as_dengue_inc_fraction", "population", "as_dengue_inc_rate", "aa_population"]]

Unnamed: 0,aa_dengue_inc_count,as_dengue_inc_count,as_dengue_inc_fraction,population,as_dengue_inc_rate,aa_population
256592,36925.029872,9.718489,0.000263,0.259453,37.457643,2217502.75


In [None]:

df.to_parquet(f"{MODELING_DATA_PATH}/as_dengue_stage_2_modeling_df.parquet", index=False)