In [None]:
################################################################
### AGE-SPECIFIC POPULATION DATA PREPARATION
################################################################

###----------------------------------------------------------###
### 1. Setup and Initialization
### Establishes the required libraries, constants, and mapping dictionaries.
### Defines the file paths and data structures for population processing.
###----------------------------------------------------------###
import pandas as pd
import numpy as np
import itertools
import os
import sys
import xarray as xr
from idd_forecast_mbp import constants as rfc
from idd_forecast_mbp.helper_functions import load_yaml_dictionary, parse_yaml_dictionary, read_parquet_with_integer_ids

# Lookup from an age-type key to its display name and its two-letter code.
age_type_map = {
    key: {"name": label, "age_type": code}
    for key, label, code in (
        ("all_age", "All Age", "aa"),
        ("age_specific", "Age-specific", "as"),
    )
}

# Stage directories, each resolved under rfc.MODEL_ROOT with the `/` operator.
RAW_DATA_PATH = rfc.MODEL_ROOT / "01-raw_data"

PROCESSED_DATA_PATH = rfc.MODEL_ROOT / "02-processed_data"
MODELING_DATA_PATH = rfc.MODEL_ROOT / "03-modeling_data"

# File locations derived from the stage directories; these are plain strings
# produced by f-string interpolation, not path objects.
FHS_DATA_PATH = f"{PROCESSED_DATA_PATH}/age_specific_fhs"
hierarchy_df_path = f'{PROCESSED_DATA_PATH}/full_hierarchy_2023_lsae_1209.parquet'
age_metadata_path = f"{FHS_DATA_PATH}/age_metadata.parquet"
fhs_hierarchy_df_path = f"{FHS_DATA_PATH}/fhs_hierarchy.parquet"

# LSAE population file, given as a hard-coded absolute path rather than being
# derived from rfc.MODEL_ROOT. The commented-out line is an alternative
# location that is currently disabled.
# lsae_population_path = "/mnt/team/rapidresponse/pub/climate-aggregates/2025_03_20/results/lsae_1209/population.parquet"
lsae_population_path = "/mnt/team/idd/pub/forecast-mbp/02-processed_data/GBD2023/lsae_1209/population.parquet"

###----------------------------------------------------------###
### 2. Hierarchy Data Loading
### Loads the geographic hierarchies for both the LSAE and FHS systems.
### These hierarchies define the spatial structure for population allocation.
### A `level >= 3` filter on each table exists only as commented-out code
### and is NOT applied here.
###----------------------------------------------------------###
# Read the LSAE hierarchy first, then the FHS hierarchy, with the shared
# parquet helper.
hierarchy_df, fhs_hierarchy_df = (
    read_parquet_with_integer_ids(parquet_path)
    for parquet_path in (hierarchy_df_path, fhs_hierarchy_df_path)
)


In [45]:
GBD_DATA_PATH = f"{RAW_DATA_PATH}/gbd"

###----------------------------------------------------------###
### 3. Age Metadata and Parquet Filters
### Reads the age-group metadata and builds the filter tuples that are
### passed to the parquet reader when loading population data.
###----------------------------------------------------------###
# hierarchy_df and fhs_hierarchy_df are already loaded by the setup cell
# above from the same two paths, so they are not re-read here.

age_metadata_df = read_parquet_with_integer_ids(age_metadata_path)
# Distinct age_group_id values, as a plain Python list.
age_group_ids = age_metadata_df["age_group_id"].unique().tolist()

# Filter tuples in (column, operator, values) form; they are handed to
# read_parquet_with_integer_ids through its `filters=` argument.
age_group_filter = ('age_group_id', 'in', age_group_ids)
aa_age_group_filter = ('age_group_id', 'in', [22])  # All ages
sex_ids = [1, 2]
sex_filter = ('sex_id', 'in', sex_ids)
all_sex_filter = ('sex_id', 'in', [3])  # All sex

##################
# Past FHS Population Data Loading
##################
past_fhs_population_path = f"{GBD_DATA_PATH}/fhs_2023_population.parquet"

# Age-specific slice: rows matching both age_group_filter and sex_filter.
as_past_fhs_population_df = read_parquet_with_integer_ids(
    past_fhs_population_path,
    filters=[[age_group_filter, sex_filter]],
)

# All-age slice: rows matching both aa_age_group_filter and all_sex_filter.
aa_past_fhs_population_df = read_parquet_with_integer_ids(
    past_fhs_population_path,
    filters=[[aa_age_group_filter, all_sex_filter]],
)

##################
# Future FHS Population Data Loading
##################
future_fhs_population_path = "/mnt/share/forecasting/data/33/future/population/20250319_updated_rerun_pop_shifted_etl_417/population.nc"

# Open the NetCDF file once and release the handle when done. Both frames are
# built inside the `with` block so every value is read while the file is
# still open.
with xr.open_dataset(future_fhs_population_path) as future_fhs_population_ds:
    # Age-specific: every age_group_id present in the age metadata and the
    # sexes in sex_ids, averaged over the 'draw' dimension.
    as_future_fhs_population = future_fhs_population_ds.sel(
        age_group_id=age_metadata_df["age_group_id"].unique(),
        sex_id=sex_ids
    ).population.mean(dim='draw')
    as_future_fhs_population_df = as_future_fhs_population.to_dataframe().reset_index()

    # All-age: age_group_id 22 with sex_id 3, averaged over the 'draw' dimension.
    aa_future_fhs_population = future_fhs_population_ds.sel(
        age_group_id=22,
        sex_id=3
    ).population.mean(dim='draw')
    aa_future_fhs_population_df = aa_future_fhs_population.to_dataframe().reset_index()

# drop the scenario
as_future_fhs_population_df = as_future_fhs_population_df.drop(columns=["scenario"])
aa_future_fhs_population_df = aa_future_fhs_population_df.drop(columns=["scenario"])

In [None]:
# Old location_id = 44858
# New location_ids = 60908; 95069; 94364

In [51]:
# Independent copy of the age-specific future forecast rows for
# location_id 44858 (listed as the "old" location id in the note above).
df_1 = as_future_fhs_population_df.loc[
    as_future_fhs_population_df["location_id"].eq(44858)
].copy()
df_1

Unnamed: 0,location_id,year_id,age_group_id,sex_id,population
1894200,44858,2024,2,1,7.651976e+03
1894201,44858,2024,2,2,7.297700e+03
1894202,44858,2024,3,1,2.260897e+04
1894203,44858,2024,3,2,2.169374e+04
1894204,44858,2024,6,1,1.563090e+06
...,...,...,...,...,...
1898045,44858,2100,238,2,2.253635e+05
1898046,44858,2100,388,1,1.006453e+05
1898047,44858,2100,388,2,9.560715e+04
1898048,44858,2100,389,1,1.177949e+05


In [49]:
# Look up age_group_id 2, sex_id 1, year_id 2022 within df_1.
# The recorded output has no rows; the rows of df_1 displayed earlier begin
# at year_id 2024, so presumably 2022 is outside the forecast range.
df_1.loc[
    df_1["age_group_id"].eq(2)
    & df_1["sex_id"].eq(1)
    & df_1["year_id"].eq(2022)
]

Unnamed: 0,location_id,year_id,age_group_id,sex_id,population


In [43]:
future_fhs_population_path = "/mnt/share/forecasting/data/33/future/population/20250319_updated_rerun_pop_shifted_etl_417/population.nc"
# Needs age_metadata_df and sex_ids to exist already; the NameError recorded
# under this cell comes from a run made before sex_ids was defined.
# Selects every age_group_id in the age metadata and the sexes in sex_ids,
# then averages over the 'draw' dimension. The file is opened in a context
# manager so the handle is released, and the frame is built while it is open.
with xr.open_dataset(future_fhs_population_path) as future_fhs_population_ds:
    as_future_fhs_population = future_fhs_population_ds.sel(
        age_group_id=age_metadata_df["age_group_id"].unique(),
        sex_id=sex_ids
    ).population.mean(dim='draw')
    as_future_fhs_population_df = as_future_fhs_population.to_dataframe().reset_index()
# drop the scenario
as_future_fhs_population_df = as_future_fhs_population_df.drop(columns=["scenario"])

NameError: name 'sex_ids' is not defined

In [37]:
# Read the FHS 2023 population file, then keep only the three new
# location_ids (60908, 95069, 94364), sexes 1 and 2, and the age groups
# listed in age_group_ids.
# NOTE(review): the whole file is read before filtering; other cells pass
# `filters=` to the same helper, which might avoid that - TODO confirm it
# works for location_id.
fhs_2023_population_df_path = f"{RAW_DATA_PATH}/gbd/fhs_2023_population.parquet"
fhs_2023_population_df = read_parquet_with_integer_ids(fhs_2023_population_df_path)
fhs_2023_population_df = fhs_2023_population_df.loc[
    fhs_2023_population_df["location_id"].isin([60908, 95069, 94364])
    & fhs_2023_population_df["sex_id"].isin([1, 2])
    & fhs_2023_population_df["age_group_id"].isin(age_group_ids)
]


In [41]:
# Total 2022 population across every row kept in fhs_2023_population_df.
fhs_2023_population_df.loc[
    fhs_2023_population_df["year_id"].eq(2022), "population"
].sum()

22682428.721401677

In [40]:
# NOTE(review): fhs_population_df is not defined in any cell visible here, so
# this cell depends on leftover kernel state - TODO restore its definition.
# Total 2022 population for location_id 44858.
tmp_df = fhs_population_df.loc[fhs_population_df["location_id"].eq(44858)].copy()
tmp_df.loc[tmp_df["year_id"].eq(2022), "population"].sum()

24217135.640529323

In [42]:
# Gap between the two 2022 totals recorded in the cells above. Both values are
# copied from those outputs, so they go stale if the cells are re-run.
old_location_total_2022 = 24217135.640529323   # location_id 44858 (tmp_df sum)
new_locations_total_2022 = 22682428.721401677  # location_ids 60908, 95069, 94364 combined
old_location_total_2022 - new_locations_total_2022

1534706.9191276468

In [15]:
# Sum population over all rows sharing a (year_id, age_group_id, sex_id)
# combination, giving one row per combination.
sum_df = (
    fhs_2023_population_df
    .groupby(["year_id", "age_group_id", "sex_id"])["population"]
    .sum()
    .reset_index()
)

In [39]:
# Difference between two hard-coded population totals.
# NOTE(review): neither operand matches an output visible in this notebook;
# presumably both were pasted from an earlier run - TODO confirm which totals
# these are.
24901036.30881269 - 23241084.916266542

1659951.392546147

In [None]:
# Old location_id = 44858
# New location_ids = 60908; 95069; 94364

In [None]:
# Rebinds future_fhs_population_path to a different forecast file (under
# data/9, named mean.nc) from the population.nc path assigned in earlier cells.
# NOTE(review): nothing in this cell reads from the new path - TODO confirm
# whether this override is still needed.
future_fhs_population_path = "/mnt/share/forecasting/data/9/future/population/20250219_draining_fix_old_pop_v5/mean.nc"

Unnamed: 0,age_group_id,location_id,year_id,sex_id,population,population_aa,pop_fraction_aa
1474700,2,44858,2022,1,7792.544513,24217140.0,0.000322
