In [1]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
from idd_forecast_mbp import constants as rfc
from idd_forecast_mbp.helper_functions import merge_dataframes, read_income_paths, read_urban_paths, level_filter
from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet

FORECASTING_DATA_PATH = rfc.MODEL_ROOT / "04-forecasting_data"
PROCESSED_DATA_PATH = rfc.MODEL_ROOT / "02-processed_data"
aa_full_population_df_path = f"{PROCESSED_DATA_PATH}/aa_2023_full_population_df.parquet"

hierarchy_df_path = f'{PROCESSED_DATA_PATH}/full_hierarchy_lsae_1209.parquet'
hierarchy_df = read_parquet_with_integer_ids(hierarchy_df_path)

dah_ref_df_path = f"{PROCESSED_DATA_PATH}/GK_dah_ref_df_2025_07_08.parquet"
dah_cut20_df_path = f"{PROCESSED_DATA_PATH}/GK_dah_cut20_df_2025_07_08.parquet"

In [2]:
DAH_GK_DIR = '/share/resource_tracking/forecasting/dah_channel_HFA/GK_2025_resubmission'

dah_df_path = 'dah_by_channel_hfa_recip_1990_2100_scenarios.csv'
dah_df = pd.read_csv(f'{DAH_GK_DIR}/{dah_df_path}')
dah_df = dah_df[(dah_df['hfa'] == 'mal') & (dah_df['year'] >= 2000)]
dah_df = dah_df.groupby(['year', 'recip']).agg({'dah_ref': 'sum', 'dah_cut20': 'sum'}).reset_index()
dah_ref_df = dah_df.rename(columns={'recip': 'iso3', 'dah_ref': 'mal_DAH_total', 'year': 'year_id'}).copy()
dah_cut20_df = dah_df.rename(columns={'recip': 'iso3', 'dah_cut20': 'mal_DAH_total', 'year': 'year_id'}).copy()

A0_hierarchy_df = hierarchy_df[hierarchy_df['level'] == 3].copy()
A0_hierarchy_df = A0_hierarchy_df[['location_id', 'location_name', 'ihme_loc_id']].drop_duplicates().reset_index(drop=True)
A0_hierarchy_df = A0_hierarchy_df.rename(columns={'ihme_loc_id': 'iso3'})

dah_ref_df = dah_ref_df.merge(A0_hierarchy_df, on='iso3', how='inner')
dah_cut20_df = dah_cut20_df.merge(A0_hierarchy_df, on='iso3', how='inner')
A0_location_filter = ('location_id', 'in', A0_hierarchy_df['location_id'].unique().tolist())
pop_df = read_parquet_with_integer_ids(aa_full_population_df_path, filters=[A0_location_filter])
dah_ref_df = dah_ref_df.merge(pop_df, on=['location_id', 'year_id'], how='left')
dah_cut20_df = dah_cut20_df.merge(pop_df, on=['location_id', 'year_id'], how='left')

dah_ref_df['mal_DAH_total_per_capita'] = dah_ref_df['mal_DAH_total'] / dah_ref_df['population']
dah_cut20_df['mal_DAH_total_per_capita'] = dah_cut20_df['mal_DAH_total'] / dah_cut20_df['population']

write_parquet(dah_ref_df,dah_ref_df_path)
write_parquet(dah_cut20_df,dah_cut20_df_path)

üóëÔ∏è Removed existing file: /mnt/team/idd/pub/forecast-mbp/02-processed_data/GK_dah_ref_df_2025_07_08.parquet
‚úÖ Metadata validation passed for /mnt/team/idd/pub/forecast-mbp/02-processed_data/GK_dah_ref_df_2025_07_08.parquet
üóëÔ∏è Removed existing file: /mnt/team/idd/pub/forecast-mbp/02-processed_data/GK_dah_cut20_df_2025_07_08.parquet
‚úÖ Metadata validation passed for /mnt/team/idd/pub/forecast-mbp/02-processed_data/GK_dah_cut20_df_2025_07_08.parquet


True

In [3]:
dah_ref_df

Unnamed: 0,year_id,iso3,mal_DAH_total,dah_cut20,location_id,location_name,population,mal_DAH_total_per_capita
0,2000,AFG,2.062655e+06,2.062655e+06,160,Afghanistan,1.835925e+07,0.112350
1,2000,AGO,6.571231e+05,6.571231e+05,168,Angola,1.524243e+07,0.043111
2,2000,ALB,6.167880e+04,6.167880e+04,43,Albania,3.213397e+06,0.019194
3,2000,ARG,5.348713e+05,5.348713e+05,97,Argentina,3.698223e+07,0.014463
4,2000,ARM,6.767346e+04,6.767346e+04,33,Armenia,3.341256e+06,0.020254
...,...,...,...,...,...,...,...,...
14223,2100,WSM,1.807737e+05,4.393519e+04,27,Samoa,6.027632e+05,0.299908
14224,2100,YEM,2.190201e+06,2.914000e+06,157,Yemen,5.149925e+07,0.042529
14225,2100,ZAF,1.312813e+07,7.133912e+06,196,South Africa,1.030486e+08,0.127397
14226,2100,ZMB,4.921065e+07,4.350870e+07,191,Zambia,5.506847e+07,0.893627


In [4]:
dah_cut20_df

Unnamed: 0,year_id,iso3,dah_ref,mal_DAH_total,location_id,location_name,population,mal_DAH_total_per_capita
0,2000,AFG,2.062655e+06,2.062655e+06,160,Afghanistan,1.835925e+07,0.112350
1,2000,AGO,6.571231e+05,6.571231e+05,168,Angola,1.524243e+07,0.043111
2,2000,ALB,6.167880e+04,6.167880e+04,43,Albania,3.213397e+06,0.019194
3,2000,ARG,5.348713e+05,5.348713e+05,97,Argentina,3.698223e+07,0.014463
4,2000,ARM,6.767346e+04,6.767346e+04,33,Armenia,3.341256e+06,0.020254
...,...,...,...,...,...,...,...,...
14223,2100,WSM,1.807737e+05,4.393519e+04,27,Samoa,6.027632e+05,0.072890
14224,2100,YEM,2.190201e+06,2.914000e+06,157,Yemen,5.149925e+07,0.056583
14225,2100,ZAF,1.312813e+07,7.133912e+06,196,South Africa,1.030486e+08,0.069229
14226,2100,ZMB,4.921065e+07,4.350870e+07,191,Zambia,5.506847e+07,0.790084


In [1]:
import pandas as pd
from pathlib import Path

# --- Configuration (Assumed) ---
DAH_GK_DIR = Path('/share/resource_tracking/forecasting/dah_channel_HFA/GK_2025_resubmission')
dah_df_path = 'dah_by_channel_hfa_recip_1990_2100_scenarios.csv'
FULL_INPUT_PATH = DAH_GK_DIR / dah_df_path

# --- Check Source Columns ---
print(f"--- Checking Source DAH Columns in: {FULL_INPUT_PATH.name} ---")

try:
    # 1. Load the data
    dah_source_df = pd.read_csv(FULL_INPUT_PATH)
    
    # 2. Apply filtering and grouping logic from the pipeline
    dah_source_df = dah_source_df[(dah_source_df['hfa'] == 'mal') & (dah_source_df['year'] >= 2000)]
    
    # 3. Group and aggregate the two critical columns
    dah_agg_df = dah_source_df.groupby(['year', 'recip']).agg({
        'dah_ref': 'sum', 
        'dah_cut20': 'sum'
    }).reset_index()
    
    # 4. Perform the comparison
    dah_ref_col = dah_agg_df['dah_ref']
    dah_cut20_col = dah_agg_df['dah_cut20']
    
    is_identical = dah_ref_col.equals(dah_cut20_col)
    
    print(f"Total rows after aggregation: {len(dah_agg_df)}")
    
    if is_identical:
        print("\n‚ùå CRITICAL FINDING: The 'dah_ref' and 'dah_cut20' columns in the source data are IDENTICAL.")
        print("This is the root cause of the previous file duplication.")
        print("The final output files (dah_ref_df_path and dah_cut20_df_path) WILL BE IDENTICAL.")
    else:
        # Check for differences
        diff_count = (dah_ref_col != dah_cut20_col).sum()
        print(f"\n‚úÖ The columns are DIFFERENT in {diff_count} out of {len(dah_agg_df)} rows.")
        print("The final output files should be different, meaning the pipeline source data is correct.")

except FileNotFoundError:
    print(f"Error: Input file not found at {FULL_INPUT_PATH}.")
except Exception as e:
    print(f"An unexpected error occurred during processing: {e}")

--- Checking Source DAH Columns in: dah_by_channel_hfa_recip_1990_2100_scenarios.csv ---
Total rows after aggregation: 14329

‚úÖ The columns are DIFFERENT in 10650 out of 14329 rows.
The final output files should be different, meaning the pipeline source data is correct.
