In [2]:
import pandas as pd
import numpy as np, os
import matplotlib.pyplot as plt

from pathlib import Path
import yaml
import re

import gbd_mapping as gm
from vivarium import Artifact

from db_queries import get_ids, get_outputs, get_population, get_covariate_estimates
from get_draws.api import get_draws

import vivarium_helpers as vh
import vivarium_helpers.id_helper as idh
from vivarium_helpers.vph_output.operations import VPHOperator

!date
!whoami
!pwd

Thu Oct 30 06:36:40 PDT 2025
lutzes
/mnt/share/homes/lutzes/vivarium_research_alzheimers


# Load Needed Data

In [3]:
# Project directory
%cd /mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers


In [4]:
# Locations to process. Un-comment a location to include it.
locations = [
    'United States of America',
    'Brazil',
    # 'China',
    # 'Germany',
    # 'Israel',
    # 'Japan',
    # 'Spain',
    # 'Sweden',
    # 'Taiwan (Province of China)',
    # 'United Kingdom',
]

# Shorter display names for plotting: every location maps to itself
# unless one of the abbreviations below overrides it.
location_to_short_name = {
    **{name: name for name in locations},
    'Taiwan (Province of China)': 'Taiwan',
    'United Kingdom': 'UK',
    'United States of America': 'USA',
}

# # Select a subset of locations to draw plots for
# locations_to_plot = locations[:2]

project_dir = '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/'

# Model number; artifacts live under artifacts/model<model_number>/
model_number = '8.3'
run_subdirectories = [
    'results/abie_consistent_model_test/united_states_of_america/2025_10_28_08_55_05/',
]
run_dirs = [f'{project_dir}{subdirectory}' for subdirectory in run_subdirectories]
results_dirs = [f'{directory}results/' for directory in run_dirs]

# # Option 1: One results directory per location
# location_to_results_dir = {
#     loc: path for loc, path in zip(locations, results_dirs)}

# Option 2: All locations in one results directory
location_to_results_dir = {'all': results_dirs[0]}

# Artifact files are named after the lower-cased, underscore-separated location
location_to_artifact_subdir = {
    name: name.lower().replace(' ', '_') for name in locations
}
artifact_subpaths = [
    f'artifacts/model{model_number}/{stem}.hdf'
    for stem in location_to_artifact_subdir.values()
]
location_to_artifact_path = dict(
    zip(locations, (project_dir + relative for relative in artifact_subpaths))
)
# Reverse lookup: artifact path -> location name
artifact_path_to_location = {
    artifact: name for name, artifact in location_to_artifact_path.items()
}
artifact_path_to_location

{'/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf': 'United States of America',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf': 'Brazil'}

# Get list of draws and draw columns from `keyspace.yaml`, and reduce to a subset of draws to save memory and time

In [5]:
# Read the keyspace of the first run directory to find out which input
# draws the simulation was run with.
keyspace_path = run_dirs[0] + 'keyspace.yaml'
with open(keyspace_path, 'r') as keyspace_file:
    keyspace = yaml.safe_load(keyspace_file)

draws = keyspace['input_draw']
print(draws)

[457, 169, 323, 392, 346]


In [6]:
# Cap the analysis at the first 10 draws listed in the keyspace and sort
# them. With 10 or fewer draws in the run (as here) this keeps every draw,
# so in practice all draws are used.
draws = sorted(draws[:10])
draw_cols = ['draw_' + str(draw) for draw in draws]
print(draw_cols)

['draw_169', 'draw_323', 'draw_346', 'draw_392', 'draw_457']


# Load one artifact and define age bins

In [7]:
usa_artifact_path = location_to_artifact_path['United States of America']
usa_art = Artifact(usa_artifact_path)
# print(usa_art.load('metadata.locations'))
# print(usa_art)

In [8]:
# The artifact returns the age bins as an empty DataFrame whose MultiIndex
# holds the age group data, so move the index levels into columns.
age_bins = usa_art.load('population.age_bins')
age_dictionary = age_bins.reset_index()
# 'age_group' is the age group name with spaces replaced by underscores
age_dictionary = age_dictionary.assign(
    age_group=age_dictionary['age_group_name'].str.replace(' ', '_')
)
# Filter to ages that actually appear in our sim
age_dictionary = age_dictionary.loc[age_dictionary['age_start'] >= 25]

In [9]:
# Build `scale`: one row per location / sex / age group / year holding the
# population structure plus `ratio`, the factor used downstream to convert
# simulated counts into real-world counts (value / ratio).
#
# Frames are collected in a list and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loops re-copies all earlier rows on
# every iteration.
scale_frames = []
for location in locations:
    art = Artifact(location_to_artifact_path[location])
    # Load the population structure once; it is needed both for the ratio
    # and for the per-year rows of `scale`.
    population_structure = art.load('population.structure')

    # Pair each scaling-factor draw with the matching 2025 population draw
    df_prev_pop = pd.merge(
        art.load('population.scaling_factor').query("year_start == 2025"),
        population_structure.query("year_start==2025").droplevel(['year_start', 'year_end']),
        left_index=True,
        right_index=True,
        suffixes=['_prev', '_pop']
    )
    merged_draws = df_prev_pop.filter(like='draw_')
    # Product of the two draw sets, averaged over draws, then summed over rows
    prev = (
        (merged_draws.filter(like='_prev') * merged_draws.filter(like='_pop').values)
        .mean(axis=1)
        .sum(axis=0)
    )
    # TODO: use draw-specific scale instead of mean

    ratio = 100_000 / prev
    print(ratio)

    location_scale = (
        population_structure
        .reset_index()
        .assign(location=location, ratio=ratio)
        .rename(columns={'year_start': 'event_year'})
        .merge(age_dictionary, on=['age_start', 'age_end'])
    )
    scale_frames.append(location_scale)

    # Re-use the 2050 rows for every year from 2051 through 2099
    rows_2050 = location_scale.loc[location_scale['event_year'] == 2050]
    scale_frames.extend(
        rows_2050.assign(event_year=year) for year in range(2051, 2100)
    )

# pd.concat raises on an empty list, so keep the empty-frame result when
# `locations` is empty
scale = pd.concat(scale_frames, ignore_index=True) if scale_frames else pd.DataFrame()
#scale.head()

0.02068399173333995
0.05127553537168728


# Create VPHOperator object to perform operations on simulation output

In [10]:
# Operator object used below (in summarize_sim_data) to describe simulation
# output. 'location' is appended to this instance's index_cols list.
ops = VPHOperator()
ops.index_cols.append('location')
ops.index_cols

# Only this last expression is displayed by the cell. Presumably it is shown
# to check that the append above did not also change the module-level
# default list -- TODO confirm.
vh.vph_output.operations.INDEX_COLUMNS

['input_draw', 'scenario']

# Prevalence

In [11]:
location_to_results_dir.items()

dict_items([('all', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/abie_consistent_model_test/united_states_of_america/2025_10_28_08_55_05/results/')])

In [12]:
def load_sim_output(
        measure,
        results_dict=location_to_results_dir,
        artifact_path_to_location=artifact_path_to_location,
        drop_superfluous_cols=True, # drop redundant or empty columns
        force_categorical=True,
        aggregate_seeds=True,
        raw=False, # Overrides other parameters if True
        **kwargs, # keyword args to pass to .read_parquet
    ):
    """Read `<measure>.parquet` from every results directory and
    concatenate the pieces.

    Unless disabled, each piece is slimmed down before concatenation:
    redundant and all-missing columns are dropped, non-float columns are
    made categorical, and values are aggregated over 'random_seed'.
    Passing raw=True turns all three steps off. When a piece has an
    'artifact_path' column, a 'location' column is derived from it via
    `artifact_path_to_location`.
    """
    if raw:
        drop_superfluous_cols = force_categorical = aggregate_seeds = False

    # (column to keep, column to drop when it is an exact copy)
    redundant_pairs = (
        ('input_draw', 'input_draw_number'),
        ('entity', 'sub_entity'),
    )

    frames = []
    for directory in results_dict.values():
        output = pd.read_parquet(Path(directory) / f'{measure}.parquet', **kwargs)
        if drop_superfluous_cols:
            # Drop redundant columns
            duplicated = [
                copy_col for kept_col, copy_col in redundant_pairs
                if kept_col in output and copy_col in output
                and output[kept_col].equals(output[copy_col])
            ]
            output = output.drop(columns=duplicated)
            # Drop empty columns
            all_missing = [col for col in output if output[col].isna().all()]
            output = output.drop(columns=all_missing)
        if force_categorical:
            convert_to_categorical(output, inplace=True)
        if aggregate_seeds:
            # Use default index and value columns when aggregating
            output = vh.vph_output.operations.marginalize(output, 'random_seed')
        if 'artifact_path' in output:
            output['location'] = output['artifact_path'].map(artifact_path_to_location)
        frames.append(output)
    return pd.concat(frames)

# TODO: Consider making certain columns ordered Categoricals
def convert_to_categorical(df, inplace=False):
    """Convert all columns except float columns to categorical. This
    saves lots of memory, allowing us to load and manipulate larger
    DataFrames.

    Float columns of any width (float16/32/64 and pandas nullable
    floats) and columns that are already categorical are left as they
    are. Comparing the dtype against the string 'float' only matches
    float64, which would turn e.g. float32 value columns into
    categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to convert.
    inplace : bool, default False
        If True, modify `df` itself and return None. Otherwise work on
        a copy and return it.

    Returns
    -------
    pandas.DataFrame or None
        The converted copy, or None when `inplace` is True.
    """
    if not inplace:
        df = df.copy()
    for col in df:
        dtype = df[col].dtype
        if isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_float_dtype(dtype):
            continue
        df[col] = df[col].astype('category')
    return None if inplace else df

# NOTE: Differs from version in Vivarium Helpers in that here,
# dropna=False
def marginalize(
    df:pd.DataFrame,
    marginalized_cols,
    value_cols=None,
    reset_index=True,
    func='sum',
    args=(), # Positional args to pass to func in DataFrameGroupBy.agg
    **kwargs, # Keywords to pass to DataFrameGroupBy.agg
)->pd.DataFrame:
    """Aggregate `value_cols` over `marginalized_cols`.

    The data are grouped by every column that is neither marginalized
    nor a value column, and `func` ('sum' by default) is applied to the
    value columns of each group. Rows with missing values in the
    grouping columns are kept (dropna=False). With reset_index=True the
    grouping columns are returned as ordinary columns.
    """
    if value_cols is None:
        value_cols = vh.vph_output.operations.value_col
    collapsed = vh.utils._ensure_iterable(marginalized_cols)
    values = vh.utils._ensure_iterable(value_cols)
    # Index levels named in `collapsed` are moved into columns so that
    # level names can be passed as well as column names
    df = vh.utils._ensure_columns_not_levels(df, collapsed)
    excluded = [*collapsed, *values]
    # groupby needs a plain list rather than an Index
    groupby_cols = df.columns.difference(excluded).to_list()
    grouped = df.groupby(
        groupby_cols,
        as_index=(not reset_index),
        observed=True, # needed for Categorical data
        dropna=False,
    )
    return grouped[values].agg(func, *args, **kwargs)

def summarize_sim_data(df, age_dictionary=age_dictionary):
    """Describe simulation data with `ops.describe`, for plotting.

    If the data have an 'age_group' column, the age dictionary is merged
    in first so that its columns (e.g. age_start) are available. The
    '2.5%' and '97.5%' columns of the description are renamed to 'lower'
    and 'upper' to match the artifact.
    """
    has_age_groups = 'age_group' in df
    data = df.merge(age_dictionary, on='age_group') if has_age_groups else df
    percentile_names = {'2.5%': 'lower', '97.5%': 'upper'}
    return ops.describe(data).rename(columns=percentile_names)


In [13]:
def dataframe_beutification_and_summarizing(df, measure_name, scale_df=None):
    """Scale disease-state simulation output and summarize it across draws.

    The values are divided by the location-specific `ratio`, duplicated
    into a 'Number' and a 'Rate per 100,000' metric, relabelled for
    presentation, summed within each draw, and then described across
    draws (mean and 2.5% / 97.5% percentiles).

    Parameters
    ----------
    df : pandas.DataFrame
        Simulation output with columns 'location', 'sex', 'age_group',
        'event_year', 'scenario', 'sub_entity', 'input_draw' and 'value'.
        'scenario' and 'sub_entity' must be categorical. The frame is
        not modified.
    measure_name : str
        Label written to the 'Measure' column.
    scale_df : pandas.DataFrame, optional
        Table with columns 'location', 'sex', 'age_group', 'event_year'
        and 'ratio'. Defaults to the notebook-level `scale` table.

    Returns
    -------
    pandas.DataFrame
        One row per Year ID / Location / Age / Sex / Disease Stage /
        Scenario / Measure / Metric with 'Mean', '95% UI Lower' and
        '95% UI Upper'. Rows without a match in `scale_df` are dropped
        by the inner merge.
    """
    if scale_df is None:
        scale_df = scale

    group_cols = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage',
                  'Scenario', 'Measure', 'Metric']

    # Add in the scale factor multiplication. assign() returns a new frame,
    # so the caller's DataFrame keeps its original 'event_year' column.
    df = df.assign(event_year=df['event_year'].astype(int))
    df = df.merge(
        scale_df[['location', 'sex', 'age_group', 'ratio', 'event_year']],
        on=['location', 'sex', 'age_group', 'event_year'])
    df['value'] = df['value'] / df['ratio']

    # Need to set this up for number and rate to be included
    df['Metric'] = 'Number'
    df_rate = df.copy()
    # NOTE(review): this divides the scaled count by 100,000 rather than by a
    # population denominator -- confirm this is the intended "rate".
    df_rate['value'] = df_rate['value'] / 100_000
    df_rate['Metric'] = 'Rate per 100,000'
    df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex': 'Sex',
                            'scenario': 'Scenario',
                            'sub_entity': 'Disease Stage'})
    df['Measure'] = measure_name
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment': 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = df['Disease Stage'].cat.rename_categories({
        'alzheimers_blood_based_biomarker_state': 'Preclinical AD',
        'alzheimers_mild_cognitive_impairment_state': 'MCI due to AD',
        'alzheimers_disease_state': 'AD Dementia'
    })

    # Now we summarize the data: first total within each draw, then describe
    # across draws. observed=True keeps only category combinations that
    # actually occur, so no all-zero / all-NaN rows are invented.
    per_draw = (
        df.groupby(group_cols + ['input_draw'], observed=True)
        .value.sum()
        .reset_index()
    )
    summary = (
        per_draw.groupby(group_cols, observed=True)
        .value.describe(percentiles=[0.025, 0.975])
        .reset_index()
    )
    summary = summary.rename(columns={'mean': 'Mean',
                                      '2.5%': '95% UI Lower',
                                      '97.5%': '95% UI Upper'})

    # Reorder the columns
    return summary[group_cols + ['Mean', '95% UI Lower', '95% UI Upper']]

In [14]:
prevalence = load_sim_output(
    'person_time_alzheimers_disease_and_other_dementias',
    )
prevalence.head()

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,sub_entity,treatment,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,person_time,baseline,Female,alzheimers_blood_based_biomarker_state,susceptible_to_treatment,0.0,
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,person_time,baseline,Female,alzheimers_blood_based_biomarker_state,waiting_for_treatment,0.0,
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,person_time,baseline,Female,alzheimers_blood_based_biomarker_state,full_effect_long,0.0,
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,person_time,baseline,Female,alzheimers_blood_based_biomarker_state,full_effect_short,0.0,
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,person_time,baseline,Female,alzheimers_blood_based_biomarker_state,waning_effect_long,0.0,


In [15]:
prevalence_final = dataframe_beutification_and_summarizing(prevalence, 'Prevalent person-time')

In [16]:
prevalence_final.loc[(prevalence_final['Year ID'] == 2025) & (prevalence_final['Age'] == '80_to_84') & (prevalence_final['Sex'] == 'Female') & (prevalence_final['Disease Stage'] == 'AD Dementia') & (prevalence_final['Scenario'] == 'Reference') & (prevalence_final['Metric'] == 'Number')]

## Looked at past model V&V and values for 80-84 females for clinical AD were ~300,000 in the USA, so this seems reasonable

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
402,2025,Brazil,80_to_84,Female,AD Dementia,Reference,Prevalent person-time,Number,94879.424603,91961.149083,97877.386633
942,2025,United States of America,80_to_84,Female,AD Dementia,Reference,Prevalent person-time,Number,246359.65303,230722.473692,258706.062536


In [17]:
# prevalence_final.to_csv('/ihme/homes/lutzes/vivarium_research_alzheimers/2025_10_28_prevalence_final.csv')

# Incidence 

In [18]:
incidence = load_sim_output(
    'transition_count_alzheimers_disease_and_other_dementias',
    )
incidence.head()

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,sub_entity,treatment,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,transition_count,baseline,Female,alzheimers_blood_based_biomarker_state_to_alzh...,susceptible_to_treatment,0.0,
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,transition_count,baseline,Female,alzheimers_blood_based_biomarker_state_to_alzh...,waiting_for_treatment,0.0,
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,transition_count,baseline,Female,alzheimers_blood_based_biomarker_state_to_alzh...,full_effect_long,0.0,
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,transition_count,baseline,Female,alzheimers_blood_based_biomarker_state_to_alzh...,full_effect_short,0.0,
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2025,392,transition_count,baseline,Female,alzheimers_blood_based_biomarker_state_to_alzh...,waning_effect_long,0.0,


In [19]:
bbbm_incidence = load_sim_output(
    'counts_new_simulants',
    )
bbbm_incidence.head()

Unnamed: 0,age_group,artifact_path,event_year,input_draw,scenario,sex,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,2025,392,baseline,Female,0.0,
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,2025,392,baseline,Male,0.0,
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,2025,392,bbbm_testing,Female,0.0,
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,2025,392,bbbm_testing,Male,0.0,
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,2025,392,bbbm_testing_and_treatment,Female,0.0,


In [20]:
incidence.sub_entity.unique()

['alzheimers_blood_based_biomarker_state_to_alz..., 'alzheimers_mild_cognitive_impairment_state_to...]
Categories (2, object): ['alzheimers_blood_based_biomarker_state_to_alz..., 'alzheimers_mild_cognitive_impairment_state_to...]

In [21]:
def dataframe_beutification_and_summarizing(df, measure_name, scale_df=None):
    """Scale transition-count simulation output and summarize it across draws.

    The values are divided by the location-specific `ratio`, duplicated
    into a 'Number' and a 'Rate per 100,000' metric, relabelled for
    presentation (each transition is named after the state it leads
    to), summed within each draw, and then described across draws (mean
    and 2.5% / 97.5% percentiles).

    Parameters
    ----------
    df : pandas.DataFrame
        Simulation output with columns 'location', 'sex', 'age_group',
        'event_year', 'scenario', 'sub_entity', 'input_draw' and 'value'.
        'scenario' and 'sub_entity' must be categorical. The frame is
        not modified.
    measure_name : str
        Label written to the 'Measure' column.
    scale_df : pandas.DataFrame, optional
        Table with columns 'location', 'sex', 'age_group', 'event_year'
        and 'ratio'. Defaults to the notebook-level `scale` table.

    Returns
    -------
    pandas.DataFrame
        One row per Year ID / Location / Age / Sex / Disease Stage /
        Scenario / Measure / Metric with 'Mean', '95% UI Lower' and
        '95% UI Upper'. Rows without a match in `scale_df` are dropped
        by the inner merge.
    """
    if scale_df is None:
        scale_df = scale

    group_cols = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage',
                  'Scenario', 'Measure', 'Metric']

    # Add in the scale factor multiplication. assign() returns a new frame,
    # so the caller's DataFrame keeps its original 'event_year' column.
    df = df.assign(event_year=df['event_year'].astype(int))
    df = df.merge(
        scale_df[['location', 'sex', 'age_group', 'ratio', 'event_year']],
        on=['location', 'sex', 'age_group', 'event_year'])
    df['value'] = df['value'] / df['ratio']

    # Need to set this up for number and rate to be included
    df['Metric'] = 'Number'
    df_rate = df.copy()
    # NOTE(review): this divides the scaled count by 100,000 rather than by a
    # population denominator -- confirm this is the intended "rate".
    df_rate['value'] = df_rate['value'] / 100_000
    df_rate['Metric'] = 'Rate per 100,000'
    df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex': 'Sex',
                            'scenario': 'Scenario',
                            'sub_entity': 'Disease Stage'})
    df['Measure'] = measure_name
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment': 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = df['Disease Stage'].cat.rename_categories({
        'alzheimers_blood_based_biomarker_state_to_alzheimers_mild_cognitive_impairment_state': 'MCI due to AD',
        'alzheimers_mild_cognitive_impairment_state_to_alzheimers_disease_state': 'AD Dementia'
    })

    # Now we summarize the data: first total within each draw, then describe
    # across draws. observed=True keeps only category combinations that
    # actually occur, so no all-zero / all-NaN rows are invented.
    per_draw = (
        df.groupby(group_cols + ['input_draw'], observed=True)
        .value.sum()
        .reset_index()
    )
    summary = (
        per_draw.groupby(group_cols, observed=True)
        .value.describe(percentiles=[0.025, 0.975])
        .reset_index()
    )
    summary = summary.rename(columns={'mean': 'Mean',
                                      '2.5%': '95% UI Lower',
                                      '97.5%': '95% UI Upper'})

    # Reorder the columns
    return summary[group_cols + ['Mean', '95% UI Lower', '95% UI Upper']]

In [22]:
incidence_final = dataframe_beutification_and_summarizing(incidence, 'Incident cases')

In [23]:
incidence_final.loc[(incidence_final['Year ID'] == 2025) & (incidence_final['Age'] == '80_to_84') & (incidence_final['Sex'] == 'Female') & (incidence_final['Disease Stage'] == 'AD Dementia') & (incidence_final['Scenario'] == 'Reference') & (incidence_final['Metric'] == 'Number')]

## Again, compared to old V&V and found that the incidence should be ~50,000 which looks about right

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
270,2025,Brazil,80_to_84,Female,AD Dementia,Reference,Incident cases,Number,15547.375453,14277.764136,17400.110862
630,2025,United States of America,80_to_84,Female,AD Dementia,Reference,Incident cases,Number,45822.876562,38546.718171,52441.521636


In [24]:
def bbbm_inc_dataframe_beutification_and_summarizing(df, measure_name, scale_df=None):
    """Scale new-simulant counts and summarize them across draws.

    The values are divided by the location-specific `ratio`, duplicated
    into a 'Number' and a 'Rate per 100,000' metric, relabelled for
    presentation with 'Disease Stage' fixed to 'Preclinical AD', summed
    within each draw, and then described across draws (mean and
    2.5% / 97.5% percentiles).

    Parameters
    ----------
    df : pandas.DataFrame
        Simulation output with columns 'location', 'sex', 'age_group',
        'event_year', 'scenario', 'input_draw' and 'value'. 'scenario'
        must be categorical. The frame is not modified.
    measure_name : str
        Label written to the 'Measure' column.
    scale_df : pandas.DataFrame, optional
        Table with columns 'location', 'sex', 'age_group', 'event_year'
        and 'ratio'. Defaults to the notebook-level `scale` table.

    Returns
    -------
    pandas.DataFrame
        One row per Year ID / Location / Age / Sex / Disease Stage /
        Scenario / Measure / Metric with 'Mean', '95% UI Lower' and
        '95% UI Upper'. Rows without a match in `scale_df` are dropped
        by the inner merge.
    """
    if scale_df is None:
        scale_df = scale

    group_cols = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage',
                  'Scenario', 'Measure', 'Metric']

    # Add in the scale factor multiplication. assign() returns a new frame,
    # so the caller's DataFrame keeps its original 'event_year' column.
    df = df.assign(event_year=df['event_year'].astype(int))
    df = df.merge(
        scale_df[['location', 'sex', 'age_group', 'ratio', 'event_year']],
        on=['location', 'sex', 'age_group', 'event_year'])
    df['value'] = df['value'] / df['ratio']

    # Need to set this up for number and rate to be included
    df['Metric'] = 'Number'
    df_rate = df.copy()
    # NOTE(review): this divides the scaled count by 100,000 rather than by a
    # population denominator -- confirm this is the intended "rate".
    df_rate['value'] = df_rate['value'] / 100_000
    df_rate['Metric'] = 'Rate per 100,000'
    df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex': 'Sex',
                            'scenario': 'Scenario'
                            })
    df['Measure'] = measure_name
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment': 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = 'Preclinical AD'

    # Now we summarize the data: first total within each draw, then describe
    # across draws. observed=True keeps only category combinations that
    # actually occur, so no all-zero / all-NaN rows are invented.
    per_draw = (
        df.groupby(group_cols + ['input_draw'], observed=True)
        .value.sum()
        .reset_index()
    )
    summary = (
        per_draw.groupby(group_cols, observed=True)
        .value.describe(percentiles=[0.025, 0.975])
        .reset_index()
    )
    summary = summary.rename(columns={'mean': 'Mean',
                                      '2.5%': '95% UI Lower',
                                      '97.5%': '95% UI Upper'})

    # Reorder the columns
    return summary[group_cols + ['Mean', '95% UI Lower', '95% UI Upper']]

In [25]:
bbbm_incidence_final = bbbm_inc_dataframe_beutification_and_summarizing(bbbm_incidence, 'Incident cases')
#bbbm_incidence_final.head()

In [26]:
joined_incidence = pd.concat([incidence_final, bbbm_incidence_final], ignore_index=True)
joined_incidence

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,Incident cases,Number,0.0,0.0,0.0
1,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,Incident cases,"Rate per 100,000",0.0,0.0,0.0
2,2025,Brazil,25_to_29,Female,MCI due to AD,BBBM Testing Only,Incident cases,Number,0.0,0.0,0.0
3,2025,Brazil,25_to_29,Female,MCI due to AD,BBBM Testing Only,Incident cases,"Rate per 100,000",0.0,0.0,0.0
4,2025,Brazil,25_to_29,Female,MCI due to AD,BBBM Testing and Treatment,Incident cases,Number,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
38875,2060,United States of America,95_plus,Male,Preclinical AD,Reference,Incident cases,"Rate per 100,000",0.0,0.0,0.0
38876,2060,United States of America,95_plus,Male,Preclinical AD,BBBM Testing Only,Incident cases,Number,0.0,0.0,0.0
38877,2060,United States of America,95_plus,Male,Preclinical AD,BBBM Testing Only,Incident cases,"Rate per 100,000",0.0,0.0,0.0
38878,2060,United States of America,95_plus,Male,Preclinical AD,BBBM Testing and Treatment,Incident cases,Number,0.0,0.0,0.0


In [27]:
joined_incidence.to_csv('/ihme/homes/lutzes/vivarium_research_alzheimers/2025_10_28_incidence_final.csv')

# AD Deaths

In [28]:
deaths = load_sim_output(
    'deaths',
    )
deaths.head()

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2025,392,deaths,baseline,Female,0.0,
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2025,392,deaths,baseline,Male,0.0,
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2025,392,deaths,bbbm_testing,Female,0.0,
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2025,392,deaths,bbbm_testing,Male,0.0,
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2025,392,deaths,bbbm_testing_and_treatment,Female,0.0,


In [29]:
def dataframe_beutification_and_summarizing(df, measure_name, scale_df=None):
    """Scale AD death counts, add averted deaths, and summarize across draws.

    Only rows whose entity is 'alzheimers_disease_state' are used. An
    'Averted Deaths' measure is added as the baseline value minus each
    scenario's value (so it is zero for the baseline itself). The values
    are then divided by the location-specific `ratio`, duplicated into a
    'Number' and a 'Rate per 100,000' metric, relabelled for
    presentation, summed within each draw, and described across draws
    (mean and 2.5% / 97.5% percentiles).

    Parameters
    ----------
    df : pandas.DataFrame
        Simulation output with columns 'artifact_path', 'entity',
        'entity_type', 'location', 'sex', 'age_group', 'event_year',
        'scenario', 'input_draw' and 'value'. 'scenario' must be
        categorical. The frame is not modified.
    measure_name : str
        Label written to the 'Measure' column of the death counts.
    scale_df : pandas.DataFrame, optional
        Table with columns 'location', 'sex', 'age_group', 'event_year'
        and 'ratio'. Defaults to the notebook-level `scale` table.

    Returns
    -------
    pandas.DataFrame
        One row per Year ID / Location / Age / Sex / Disease Stage /
        Scenario / Measure / Metric with 'Mean', '95% UI Lower' and
        '95% UI Upper'. Rows without a match in `scale_df` are dropped
        by the inner merge.
    """
    if scale_df is None:
        scale_df = scale

    group_cols = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage',
                  'Scenario', 'Measure', 'Metric']
    averted_keys = ['artifact_path', 'entity_type', 'age_group', 'event_year',
                    'location', 'sex', 'input_draw', 'entity']

    # .copy() makes the filtered rows an independent frame, so adding columns
    # below neither triggers SettingWithCopyWarning nor touches the input.
    df = df.loc[df.entity == 'alzheimers_disease_state'].copy()
    df['Measure'] = measure_name

    # Make a new measure called 'Averted Deaths' by subtracting from baseline
    df_baseline = (
        df.loc[df['scenario'] == 'baseline']
        .rename(columns={'value': 'baseline_value'})
    )
    df_averted = df.rename(columns={'value': 'all_value'}).merge(
        df_baseline[averted_keys + ['baseline_value']],
        on=averted_keys)
    df_averted['value'] = df_averted['baseline_value'] - df_averted['all_value']
    df_averted['Measure'] = 'Averted Deaths'
    df = pd.concat([df, df_averted], ignore_index=True)

    # Add in the scale factor multiplication
    df = df.assign(event_year=df['event_year'].astype(int))
    df = df.merge(
        scale_df[['location', 'sex', 'age_group', 'ratio', 'event_year']],
        on=['location', 'sex', 'age_group', 'event_year'])
    df['value'] = df['value'] / df['ratio']

    # Need to set this up for number and rate to be included
    df['Metric'] = 'Number'
    df_rate = df.copy()
    # NOTE(review): this divides the scaled count by 100,000 rather than by a
    # population denominator -- confirm this is the intended "rate".
    df_rate['value'] = df_rate['value'] / 100_000
    df_rate['Metric'] = 'Rate per 100,000'
    df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex': 'Sex',
                            'scenario': 'Scenario',
                            'entity': 'Disease Stage'})
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment': 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = 'AD Dementia'

    # Now we summarize the data: first total within each draw, then describe
    # across draws. observed=True keeps only category combinations that
    # actually occur, so no all-zero / all-NaN rows are invented.
    per_draw = (
        df.groupby(group_cols + ['input_draw'], observed=True)
        .value.sum()
        .reset_index()
    )
    summary = (
        per_draw.groupby(group_cols, observed=True)
        .value.describe(percentiles=[0.025, 0.975])
        .reset_index()
    )
    summary = summary.rename(columns={'mean': 'Mean',
                                      '2.5%': '95% UI Lower',
                                      '97.5%': '95% UI Upper'})

    # Reorder the columns
    return summary[group_cols + ['Mean', '95% UI Lower', '95% UI Upper']]

In [30]:
deaths_final = dataframe_beutification_and_summarizing(deaths, 'AD Deaths')

In [31]:
deaths_final.loc[(deaths_final['Year ID'] == 2055) & (deaths_final['Age'] == '80_to_84') & (deaths_final['Sex'] == 'Female') & (deaths_final['Disease Stage'] == 'AD Dementia') & (deaths_final['Metric'] == 'Number')]

## Compared to GBD Compare for this actually. They showed in 2023 about 24k deaths, but that is for Alz and other dementias combined. So this seems reasonable at about half that value. 

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
21864,2055,Brazil,80_to_84,Female,AD Dementia,Reference,AD Deaths,Number,41817.213306,35098.609638,46739.63875
21866,2055,Brazil,80_to_84,Female,AD Dementia,Reference,Averted Deaths,Number,0.0,0.0,0.0
21868,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing Only,AD Deaths,Number,41817.213306,35098.609638,46739.63875
21870,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing Only,Averted Deaths,Number,0.0,0.0,0.0
21872,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing and Treatment,AD Deaths,Number,41080.019638,34263.90358,46012.196321
21874,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing and Treatment,Averted Deaths,Number,737.193668,462.208728,856.158784
22224,2055,United States of America,80_to_84,Female,AD Dementia,Reference,AD Deaths,Number,62434.757113,59935.23958,65422.574977
22226,2055,United States of America,80_to_84,Female,AD Dementia,Reference,Averted Deaths,Number,0.0,0.0,0.0
22228,2055,United States of America,80_to_84,Female,AD Dementia,BBBM Testing Only,AD Deaths,Number,62434.757113,59935.23958,65422.574977
22230,2055,United States of America,80_to_84,Female,AD Dementia,BBBM Testing Only,Averted Deaths,Number,0.0,0.0,0.0


In [32]:
deaths_final.to_csv('/ihme/homes/lutzes/vivarium_research_alzheimers/2025_10_28_deaths_final.csv')

In [33]:
## Add in deaths averted here!! 

# DALYs 

In [34]:
# DALYs = YLLs + YLDs, combined row by row on the shared stratification columns.
ylls = load_sim_output(
    'ylls',
    )
ylls = ylls.rename(columns={'value':'ylls'}).drop(columns=['measure', 'entity'])
# NOTE(review): 'entity' is dropped above and every YLL row is then labelled
# as the AD dementia state. If the ylls output contains entities other than
# 'alzheimers_disease_state', their YLLs are attributed to AD here and the
# merge below matches each AD YLD row more than once -- confirm.
ylls['sub_entity'] = 'alzheimers_disease_state'

ylds = load_sim_output(
    'ylds',
    )
ylds = ylds.rename(columns={'value':'ylds'}).drop(columns=['measure','entity'])

# Each key is listed once ('entity_type' used to appear twice in this list)
daly_merge_keys = [
    'artifact_path', 'entity_type', 'age_group', 'event_year', 'location',
    'sex', 'scenario', 'input_draw', 'sub_entity',
]
# Outer merge keeps rows that have only YLDs or only YLLs
dalys = ylds.merge(ylls, on=daly_merge_keys, how='outer')

convert_to_categorical(dalys, inplace=True)
# A missing component contributes zero to the DALY total
dalys['ylds'] = dalys['ylds'].fillna(0)
dalys['ylls'] = dalys['ylls'].fillna(0)
dalys['value'] = dalys['ylls'] + dalys['ylds']
dalys = dalys.drop(columns=['ylls','ylds'])
dalys.tail()

Unnamed: 0,age_group,artifact_path,entity_type,event_year,input_draw,scenario,sex,sub_entity,location,value
2105995,95_plus,/mnt/team/simulation_science/pub/models/vivari...,cause,2060,346,baseline,Male,all_causes,,90.355759
2105996,95_plus,/mnt/team/simulation_science/pub/models/vivari...,cause,2060,346,bbbm_testing,Female,all_causes,,990.845538
2105997,95_plus,/mnt/team/simulation_science/pub/models/vivari...,cause,2060,346,bbbm_testing,Male,all_causes,,90.355759
2105998,95_plus,/mnt/team/simulation_science/pub/models/vivari...,cause,2060,346,bbbm_testing_and_treatment,Female,all_causes,,993.645183
2105999,95_plus,/mnt/team/simulation_science/pub/models/vivari...,cause,2060,346,bbbm_testing_and_treatment,Male,all_causes,,91.327327


In [45]:
# Scratch check: the same filtering and averted-DALY steps that the function
# defined in the next cell performs, run at top level so the intermediate
# result can be inspected. Works on a copy, so `dalys` itself is unchanged.
df = dalys.copy()
# Keep only the three Alzheimer's disease states
df = df.loc[df.sub_entity.isin(['alzheimers_disease_state','alzheimers_blood_based_biomarker_state','alzheimers_mild_cognitive_impairment_state'])]
df['sub_entity'] = df['sub_entity'].cat.remove_unused_categories()

# Baseline rows, with their value renamed so it can sit next to each scenario's value
df_baseline = df.loc[df['scenario'] == 'baseline']
df_baseline = df_baseline.rename(columns={'value':'baseline_value'})

# Averted DALYs = baseline value - scenario value, matched on every
# stratification column except 'scenario'
df_averted = df.copy().rename(columns={'value':'all_value'})
df_averted = df_averted.merge(
    df_baseline[['artifact_path', 'entity_type', 'age_group','event_year','location','sex','input_draw','sub_entity','baseline_value']],
    on=['artifact_path', 'entity_type', 'age_group','event_year','location','sex','input_draw','sub_entity'])
df_averted['value'] = df_averted['baseline_value'] - df_averted['all_value']
# Only the averted rows get a 'Measure' label in this cell; the original rows
# have no 'Measure' column, so it is missing for them after the concat
df_averted['Measure'] = 'Averted DALYs'
df = pd.concat([df, df_averted], ignore_index=True)

# Display the disease states that remain after filtering
df.sub_entity.unique()

['alzheimers_blood_based_biomarker_state', 'alzheimers_mild_cognitive_impairment_state', 'alzheimers_disease_state']
Categories (3, object): ['alzheimers_blood_based_biomarker_state', 'alzheimers_disease_state', 'alzheimers_mild_cognitive_impairment_state']

In [46]:
def dataframe_beutification_and_summarizing(df, measure_name, scale_df=None,
                                            averted_measure_name='Averted DALYs'):
    """Turn draw-level simulation output into a labelled summary table.

    Steps, in order:
      1. Keep only the three Alzheimer's disease-stage sub-entities.
      2. Add an "averted" measure, computed as baseline value minus scenario value
         within each (artifact_path, entity_type, age_group, event_year, location,
         sex, input_draw, sub_entity) stratum.
      3. Divide every value by the matching ``ratio`` from the scale table.
      4. Duplicate the rows as a second metric labelled 'Rate per 100,000'.
      5. Rename columns and categories to presentation labels.
      6. Sum within each draw, then summarize across draws (mean and the 2.5th /
         97.5th percentiles).

    Parameters
    ----------
    df : pandas.DataFrame
        Draw-level results with columns ``artifact_path``, ``entity_type``,
        ``age_group``, ``event_year``, ``location``, ``sex``, ``input_draw``,
        ``scenario``, ``sub_entity`` and ``value``. ``scenario`` and
        ``sub_entity`` must be categorical (the ``.cat`` accessor is used).
        The input frame is not modified.
    measure_name : str
        Label written to the ``Measure`` column for the un-differenced rows.
    scale_df : pandas.DataFrame, optional
        Table with columns ``location``, ``sex``, ``age_group``, ``event_year``
        and ``ratio``. Defaults to the notebook-level ``scale`` variable.
        Presumably ``ratio`` is the simulated-to-real population ratio -- TODO
        confirm against the cell that builds ``scale``.
    averted_measure_name : str, default 'Averted DALYs'
        Label written to the ``Measure`` column for the baseline-minus-scenario
        rows. Override it when summarizing something other than DALYs.

    Returns
    -------
    pandas.DataFrame
        One row per (Year ID, Location, Age, Sex, Disease Stage, Scenario,
        Measure, Metric) with columns ``Mean``, ``95% UI Lower`` and
        ``95% UI Upper``.
    """
    disease_stage_entities = [
        'alzheimers_disease_state',
        'alzheimers_blood_based_biomarker_state',
        'alzheimers_mild_cognitive_impairment_state',
    ]
    # Every stratification column except 'scenario': a scenario row is paired
    # with the baseline row that matches on all of these.
    averted_merge_keys = ['artifact_path', 'entity_type', 'age_group', 'event_year',
                          'location', 'sex', 'input_draw', 'sub_entity']
    summary_keys = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage',
                    'Scenario', 'Measure', 'Metric']

    if scale_df is None:
        # Backward-compatible fallback to the notebook global used originally.
        scale_df = scale

    # Copy the filtered rows so the assignments below act on an independent
    # frame instead of a slice of the caller's data (avoids
    # SettingWithCopyWarning and any chance of touching the input).
    df = df.loc[df.sub_entity.isin(disease_stage_entities)].copy()
    df['sub_entity'] = df['sub_entity'].cat.remove_unused_categories()
    df['Measure'] = measure_name

    # Build the averted measure: baseline value minus each scenario's value.
    df_baseline = df.loc[df['scenario'] == 'baseline']
    df_baseline = df_baseline.rename(columns={'value': 'baseline_value'})

    df_averted = df.rename(columns={'value': 'all_value'})
    df_averted = df_averted.merge(
        df_baseline[averted_merge_keys + ['baseline_value']],
        on=averted_merge_keys)
    df_averted['value'] = df_averted['baseline_value'] - df_averted['all_value']
    df_averted['Measure'] = averted_measure_name
    df = pd.concat([df, df_averted], ignore_index=True)

    # Apply the scale factor. The cast makes 'event_year' an integer so it can
    # be matched against the scale table's 'event_year'.
    df['event_year'] = df['event_year'].astype(int)
    df = df.merge(
        scale_df[['location', 'sex', 'age_group', 'ratio', 'event_year']],
        on=['location', 'sex', 'age_group', 'event_year'])
    df['value'] = df['value'] / df['ratio']

    # Emit every row twice: once as a count and once under the rate label.
    # NOTE(review): the "rate" is the scaled count divided by 100,000; no
    # population or person-time denominator is applied here, so these are not
    # per-capita rates -- confirm the intended definition.
    df['Metric'] = 'Number'
    df_rate = df.copy()
    df_rate['value'] = df_rate['value'] / 100_000
    df_rate['Metric'] = 'Rate per 100,000'
    df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex': 'Sex',
                            'scenario': 'Scenario',
                            'sub_entity': 'Disease Stage'})
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment': 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = df['Disease Stage'].cat.rename_categories({
        'alzheimers_blood_based_biomarker_state': 'Preclinical AD',
        'alzheimers_mild_cognitive_impairment_state': 'MCI due to AD',
        'alzheimers_disease_state': 'AD Dementia'
    })

    # Summarize: first collapse the columns that are not reported
    # (artifact_path, entity_type) within each draw, then describe the
    # distribution across draws. observed=False is spelled out so the handling
    # of categorical groupers does not depend on the installed pandas default.
    df = (df.groupby(summary_keys + ['input_draw'], observed=False)['value']
            .sum()
            .reset_index())
    df = (df.groupby(summary_keys, observed=False)['value']
            .describe(percentiles=[0.025, 0.975])
            .reset_index())

    df = df.rename(columns={'mean': 'Mean',
                            '2.5%': '95% UI Lower',
                            '97.5%': '95% UI Upper'})

    # Keep only the reporting columns, in presentation order.
    column_order = summary_keys + ['Mean', '95% UI Lower', '95% UI Upper']
    df = df[column_order]

    return df

In [47]:
# Summarize the draw-level DALYs; un-differenced rows are labelled 'AD DALYs'.
dalys_final = dataframe_beutification_and_summarizing(dalys, 'AD DALYs')

In [48]:
# Display the summary table for a visual check.
dalys_final

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,AD DALYs,Number,0.000000,0.000000,0.000000
1,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,AD DALYs,"Rate per 100,000",0.000000,0.000000,0.000000
2,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,Averted DALYs,Number,0.000000,0.000000,0.000000
3,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,Averted DALYs,"Rate per 100,000",0.000000,0.000000,0.000000
4,2025,Brazil,25_to_29,Female,Preclinical AD,BBBM Testing Only,AD DALYs,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
77755,2060,United States of America,95_plus,Male,MCI due to AD,BBBM Testing Only,Averted DALYs,"Rate per 100,000",0.000000,0.000000,0.000000
77756,2060,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,AD DALYs,Number,375.875149,239.357424,526.838567
77757,2060,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,AD DALYs,"Rate per 100,000",0.003759,0.002394,0.005268
77758,2060,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,Averted DALYs,Number,-4.658549,-12.524787,-0.359768


In [None]:
# Spot check: 2025 counts ('Number' metric) for females aged 80 to 84 in the
# AD Dementia stage, across all locations, scenarios and measures.
# TODO: GBD Compare shows ~420k for DALYs, which is less than the values seen
# here. That is very strange and seems wrong; investigate further. (The original
# note calls the GBD figure a rate, but this filter selects counts -- confirm
# which GBD metric was compared.)
dalys_final.loc[
    dalys_final['Year ID'].eq(2025)
    & dalys_final['Age'].eq('80_to_84')
    & dalys_final['Sex'].eq('Female')
    & dalys_final['Disease Stage'].eq('AD Dementia')
    & dalys_final['Metric'].eq('Number')
]

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
804,2025,Brazil,80_to_84,Female,AD Dementia,Reference,AD DALYs,Number,331505.059779,304977.984746,342966.206548
806,2025,Brazil,80_to_84,Female,AD Dementia,Reference,Averted DALYs,Number,0.0,0.0,0.0
808,2025,Brazil,80_to_84,Female,AD Dementia,BBBM Testing Only,AD DALYs,Number,331505.059779,304977.984746,342966.206548
810,2025,Brazil,80_to_84,Female,AD Dementia,BBBM Testing Only,Averted DALYs,Number,0.0,0.0,0.0
812,2025,Brazil,80_to_84,Female,AD Dementia,BBBM Testing and Treatment,AD DALYs,Number,331505.059779,304977.984746,342966.206548
814,2025,Brazil,80_to_84,Female,AD Dementia,BBBM Testing and Treatment,Averted DALYs,Number,0.0,0.0,0.0
1884,2025,United States of America,80_to_84,Female,AD Dementia,Reference,AD DALYs,Number,888649.818066,833526.053885,945614.713001
1886,2025,United States of America,80_to_84,Female,AD Dementia,Reference,Averted DALYs,Number,0.0,0.0,0.0
1888,2025,United States of America,80_to_84,Female,AD Dementia,BBBM Testing Only,AD DALYs,Number,888649.818066,833526.053885,945614.713001
1890,2025,United States of America,80_to_84,Female,AD Dementia,BBBM Testing Only,Averted DALYs,Number,0.0,0.0,0.0


In [40]:
# Write the summary table to a dated CSV file.
# NOTE(review): no `index` argument is passed, so pandas also writes the row index
# as an unnamed first column -- confirm downstream readers expect that.
dalys_final.to_csv('/ihme/homes/lutzes/vivarium_research_alzheimers/2025_10_28_dalys_final.csv')

In [41]:
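# TODO: averted DALYs also need to be included; check whether the 'Averted DALYs' rows produced by dataframe_beutification_and_summarizing already cover this, then remove this note.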

In [42]:
ls /mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model7.4/united_states_of_america/2025_10_24_16_02_54/results/

counts_baseline_tests_among_eligible.parquet
counts_bbbm_tests.parquet
counts_new_simulants.parquet
counts_newly_eligible_for_bbbm_testing.parquet
deaths.parquet
person_time_alzheimers_disease_and_other_dementias.parquet
person_time_eligible_for_bbbm_testing.parquet
person_time_ever_eligible_for_bbbm_testing.parquet
person_time_treatment.parquet
transition_count_alzheimers_disease_and_other_dementias.parquet
transition_count_treatment.parquet
ylds.parquet
ylls.parquet
