In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np, os
import matplotlib.pyplot as plt

from pathlib import Path
import yaml
import re
import datetime

import pyarrow as pa
import pyarrow.parquet as pq

import gbd_mapping as gm
from vivarium import Artifact

from db_queries import get_ids, get_outputs, get_population, get_covariate_estimates
from get_draws.api import get_draws

import vivarium_helpers as vh
import vivarium_helpers.id_helper as idh
from vivarium_helpers.vph_output.operations import VPHOperator
from vivarium_helpers.vph_output.measures import VPHResults
from vivarium_helpers.utils import convert_to_categorical, constant_categorical, print_memory_usage

!date
!whoami
!pwd

Sat Nov  1 00:37:54 PDT 2025
ndbs
/mnt/share/code/ndbs/vivarium_research_alzheimers/results_tables


# Find data

Results directories of final model 8.4 runs, in batches of 3 or 4 draws:

Batch 0 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_39_18

Batch 1 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_41_39

Batch 2 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_45_13

Batch 3 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_30_14_03_51

Batch 4 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_30_16_32_03

Batch 5 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_30_17_25_38

Batch 6 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_31_01_03_40

Batch 7 results:
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_31_01_09_31



In [3]:
# Each batch run contains all locations, 100 random seeds, and 3 or 4 draws.
# Every run directory lives under the same model_spec root and is named by
# the timestamp at which the batch was launched.
batch_run_dirs = [
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/'
    + run_timestamp
    for run_timestamp in (
        '2025_10_29_20_39_18',
        '2025_10_29_20_41_39',
        '2025_10_29_20_45_13', # 4 draws
        '2025_10_30_14_03_51',
        '2025_10_30_16_32_03',
        '2025_10_30_17_25_38',
        '2025_10_31_01_03_40',
        '2025_10_31_01_09_31',
    )
]

In [4]:
# Project directory
%cd /mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers


In [5]:
# Results directory containing model 8.3 results for all locations
model_run_subdir = 'results/abie_consistent_model_test/united_states_of_america/2025_10_28_08_55_05/'
!ls -halt $model_run_subdir/results

total 60M
drwxrwsr-x 5 abie IHME-Simulationscience 4.5K Oct 28 09:19 ..
drwxrwsr-x 2 abie IHME-Simulationscience 6.5K Oct 28 09:17 .
-rw-rw-r-- 1 abie IHME-Simulationscience  17M Oct 28 09:17 ylds.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 8.2M Oct 28 09:17 ylls.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 5.2M Oct 28 09:17 person_time_treatment.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 512K Oct 28 09:17 counts_newly_eligible_for_bbbm_testing.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 794K Oct 28 09:17 person_time_eligible_for_bbbm_testing.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 1.4M Oct 28 09:17 deaths.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 4.1M Oct 28 09:17 person_time_ever_eligible_for_bbbm_testing.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 841K Oct 28 09:17 counts_new_simulants.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 406K Oct 28 09:17 counts_bbbm_tests.parquet
-rw-rw-r-- 1 abie IHME-Simulationscience 2.4M Oct 28 09:17 counts_

In [6]:
!ls artifacts

2		  model1.0  model3.0  model4.1	model4.4  model6.0
basic_model	  model2.0  model3.1  model4.2	model4.5  model7.0
consistent-rates  model2.2  model4.0  model4.3	model5.0  model8.3


In [7]:
# This is where results will eventually be for the 8.4 final runs
!ls results/model8.4/model_spec

2025_10_29_20_39_18  2025_10_30_14_03_51  2025_10_31_01_03_40
2025_10_29_20_41_39  2025_10_30_16_32_03  2025_10_31_01_09_31
2025_10_29_20_45_13  2025_10_30_17_25_38


# Define directories

Output directory:

`J:\Project\simulation_science\alzheimers\results_10_31_2025`

In [8]:
output_dir = Path(r"J:\Project\simulation_science\alzheimers\results_10_31_2025".replace('\\', '/').replace('J:', '/snfs1'))
print(output_dir.exists())
output_dir

True


PosixPath('/snfs1/Project/simulation_science/alzheimers/results_10_31_2025')

In [9]:
locations = [
    'United States of America',
    'Brazil',
    'China',
    'Germany',
    'Israel',
    'Japan',
    'Spain',
    'Sweden',
    'Taiwan (Province of China)',
    'United Kingdom',
]

# Define some shorter names to use for plotting
location_to_short_name = ({loc: loc for loc in locations}| {
    'Taiwan (Province of China)': 'Taiwan',
    'United Kingdom': 'UK',
    'United States of America': 'USA',
})

# Select a subset of locations to draw plots for
locations_to_plot = locations[:2]

project_dir = '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/'

artifact_model_number = '8.3' # Artifacts are stored here
# run_subdirectories = [
#     model_run_subdir,
# ]
# run_dirs = [project_dir + run_subdir for run_subdir in run_subdirectories]
# results_dirs = [run_dir + 'results/' for run_dir in run_dirs]

# Results directory for model 8.3, for testing
results_dirs = project_dir + model_run_subdir + 'results/'

def get_results_and_artifact_dicts(
        locations, results_dirs, artifact_model_number, project_dir):

    match results_dirs:
        case str() | Path():
            # Option 1: All locations concatenated in one results
            # directory
            location_to_results_dir = {'all': results_dirs}
        case list():
            # Option 2: One results directory per location
            location_to_results_dir = {
                loc: path for loc, path in zip(locations, results_dirs)}

    location_to_artifact_subdir = {
        loc: loc.lower().replace(' ', '_') for loc in locations}
    artifact_subpaths = [
        f'artifacts/model{artifact_model_number}/' + subdir + '.hdf' 
        for subdir in location_to_artifact_subdir.values()]

    location_to_artifact_path = {
        loc: project_dir + subpath for loc, subpath
        in zip(locations, artifact_subpaths)}

    return location_to_results_dir, location_to_artifact_path

location_to_results_dir, location_to_artifact_path = get_results_and_artifact_dicts(
    locations, results_dirs, artifact_model_number, project_dir
)
location_to_artifact_path

{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf',
 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf',
 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf',
 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf',
 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf',
 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf',
 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf',
 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/sweden.hdf',
 'Taiwan (Province of China)': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifact

In [10]:
location_to_results_dir

{'all': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/abie_consistent_model_test/united_states_of_america/2025_10_28_08_55_05/results/'}

# Define some ordered Categorical dtypes, and convert years to ints

In [11]:
# Order locations lexicographically
all_locations = [
    'United States of America',
    'Brazil',
    'China',
    'Germany',
    'Israel',
    'Japan',
    'Spain',
    'Sweden',
    'Taiwan (Province of China)',
    'United Kingdom',
]
all_locations_dtype = pd.CategoricalDtype(sorted(all_locations), ordered=True)

# int16 ranges from -32768 to 32767, which is sufficient to
# represent all years 2025-2100. uint8 only goes from 0 to 255, which is
# too small.
year_dtype = 'int16'

# Store draws as ints instead of categoricals since we'll be
# concatenating different draws from different results directories
input_draw_dtype = 'int16'

# Order age groups chronologically
age_groups = [f'{age}_to_{age + 4}' for age in range(25, 95, 5)] + ['95_plus']
age_group_dtype = pd.CategoricalDtype(age_groups, ordered=True)

# Order scenarios by complexity
scenarios = ['baseline', 'bbbm_testing', 'bbbm_testing_and_treatment']
scenario_dtype = pd.CategoricalDtype(scenarios, ordered=True)

colname_to_dtype = {
    'location': all_locations_dtype,
    'event_year': year_dtype,
    'age_group': age_group_dtype,
    'scenario': scenario_dtype,
    'input_draw': input_draw_dtype,
}

# Load one artifact to define age bins

In [12]:
artifact_path = location_to_artifact_path[locations[0]]
art = Artifact(artifact_path)
print(art.load('metadata.locations'))

# age_bins is an empty DataFrame with a MultiIndex storing age group data
age_bins = art.load('population.age_bins')
age_map = (
    age_bins
    .reset_index()
    .assign(age_group=lambda df: df['age_group_name'].str.replace(' ', '_'))
    # Filter to ages that actually appear in our sim
    .query("age_start >= 25")
)
age_map.tail()

['United States of America']


Unnamed: 0,age_group_id,age_group_name,age_start,age_end,age_group
14,20,75 to 79,75.0,80.0,75_to_79
15,30,80 to 84,80.0,85.0,80_to_84
16,31,85 to 89,85.0,90.0,85_to_89
17,32,90 to 94,90.0,95.0,90_to_94
18,235,95 plus,95.0,125.0,95_plus


# Define functions to load and merge Artifact data from all locations

In [13]:
def load_artifact_data(
    key,
    filter_terms=None,
    location_to_artifact_path=location_to_artifact_path,
):
    """Load `key` from every location's artifact and concatenate the
    results into a single object.

    Each artifact is opened with `filter_terms`, and its
    'metadata.locations' entry is checked to be exactly the location it
    is mapped from. If every loaded object already has a 'location'
    index level, the objects are concatenated as-is; otherwise the
    location name is prepended as a new outermost 'location' index
    level.
    """
    data_by_location = {}
    for location, path in location_to_artifact_path.items():
        artifact = Artifact(path, filter_terms)
        # Guard against a location being paired with the wrong artifact
        art_locations = artifact.load('metadata.locations')
        assert len(art_locations) == 1 and art_locations[0] == location, \
            f'Unexpected locations in artifact: {location=}, {art_locations=}'
        df = artifact.load(key)
        data_by_location[location] = df

    location_already_indexed = all(
        'location' in loaded.index.names
        for loaded in data_by_location.values()
    )
    if location_already_indexed:
        return pd.concat(data_by_location.values())
    # Use the dict keys as a new 'location' level; the remaining level
    # names are taken from the last artifact loaded
    return pd.concat(
        data_by_location, names=['location', *df.index.names])

# Define functions to load simulation results

In [14]:
# Create an operator object - treat each random seed as a separate draw,
# and add location to the index
ops = VPHOperator(location_col=True)
# ops.index_cols.extend(['location', 'random_seed'])

def load_sim_output(
        measure,
        results_dict=location_to_results_dir,
        # Pass None to skip filtering locations (when None, must also
        # pass assign_location=False or raw=True)
        location_to_artifact_path=location_to_artifact_path,
        # specify dtypes of certain columns
        colname_to_dtype=colname_to_dtype,
        drop_superfluous_cols=True, # drop redundant or empty columns
        # Sets the 'read_dictionary' key of kwargs, which is passed to
        # pyarrow.parquet.read_table()
        force_parquet_dictionaries=True,
        force_pandas_categoricals=True,
        aggregate_seeds=True,
        assign_location=True,
        raw=False, # Overrides other parameters if True
        **kwargs, # keyword args to pass to .read_parquet
    ):
    """Load simulation output from .parquet files for all locations,
    optionally reducing the size of the data when possible. Returns
    concatenated outputs with a 'location' column added.
    """
    # Override optional transformations if raw=True
    if raw:
        drop_superfluous_cols = False
        force_parquet_dictionaries = False
        force_pandas_categoricals = False
        aggregate_seeds = False
        assign_location = False

    # Determine whether results for all locations are stored in same
    # directory, or if different locations have different results
    # directories
    match location_to_results_dir:
        case {'all': _}:
            all_locations_together = True
        case _:
            all_locations_together = False
    
    if all_locations_together and assign_location and location_to_artifact_path is None:
        raise ValueError(
            "Must provide mapping of artifacts to locations  when" \
            " assign_location=True and all locations are concatenated" \
            " in the simulation outputs."
        )

    dfs = []
    for location, directory in results_dict.items():

        parquet_file_path = Path(directory) / f'{measure}.parquet'
        # Read the Parquet file's schema to get column names and data types
        parquet_schema = pq.read_schema(parquet_file_path)

        if (
            all_locations_together
            and location_to_artifact_path is not None
        ):
            if 'artifact_path' in parquet_schema.names:
                # Filter to locations in list
                location_filter = (
                    'artifact_path',
                    'in',
                    list(location_to_artifact_path.values()),
                )
                user_filters = kwargs.get('filters') # Defaults to None
                kwargs['filters'] = add_parquet_AND_filter(
                    location_filter, user_filters)
                # TODO: Use logging not printing
                print(location_filter)
            else:
                print("'artifact_path' column missing from parquet file."
                      " Not filtering locations.")

        if force_parquet_dictionaries:
            # Read all columns as dictionaries except those containing 
            # floating point values
            kwargs['read_dictionary'] = [
                col.name for col in parquet_schema
                if not pa.types.is_floating(col.type)]

        # Read the parquet file
        df = pd.read_parquet(parquet_file_path, **kwargs)
        print_memory_usage(df, 'after read_parquet')

        if drop_superfluous_cols:
            # Drop redundant columns
            for col1, col2 in [
                ('input_draw', 'input_draw_number'),
                ('entity', 'sub_entity'),
            ]:
                if (col1 in df and col2 in df and df[col1].equals(df[col2])):
                    df.drop(columns=col2, inplace=True)
            # Drop empty columns (e.g., sub-entity)
            for col in df:
                if df[col].isna().all():
                    df.drop(columns=col, inplace=True)
        if colname_to_dtype is not None:
            # Filter to avoid KeyError
            colname_to_dtype = {c: dtype for c, dtype
                                in colname_to_dtype.items() if c in df}
            # NOTE: If copy-on-write is enabled, copy keyword is ignored
            df = df.astype(colname_to_dtype, copy=False)
        if force_pandas_categoricals:
            convert_to_categorical(
                df, exclude_cols=colname_to_dtype or (), inplace=True)
        if aggregate_seeds:
            # Use default index and value columns when aggregating
            df = vh.vph_output.operations.marginalize(df, 'random_seed')
        if assign_location:
            if all_locations_together:
                # NOTE: location_to_artifact_path is guaranteed not to
                # be None because assign_location and
                # all_locations_together are both True

                # Find or create a Categorical dtype with all locations
                location_dtype = colname_to_dtype.get(
                    'location',
                    pd.CategoricalDtype(
                        sorted(location_to_artifact_path.keys()), ordered=True)
                )
                # Invert the dictionary so we can map artifact paths to
                # locations
                artifact_path_to_location = {
                    path: loc for loc, path
                    in location_to_artifact_path.items()}
                if 'artifact_path' in df:
                    df['location'] = df['artifact_path'].map(
                        artifact_path_to_location).astype(location_dtype)
                else:
                    # In case the engineers change the DataFrame format
                    # on us...
                    print("'artifact_path' column missing from DataFrame."
                          " Not assigning locations.")
            else:
                # NOTE: location_to_results_dir contains actual
                # locations as keys (not 'all') since
                # all_locations_together is False

                # Find or create a Categorical dtype with all locations
                # to avoid converting back to object dtype.
                location_dtype = colname_to_dtype.get(
                    'location',
                    pd.CategoricalDtype(
                        sorted(location_to_results_dir.keys()), ordered=True)
                )
                df['location'] = location
                df['location'] = df['location'].astype(location_dtype)
        dfs.append(df)
    # TODO: Maybe if assign_location is False and all_locations_together
    # is also False (and there is more than one location?), we should
    # return a dict mapping locations to dataframes (or just a list of
    # dataframes?) instead of concatenating, since it won't be possible
    # to filter the resulting concatenated dataframe by location...
    df = pd.concat(dfs, ignore_index=True)
    return df
    
def add_parquet_AND_filter(new_filter, existing_filters):
    """Return parquet filters equivalent to `new_filter AND
    existing_filters`.

    `existing_filters` may be None (no filters), a non-empty list of
    3-tuples (a single AND group), or a non-empty list of such lists
    (an OR of AND groups). Only the first element at each nesting level
    is inspected to decide which shape was passed. Any other value
    raises a ValueError.
    """
    def _is_condition(candidate):
        # A single filter condition is a 3-tuple
        return isinstance(candidate, tuple) and len(candidate) == 3

    def _is_and_group(candidate):
        # A non-empty list whose first entry is a filter condition
        return (
            isinstance(candidate, list)
            and len(candidate) >= 1
            and _is_condition(candidate[0])
        )

    if existing_filters is None:
        # No existing filters -- create a single AND group
        return [new_filter]
    if _is_and_group(existing_filters):
        # Existing filters consist of one AND group -- add the new filter
        return [new_filter, *existing_filters]
    if (
        isinstance(existing_filters, list)
        and len(existing_filters) >= 1
        and _is_and_group(existing_filters[0])
    ):
        # Add the filter to each AND group in the outer OR group
        combined = []
        for and_group in existing_filters:
            combined.append([new_filter, *and_group])
        return combined
    raise ValueError(f"Malformed parquet filter: {existing_filters}")

def current_time():
    """Print the current local date and time (handy for timestamping
    long-running cells)."""
    timestamp = datetime.datetime.now()
    print(timestamp)

## Function to load and concatenate runs from multiple batches

It's possible that loading all locations at once from a single batch may
use too much memory, so we may have to load locations in groups from
each batch before we can aggregate random seeds to make the data smaller.

In [15]:
# Test grouping locations into n groups
# This seems to work for any n and splits as evenly as possible
n = 4
for i in range(n):
    print(locations[i::n])

['United States of America', 'Israel', 'Taiwan (Province of China)']
['Brazil', 'Japan', 'United Kingdom']
['China', 'Spain']
['Germany', 'Sweden']


In [16]:
def load_measure_from_batch_runs(
        measure,
        batch_run_dirs,
        locations=locations,
        n_location_groups=1,
        colname_to_dtype=colname_to_dtype,
        project_dir=project_dir,
        **kwargs
    ):
    """Load data from multiple batch runs, aggregate random seeds, and
    concatenate.

    Parameters
    ----------
    measure
        Name of the output to load ('<measure>.parquet').
    batch_run_dirs
        Iterable of batch run directories; results are read from the
        'results' subdirectory of each.
    locations
        Sequence of location names to load.
    n_location_groups
        Number of groups to split `locations` into (group i is
        locations[i::n_location_groups]). Each group is loaded
        separately from each batch, which lowers peak memory use before
        seeds are aggregated. Must be at least 1.
    colname_to_dtype
        Mapping of column name -> dtype, passed to load_sim_output and
        re-applied to the concatenated result. Entries for columns
        absent from the result are ignored. May be None.
    project_dir
        Project directory prefix used to build artifact paths.
    **kwargs
        Passed through to load_sim_output. 'aggregate_seeds' defaults
        to True.

    Notes
    -----
    Artifact paths are built from the notebook-level variable
    `artifact_model_number`.

    Raises
    ------
    ValueError
        If n_location_groups is less than 1.
    """
    if n_location_groups < 1:
        raise ValueError(
            f"n_location_groups must be at least 1, got {n_location_groups}")
    # aggregate seeds by default, and warn if False was passed
    if not kwargs.setdefault('aggregate_seeds', True):
        # Documentation for setdefault: If key is in the dictionary,
        # return its value. If not, insert key with a value of default
        # and return default.
        print("Warning: Not aggregating seeds, which may require lots of memory")
    dfs = []
    for run_dir in batch_run_dirs:
        print(run_dir)
        results_dir = Path(run_dir) / 'results'
        for i in range(n_location_groups):
            location_group = locations[i::n_location_groups]
            if len(location_group) == 0:
                # More groups than locations -- nothing to load
                continue
            group_results_dirs, group_artifact_paths = get_results_and_artifact_dicts(
                location_group, results_dir, artifact_model_number, project_dir
            )
            print(group_artifact_paths)
            df = load_sim_output(
                measure, group_results_dirs, group_artifact_paths, colname_to_dtype, **kwargs
            )
            print_memory_usage(df, 'after aggregating seeds and converting dtypes')
            dfs.append(df)
    measure_df = pd.concat(dfs, ignore_index=True)
    print_memory_usage(measure_df, 'total')
    if colname_to_dtype is not None:
        # Concatenation can change dtypes (e.g. categoricals with
        # different categories), so enforce the requested dtypes again.
        # Only use columns that are actually present, because astype
        # raises a KeyError for unknown column names.
        present_dtypes = {
            col: dtype for col, dtype in colname_to_dtype.items()
            if col in measure_df}
        measure_df = measure_df.astype(present_dtypes)
    print_memory_usage(measure_df, 'after enforcing dtypes')
    return measure_df


In [17]:
batch_run_dirs[1:3]

['/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_41_39',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_45_13']

In [18]:
# Stats for reading deaths.parquet with no extra filters:
# 28 seconds to load 2 batches with 1 location group, 20 MB after concatenation,
# max 1877 MB before aggregating seeds
# 31 seconds to load 2 batches with 3 location groups, 86 MB after concatenation,
# max 751 MB before aggregating seeds
test = load_measure_from_batch_runs(
    'deaths', batch_run_dirs[1:3], locations, n_location_groups=1
)
test.dtypes

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_41_39
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/sweden.hdf',

age_group        category
artifact_path    category
entity           category
entity_type      category
event_year          int16
input_draw          int16
measure          category
scenario         category
sex              category
value             float64
location         category
dtype: object

# Calculate model scale

## First read population structure and initial all-state prevalences from the artifact

In [19]:
# This is the number of people in each demographic group in each year --
# these numbers come from the FHS population forecasts
pop_structure = load_artifact_data('population.structure')
pop_structure.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,draw_0,draw_1,draw_2,draw_3,draw_4,draw_5,draw_6,draw_7,draw_8,draw_9,...,draw_490,draw_491,draw_492,draw_493,draw_494,draw_495,draw_496,draw_497,draw_498,draw_499
location,sex,age_start,age_end,year_start,year_end,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
United Kingdom,Male,95.0,125.0,2046,2047,129365.649457,130775.330848,132470.262731,122141.117458,122669.816649,116553.291993,137240.957664,146303.198839,134355.643238,129130.125638,...,129199.724763,133834.848564,128153.198273,126410.768209,108873.787649,129171.958392,132812.650873,145683.572345,153951.857746,133969.822489
United Kingdom,Male,95.0,125.0,2047,2048,129176.872428,131427.429524,133135.096509,122206.362659,121990.475534,116216.514579,137226.41911,147381.840121,134405.091989,128844.029807,...,128743.759762,134732.739968,129357.417043,127365.126985,108980.704141,128693.703184,133430.947564,146604.135918,154047.33539,134464.260699
United Kingdom,Male,95.0,125.0,2048,2049,128215.39423,131705.587524,132988.854403,121848.151327,120928.626569,115242.163772,136633.081048,147552.13904,133626.750369,127966.248102,...,127782.664392,134921.061786,129793.897779,127858.445137,108759.06186,127744.197044,133723.530245,146651.755678,153534.536161,133964.837991
United Kingdom,Male,95.0,125.0,2049,2050,127765.330292,131992.359105,132745.174212,121566.569294,120010.799977,114127.489703,136240.266101,147709.447517,133317.489612,127169.484355,...,127529.973622,135115.785472,130017.37333,128346.358923,108768.814727,127682.196818,134255.13048,146732.723463,153968.447922,134047.815962
United Kingdom,Male,95.0,125.0,2050,2051,128213.393685,133220.130675,133421.143899,122248.029681,119754.120437,113654.996368,136919.776385,148869.875216,133983.846752,126969.448946,...,128519.07973,135934.950475,130583.543525,129590.537377,109219.537772,128856.357034,135299.299123,147386.69597,155815.638277,135717.26801


In [20]:
# For each demographic group, the "population scaling factor" is the
# ratio of the real-world population that we want to simulate in that
# group to the total number of people in that group. For Model 4 and
# above, this equals the initial prevalence of all AD disease states
# combined (preclinical + MCI + AD-dementia), since we are modeling the
# population of people with any stage of AD. Note that this is defined
# for the population at the beginning of the simulation, so there is
# only one year of data.
art_all_states_initial_prev = load_artifact_data('population.scaling_factor')
art_all_states_initial_prev.tail()
# NOTE: This data has two age groups, 95-100 and 100-105, instead of the
# single age group 95-125 that's in the population structure. I'm not
# sure why. I'm going to drop the 100-105 age group and match the 95-100
# age group with the 95-125 age group from above

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,draw_0,draw_1,draw_2,draw_3,draw_4,draw_5,draw_6,draw_7,draw_8,draw_9,...,draw_490,draw_491,draw_492,draw_493,draw_494,draw_495,draw_496,draw_497,draw_498,draw_499
location,sex,age_start,age_end,year_start,year_end,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
United Kingdom,Female,80,85,2025,2026,0.184057,0.205264,0.201615,0.203185,0.178318,0.175798,0.165233,0.152835,0.155335,0.174561,...,0.180658,0.197655,0.150986,0.137985,0.183084,0.153108,0.16389,0.135606,0.13778,0.178453
United Kingdom,Female,85,90,2025,2026,0.249261,0.251477,0.250982,0.279132,0.262662,0.236341,0.308766,0.292557,0.27689,0.286794,...,0.271585,0.290982,0.269875,0.259442,0.278545,0.276393,0.301692,0.294791,0.286761,0.287763
United Kingdom,Female,90,95,2025,2026,0.288238,0.285702,0.254878,0.292849,0.297609,0.266154,0.328095,0.294896,0.276581,0.281861,...,0.269374,0.26359,0.231749,0.22287,0.251981,0.253709,0.288297,0.272311,0.251585,0.264228
United Kingdom,Female,95,100,2025,2026,0.262417,0.263717,0.252117,0.27668,0.262065,0.260805,0.268706,0.25233,0.26228,0.255821,...,0.248522,0.244338,0.248601,0.242992,0.261974,0.263646,0.274336,0.246757,0.253818,0.256388
United Kingdom,Female,100,105,2025,2026,0.253172,0.257814,0.248932,0.272355,0.252686,0.253852,0.252204,0.245571,0.251487,0.24982,...,0.237925,0.215883,0.224073,0.222823,0.256208,0.267994,0.26968,0.248598,0.249409,0.246543


In [21]:
# There's only one year worth of data here
art_all_states_initial_prev.index.unique('year_end')

Int64Index([2026], dtype='int64', name='year_end')

## Now compute initial real-world all-state prevalence counts and model scale

In [22]:
def get_real_world_initial_population(
       population_structure,
       initial_prevalence,
       start_year=2022,
):
    """Multiply the population structure in `start_year` by the initial
    prevalence to get prevalence counts per demographic group.

    `initial_prevalence` must contain exactly one 'year_start' value.
    Its year labels are rewritten to `start_year` / `start_year + 1` so
    that its index lines up with the `start_year` rows of
    `population_structure`, whatever year the prevalence was stored
    under. Rows that do not appear in both inputs are dropped from the
    result.
    """
    prevalence_years = initial_prevalence.index.unique('year_start')
    assert len(prevalence_years) == 1, 'Unexpected years for initial prevalence!'
    prevalence_year = prevalence_years[0]

    # Relabel the prevalence data with the requested start year.
    relabeled_prevalence = initial_prevalence.rename(
        {prevalence_year: start_year}, level='year_start')
    # NOTE: Only works if year_end = year_start + 1
    relabeled_prevalence = relabeled_prevalence.rename(
        {prevalence_year + 1: start_year + 1}, level='year_end')

    # Keep only the population structure for the start year
    start_year_population = population_structure.query(
        "year_start==@start_year")
    # Change end of oldest age group to match prevalence data
    start_year_population = start_year_population.rename(
        {125.0: 100.0}, level='age_end')

    prevalence_counts = start_year_population * relabeled_prevalence
    # Drop age groups we don't have in sim
    return prevalence_counts.dropna()

art_all_states_initial_prev_counts = get_real_world_initial_population(
    pop_structure, art_all_states_initial_prev
)
art_all_states_initial_prev_counts.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,draw_0,draw_1,draw_2,draw_3,draw_4,draw_5,draw_6,draw_7,draw_8,draw_9,...,draw_490,draw_491,draw_492,draw_493,draw_494,draw_495,draw_496,draw_497,draw_498,draw_499
location,sex,age_start,age_end,year_start,year_end,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
United States of America,Male,75.0,80.0,2022,2023,279503.716159,241421.537335,303005.456325,245046.790482,291032.01973,292173.888069,319151.434213,322901.726876,320919.877307,312721.888359,...,340496.388986,183624.796107,303387.652317,257740.083278,264585.258062,286707.666817,288361.928754,281653.836405,283209.988963,237124.856198
United States of America,Male,80.0,85.0,2022,2023,295491.963153,334542.816129,352919.503418,327523.75573,309471.027115,296260.437344,265026.502337,256594.63654,268008.281527,311310.859435,...,328352.211159,321773.632827,270912.872088,241055.920798,321904.097725,272172.292983,286599.286178,235443.951067,230842.759084,276251.278427
United States of America,Male,85.0,90.0,2022,2023,265660.836082,279333.12336,259372.09353,273638.950137,256249.409965,220125.858697,295555.094216,283893.37546,262971.486971,293492.877529,...,304798.665607,274423.347832,285871.062247,260618.321976,271969.277566,292854.234936,324185.862549,293051.854129,265368.527577,293529.464736
United States of America,Male,90.0,95.0,2022,2023,135009.931912,140544.500518,117802.312175,135061.861965,138174.629778,113243.661152,140889.365668,133342.91333,123131.014317,132526.38297,...,129627.305356,108648.748859,105907.707572,95763.096448,111257.383359,119742.388165,139285.777478,121602.893171,101618.235854,117661.784595
United States of America,Male,95.0,100.0,2022,2023,29288.078689,30698.389085,30107.564684,32246.26041,30766.214728,29113.490401,29413.069566,29157.686021,30052.152467,30956.218671,...,32091.208571,26499.605071,31482.881082,29952.989616,32037.054667,33926.196355,34717.989582,29169.491828,28270.193829,31345.61671


In [23]:
# Initial simulated population per draw, from concept model
# TODO: Change this to 100 seeds once we get final runs
num_seeds = 100 # 5 seeds for V&V runs, 100 seeds for final runs
pop_per_seed = 20_000
initial_sim_pop = num_seeds * pop_per_seed

def calculate_model_scale(
        simulated_initial_population,
        real_world_initial_population,
    ):
    """Return the ratio of the simulated initial population to the
    real-world initial population in each location.

    The real-world population is first summed within each location
    (collapsing age groups and any other strata), and the simulated
    population is then divided by those totals. The result has one row
    per location and keeps the columns of the input, i.e. draws laid
    out horizontally with string column names, which is the layout
    Artifacts use.
    """
    # Real-world population at time 0, totalled by location
    location_totals = real_world_initial_population.groupby('location').sum()
    # Model scale = simulated population / real-world population
    return simulated_initial_population / location_totals

# Compute model scale in Artifact format (one row per location, one
# column per draw)
art_model_scale = calculate_model_scale(
    initial_sim_pop, art_all_states_initial_prev_counts)
art_model_scale

Unnamed: 0_level_0,draw_0,draw_1,draw_2,draw_3,draw_4,draw_5,draw_6,draw_7,draw_8,draw_9,...,draw_490,draw_491,draw_492,draw_493,draw_494,draw_495,draw_496,draw_497,draw_498,draw_499
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Brazil,1.182478,1.062553,1.144685,1.078614,1.082753,1.121963,1.003505,1.068713,1.068239,1.220286,...,1.290358,1.060123,1.086799,1.233531,0.984699,1.172564,1.17257,1.039336,1.177133,1.124321
China,0.133424,0.130108,0.114296,0.115621,0.123448,0.119143,0.117581,0.124027,0.115181,0.126465,...,0.11968,0.125693,0.127735,0.134309,0.126757,0.123859,0.124902,0.119387,0.127926,0.123437
Germany,0.863308,0.892693,0.882412,0.87966,0.779863,0.862006,0.874659,0.883173,0.857328,0.946077,...,0.948708,0.848518,0.866878,0.95523,0.949687,0.931259,0.830667,0.891217,0.877104,0.868272
Israel,41.254628,40.310785,40.210343,40.057537,39.166235,44.012209,35.797113,38.209922,41.226551,40.837404,...,41.809532,40.213923,40.18052,45.482468,47.228409,38.099635,38.14532,34.621104,48.494263,40.043924
Japan,0.586866,0.574811,0.618108,0.553387,0.564055,0.540442,0.488238,0.511745,0.570487,0.564432,...,0.536199,0.591534,0.595133,0.650132,0.621924,0.603336,0.566684,0.576934,0.585316,0.56787
Spain,3.211223,3.418895,3.073433,2.871964,3.086059,3.23559,2.882333,2.904501,3.250294,2.987198,...,3.452614,3.070298,3.679101,3.409524,3.012754,3.228092,2.819038,2.934665,3.112781,3.208168
Sweden,13.614881,13.790971,11.796806,11.934329,12.544275,12.395964,11.855079,13.491354,13.650575,12.357657,...,13.678809,12.871456,13.393983,14.436751,12.37591,11.803971,12.87871,12.50728,14.370234,12.398862
Taiwan (Province of China),8.808232,7.720413,9.72566,8.51574,8.443536,8.972456,8.809221,9.256961,7.936908,8.524695,...,8.314534,8.305947,8.938951,9.810819,9.12507,8.416122,8.60279,8.931823,8.469918,8.560164
United Kingdom,2.099057,2.036624,2.077655,2.012311,2.061171,2.03326,2.023333,2.027407,1.98092,1.990865,...,1.905267,1.952344,2.208905,2.390574,2.156457,2.099913,1.927622,2.024944,2.021172,2.005293
United States of America,0.435214,0.429244,0.414357,0.431238,0.427998,0.4446,0.417452,0.416229,0.423805,0.406953,...,0.371698,0.461168,0.424932,0.475658,0.453406,0.434504,0.407813,0.435152,0.469132,0.445144


In [24]:
art_model_scale.T.describe()

location,Brazil,China,Germany,Israel,Japan,Spain,Sweden,Taiwan (Province of China),United Kingdom,United States of America
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.125453,0.123974,0.908049,41.10794,0.590351,3.157002,13.113169,8.80044,2.069689,0.440351
std,0.090995,0.007266,0.054319,3.48876,0.034167,0.206373,0.842793,0.52277,0.095687,0.022943
min,0.886671,0.103631,0.779863,32.489305,0.488238,2.719273,10.915902,7.563766,1.864453,0.371698
25%,1.065282,0.119137,0.86897,38.763905,0.566227,3.012984,12.45694,8.444214,2.003624,0.424886
50%,1.119439,0.123592,0.904125,40.83854,0.589194,3.14298,13.09135,8.792235,2.06502,0.439275
75%,1.185735,0.127938,0.94006,43.150152,0.611063,3.279462,13.654027,9.145342,2.133291,0.454833
max,1.420375,0.15145,1.115194,55.030837,0.694869,3.811998,15.716174,10.479234,2.390574,0.54264


In [25]:
# Reformat model scale to be compatible with simulation output: draws
# vertically in columns or index, as integers

# model_scale = (
#     art_model_scale
#     .rename_axis(columns='input_draw')
#     .pipe(lambda df: df.set_axis(
#         df.columns.str.removeprefix('draw_')
#         .astype(input_draw_dtype), axis=1))
#     .stack()
#     .sort_index()
#     .rename('value')
#     .reset_index()
#     .astype({'location': all_locations_dtype})
# )

def convert_to_sim_format(df, colname_to_dtype=colname_to_dtype):
    """Reshape Artifact-style data into the layout used by sim output.

    The 'draw_N' column labels become integers in an 'input_draw'
    column, the data values are stacked into a single 'value' column,
    a 'year_start' index level is renamed to 'event_year', a
    'year_end' column is dropped if there is one, and every resulting
    column that appears in `colname_to_dtype` is cast to the dtype
    listed there.
    """
    # TODO: Also convert age_start/age_end to age_group
    def draw_number(draw_label):
        # 'draw_12' -> 12
        return int(draw_label.removeprefix('draw_'))

    # Label the column axis and turn the draw names into integers
    wide = df.rename_axis(columns='input_draw').rename(columns=draw_number)
    # Move the draws from the columns into the index
    long_values = (
        wide.stack()
        .rename('value')
        .rename_axis(index={'year_start': 'event_year'})
    )
    # Flatten the index into columns; year_end may or may not exist
    table = long_values.reset_index().drop(columns='year_end', errors='ignore')
    # Only cast the columns that are actually present
    dtypes_present = {
        col: dtype for col, dtype in colname_to_dtype.items() if col in table
    }
    return table.astype(dtypes_present)

model_scale = convert_to_sim_format(art_model_scale)
model_scale

Unnamed: 0,location,input_draw,value
0,Brazil,0,1.182478
1,Brazil,1,1.062553
2,Brazil,2,1.144685
3,Brazil,3,1.078614
4,Brazil,4,1.082753
...,...,...,...
4995,United States of America,495,0.434504
4996,United States of America,496,0.407813
4997,United States of America,497,0.435152
4998,United States of America,498,0.469132


In [26]:
model_scale.dtypes

location      category
input_draw       int16
value          float64
dtype: object

# Define functions to scale measures to real-world values, add rates, and generate final results

In [27]:
def scale_to_real_world(measure, model_scale=model_scale, ops=ops):
    """Convert simulated values in `measure` to real-world values.

    Each value in `measure` is divided by the entry of `model_scale`
    with the same location and draw, broadcasting across the other
    columns of `measure`. Because the simulation runs on a scaled-down
    population, dividing by the model scale gives the value of the
    measure in the real-world population.
    """
    # Restrict the model scale to the draws present in `measure`
    # (dropping NaNs after the division would be an alternative).
    # NOTE: the local name `draws` is referenced by the query string.
    draws = measure['input_draw'].unique()
    scale_for_draws = model_scale.query("input_draw in @draws")
    measure_values = ops.value(measure)
    # NOTE: Reindexing the scale to measure_values.index preserves
    # categoricals (in the location column) but was observed to give
    # all NaN's, so we rely on alignment during the division instead
    scale_values = ops.value(scale_for_draws)
    return (measure_values / scale_values).reset_index()

def calculate_rate(measure, population_structure=pop_structure, ops=ops):
    """Placeholder for converting a measure into a rate.

    Not implemented yet: the body is a no-op, so `measure` is returned
    unchanged and `population_structure` and `ops` are not used.
    """
    # Divide measure by total person time to get rate
    ...
    return measure

def summarize_and_beautify(
        df,
        disease_stage_column=None,
        model_scale=model_scale,
        population_structure=pop_structure,
        ops=ops,
    ):
    """Scale to real-world values, summarize over draws, rename columns
    and category values for presentation, and return the final columns
    in the desired order.

    Parameters
    ----------
    df : DataFrame
        Simulation results to summarize. The 'Metric' column selected
        at the end is not created here, so it is expected to already
        be present in `df`.
    disease_stage_column : str, optional
        Name of the column holding the disease stage; it is renamed to
        'Disease Stage'. Defaults to 'sub_entity'.
    model_scale : DataFrame
        Passed through to `scale_to_real_world`.
    population_structure :
        Currently unused; reserved for the rate calculation, which is
        not implemented yet.
    ops :
        Operator object providing `summarize_draws`; also passed to
        `scale_to_real_world`.

    Returns
    -------
    DataFrame with columns Year, Location, Age, Sex, Disease Stage,
    Scenario, Measure, Metric, Mean, 95% UI Lower, 95% UI Upper.

    A timestamp is printed via `current_time()` before the pipeline
    starts, after scaling, and after summarizing, to show how long
    each step takes.
    """
    def log_time(frame):
        # Print a progress timestamp, then hand the frame back unchanged
        # so this can sit in the middle of a .pipe() chain. Unlike
        # `current_time() or frame`, this does not depend on
        # current_time() returning a falsy value.
        current_time()
        return frame

    # TODO: Calculate and append rates (not implemented yet)

    if disease_stage_column is None:
        disease_stage_column = 'sub_entity'
    # NOTE: To support caller-supplied overrides, build this dict first
    # and call column_name_map.update(overrides) as a separate
    # statement. Chaining `.update(...)` onto the literal binds None,
    # because dict.update() modifies in place and returns None.
    column_name_map = {
        'event_year': 'Year',
        'age_group': 'Age',
        'location': 'Location',
        'sex': 'Sex',
        'scenario': 'Scenario',
        'measure': 'Measure',
        disease_stage_column: 'Disease Stage',
        'mean': 'Mean',
        'lower': '95% UI Lower',
        'upper': '95% UI Upper',
    }

    disease_stage_name_map = {
        'alzheimers_blood_based_biomarker_state': 'Preclinical AD',
        'alzheimers_mild_cognitive_impairment_state': 'MCI due to AD',
        'alzheimers_disease_state' : 'AD Dementia'
    }
    scenario_name_map = {
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment' : 'BBBM Testing and Treatment',
    }
    column_order = [
        'Year', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario',
        'Measure', 'Metric', 'Mean', '95% UI Lower', '95% UI Upper',
    ]
    current_time()
    # Do transformations
    df = (
        df
        # Scale to real-world values
        .pipe(scale_to_real_world, model_scale, ops)
        .pipe(log_time)
        # Summarize data across draws
        .pipe(ops.summarize_draws)
        .reset_index()
        .pipe(log_time)
        # Rename columns, then relabel disease stages and scenarios
        .rename(columns=column_name_map)
        .replace(
            {'Disease Stage': disease_stage_name_map,
             'Scenario': scenario_name_map})
        # Keep only the output columns, in presentation order
        [column_order]
    )
    return df

# Deaths and averted deaths

In [29]:
current_time()
# deaths.entity.unique(): ['alzheimers_disease_state', 'other_causes']
# Filter out other causes when loading since we don't need them
deaths_filter = [('entity', '=', 'alzheimers_disease_state')]
# deaths = load_sim_output('deaths', filters=deaths_filter)

# Stats for reading deaths.parquet:
# 51 seconds to load with 1 location group, 36 MB after concatenation,
# max 939 MB before aggregating seeds
# 60 seconds to load with 3 location groups, 154 MB after concatenation,
# max 306 MB before aggregating seeds
# => 1 location group is used here: faster and smaller after
# concatenation, at the cost of a higher peak memory while loading
deaths = load_measure_from_batch_runs(
    'deaths', batch_run_dirs, locations, n_location_groups=1, filters=deaths_filter
)
print_memory_usage(deaths)
deaths.tail()

2025-11-01 00:49:55.060860
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_39_18
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/arti

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,value,location
1777495,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,deaths,baseline,Male,13502.0,Brazil
1777496,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,deaths,bbbm_testing,Female,26597.0,Brazil
1777497,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,deaths,bbbm_testing,Male,13502.0,Brazil
1777498,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,deaths,bbbm_testing_and_treatment,Female,26801.0,Brazil
1777499,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,deaths,bbbm_testing_and_treatment,Male,13598.0,Brazil


In [30]:
def process_deaths(deaths, ops=ops):
    """Prepare the deaths dataframe for summarizing.

    Keeps only deaths due to AD, computes deaths averted relative to
    the 'baseline' scenario, and returns both stacked in one dataframe
    with presentation-ready measure names and a 'Metric' column equal
    to 'Number'.
    """
    # Keep only deaths due to AD
    ad_deaths = deaths.query("entity=='alzheimers_disease_state'")
    # Averted deaths, relative to the baseline scenario
    averted_deaths = ops.averted(
        ad_deaths, baseline_scenario='baseline'
    ).assign(measure='Averted Deaths Associated with AD')
    # Give the deaths themselves their final measure name
    named_deaths = ad_deaths.assign(measure='Deaths Associated with AD')
    # Stack deaths on top of averted deaths. The inner join keeps only
    # the shared columns, which drops the "subtracted_from" column
    # added by .averted
    # TODO: Concatenate with rates also?
    combined = pd.concat(
        [named_deaths, averted_deaths], join='inner', ignore_index=True)
    return convert_to_categorical(combined.assign(Metric='Number'))

deaths_prepped = process_deaths(deaths)
deaths_prepped.tail()

49.775504 MB measure
16.595504 MB minuend
33.185504 MB subtrahend
10.674555 MB minuend re-indexed
21.339555 MB subtrahend re-indexed
21.339615 MB difference
37.925636 MB difference with reset index
39.110961 MB final difference


Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,value,location,Metric
2962495,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,460,Averted Deaths Associated with AD,bbbm_testing_and_treatment,Male,-93.0,Brazil,Number
2962496,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,Averted Deaths Associated with AD,bbbm_testing,Female,0.0,Brazil,Number
2962497,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,Averted Deaths Associated with AD,bbbm_testing,Male,0.0,Brazil,Number
2962498,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,Averted Deaths Associated with AD,bbbm_testing_and_treatment,Female,-204.0,Brazil,Number
2962499,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,Averted Deaths Associated with AD,bbbm_testing_and_treatment,Male,-96.0,Brazil,Number


## Summarize deaths and save to file

In [31]:
# The deaths data has no 'sub_entity' column, so pass 'entity' as the
# column that holds the disease stage
deaths_output = summarize_and_beautify(deaths_prepped.query(f"location.isin({locations})"), 'entity')
deaths_output

2025-11-01 00:50:47.617940
2025-11-01 00:50:51.848761
2025-11-01 00:52:36.564250


Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2022,Brazil,25_to_29,Female,AD Dementia,Reference,Deaths Associated with AD,Number,0.000000,0.000000,0.000000
1,2022,Brazil,25_to_29,Male,AD Dementia,Reference,Deaths Associated with AD,Number,0.000000,0.000000,0.000000
2,2022,Brazil,25_to_29,Female,AD Dementia,BBBM Testing Only,Deaths Associated with AD,Number,0.000000,0.000000,0.000000
3,2022,Brazil,25_to_29,Male,AD Dementia,BBBM Testing Only,Deaths Associated with AD,Number,0.000000,0.000000,0.000000
4,2022,Brazil,25_to_29,Female,AD Dementia,BBBM Testing and Treatment,Deaths Associated with AD,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
118495,2100,United States of America,95_plus,Male,AD Dementia,BBBM Testing and Treatment,Deaths Associated with AD,Number,7842.425247,5466.617453,10079.971108
118496,2100,United States of America,95_plus,Female,AD Dementia,BBBM Testing Only,Averted Deaths Associated with AD,Number,-16.075361,-512.079385,262.225998
118497,2100,United States of America,95_plus,Male,AD Dementia,BBBM Testing Only,Averted Deaths Associated with AD,Number,-1.280586,-203.198044,159.498288
118498,2100,United States of America,95_plus,Female,AD Dementia,BBBM Testing and Treatment,Averted Deaths Associated with AD,Number,-118.804105,-1110.092956,413.400961


In [32]:
# TODO: Check this
# Spot check: show all summarized death counts for a single stratum
# (Brazil, females aged 80 to 84, in 2055) across scenarios and measures
deaths_output.loc[
    (deaths_output['Year'] == 2055)
    & (deaths_output['Age'] == '80_to_84')
    & (deaths_output['Sex'] == 'Female')
    & (deaths_output['Disease Stage'] == 'AD Dementia')
    & (deaths_output['Metric'] == 'Number')
    & (deaths_output['Location'] == 'Brazil')
]

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
87230,2055,Brazil,80_to_84,Female,AD Dementia,Reference,Deaths Associated with AD,Number,41914.762614,30009.160489,54284.733239
87232,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing Only,Deaths Associated with AD,Number,41776.01937,30009.160489,54035.348607
87234,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing and Treatment,Deaths Associated with AD,Number,41042.432826,29362.896384,53239.191126
87236,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing Only,Averted Deaths Associated with AD,Number,138.743243,0.0,1457.949701
87238,2055,Brazil,80_to_84,Female,AD Dementia,BBBM Testing and Treatment,Averted Deaths Associated with AD,Number,872.329787,158.493868,2500.727196


In [33]:
deaths_output.to_csv(output_dir / "deaths.csv", index=False)

# DALYs

In [34]:
# ylls.entity.unique(): ['alzheimers_disease_state', 'other_causes']
# Filter out other causes when loading since we don't need them
ylls_filter = [('entity', '==', 'alzheimers_disease_state')]
# ylls = load_sim_output('ylls', filters=ylls_filter)

ylls = load_measure_from_batch_runs(
    'ylls', batch_run_dirs, locations, n_location_groups=1, filters=ylls_filter
)
print(len(ylls), 'rows')
print_memory_usage(ylls)

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_39_18
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/sweden.hdf',

703.901841 MB after read_parquet
4.271634 MB after aggregating seeds and converting dtypes
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_41_39
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt

In [35]:
# ylds.entity.unique():
# ['alzheimers_disease_and_other_dementias', 'treatment', 'all_causes']
# Keep only YLDs due to AD when loading
ylds_filter = [('entity', '==', 'alzheimers_disease_and_other_dementias')]
# ylds = load_sim_output('ylds', filters=ylds_filter)

# Stats for reading ylds.parquet:
# 2m 48s to load with 1 location group, 112 MB after concatenation,
# max 2816 MB before aggregating seeds
# 3m 44s to load with 3 location groups, 466 MB after concatenation,
# max 892 MB before aggregating seeds
ylds = load_measure_from_batch_runs(
    'ylds', batch_run_dirs, locations, n_location_groups=1, filters=ylds_filter
)
print(len(ylds), 'rows')
print_memory_usage(ylds)

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_39_18
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/sweden.hdf',

2111.682914 MB after read_parquet
13.444865 MB after aggregating seeds and converting dtypes
/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_41_39
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/m

In [36]:
ylds.tail()

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,sub_entity,value,location
5332495,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2100,499,ylds,bbbm_testing_and_treatment,Female,alzheimers_mild_cognitive_impairment_state,733.93126,Brazil
5332496,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2100,499,ylds,bbbm_testing_and_treatment,Female,alzheimers_disease_state,41801.804723,Brazil
5332497,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2100,499,ylds,bbbm_testing_and_treatment,Male,alzheimers_blood_based_biomarker_state,0.0,Brazil
5332498,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2100,499,ylds,bbbm_testing_and_treatment,Male,alzheimers_mild_cognitive_impairment_state,390.033972,Brazil
5332499,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_and_other_dementias,cause,2100,499,ylds,bbbm_testing_and_treatment,Male,alzheimers_disease_state,19666.460225,Brazil


In [37]:
ylds.entity.unique()

['alzheimers_disease_and_other_dementias']
Categories (3, object): ['all_causes', 'alzheimers_disease_and_other_dementias', 'treatment']

In [38]:
ylls.tail()

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,value,location
1777495,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,ylls,baseline,Male,106200.27726,Brazil
1777496,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,ylls,bbbm_testing,Female,211690.810202,Brazil
1777497,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,ylls,bbbm_testing,Male,106200.27726,Brazil
1777498,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,ylls,bbbm_testing_and_treatment,Female,213301.006534,Brazil
1777499,95_plus,/mnt/team/simulation_science/pub/models/vivari...,alzheimers_disease_state,cause,2100,499,ylls,bbbm_testing_and_treatment,Male,106922.406785,Brazil


In [39]:
ylls.entity.unique()

['alzheimers_disease_state']
Categories (2, object): ['alzheimers_disease_state', 'other_causes']

In [None]:
def process_dalys(ylls, ylds, ops=ops):
    """Combine YLLs and YLDs into DALYs and averted DALYs.

    Both inputs are filtered to AD and given the same entity name so
    that the VPHResults object adds YLLs and YLDs together instead of
    keeping them separate. The returned dataframe stacks DALYs and
    DALYs averted relative to the 'baseline' scenario, with
    presentation-ready measure names and a 'Metric' column equal to
    'Number'.
    """
    # YLLs due to AD, renamed to an arbitrary shared disease name, with
    # a sub_entity column to specify the disease stage
    ad_ylls = (
        ylls
        .query("entity=='alzheimers_disease_state'")
        .replace({'entity': {'alzheimers_disease_state': 'AD'}})
        .assign(sub_entity='alzheimers_disease_state')
    )
    # Assign 0 YLLs to the MCI state so that when we sum with YLDs,
    # DALYs for MCI will equal YLDs. Without these 0's, the sum would
    # aggregate across disease states instead of keeping them separate.
    mci_zero_ylls = ad_ylls.assign(
        sub_entity='alzheimers_mild_cognitive_impairment_state',
        value=0.0
    )
    ylls = convert_to_categorical(pd.concat([ad_ylls, mci_zero_ylls]))

    # YLDs due to AD, renamed to the same arbitrary disease name
    ad_ylds = (
        ylds
        .query("entity=='alzheimers_disease_and_other_dementias'")
        .replace({'entity': {'alzheimers_disease_and_other_dementias': 'AD'}})
    )
    ylds = convert_to_categorical(ad_ylds)

    # Let a VPHResults object calculate DALYs, then compress the result
    results = VPHResults(ylls=ylls, ylds=ylds, ops=ops)
    dalys = convert_to_categorical(results.get_burden('dalys'))

    # Averted DALYs, relative to the baseline scenario
    averted_dalys = ops.averted(
        dalys, baseline_scenario='baseline'
    ).assign(measure='Averted DALYs Associated with AD')
    # Give the DALYs themselves their final measure name
    named_dalys = dalys.assign(measure='DALYs Associated with AD')
    # Stack DALYs on top of averted DALYs. The inner join keeps only
    # the shared columns, which drops the "subtracted_from" column
    # added by .averted
    # TODO: Concatenate with rates also?
    combined = pd.concat(
        [named_dalys, averted_dalys], join='inner', ignore_index=True)
    return convert_to_categorical(combined.assign(Metric='Number'))

In [41]:
dalys = process_dalys(ylls, ylds)
print_memory_usage(dalys)
print(len(dalys), 'rows')
dalys.tail()

101.326919 MB measure
48.001787 MB minuend
95.994287 MB subtrahend
33.782511 MB minuend re-indexed
67.555011 MB subtrahend re-indexed
67.555071 MB difference
67.554419 MB difference with reset index
71.109744 MB final difference
177.759698 MB 
8887500 rows


Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,measure,sex,sub_entity,input_draw,scenario,location,value,Metric
8887495,95_plus,/mnt/team/simulation_science/pub/models/vivari...,AD,cause,2100,Averted DALYs Associated with AD,Female,alzheimers_disease_state,499,bbbm_testing_and_treatment,Brazil,-1946.250193,Number
8887496,95_plus,/mnt/team/simulation_science/pub/models/vivari...,AD,cause,2100,Averted DALYs Associated with AD,Female,alzheimers_mild_cognitive_impairment_state,499,bbbm_testing_and_treatment,Brazil,-14.939114,Number
8887497,95_plus,/mnt/team/simulation_science/pub/models/vivari...,AD,cause,2100,Averted DALYs Associated with AD,Male,alzheimers_blood_based_biomarker_state,499,bbbm_testing_and_treatment,Brazil,0.0,Number
8887498,95_plus,/mnt/team/simulation_science/pub/models/vivari...,AD,cause,2100,Averted DALYs Associated with AD,Male,alzheimers_disease_state,499,bbbm_testing_and_treatment,Brazil,-874.528811,Number
8887499,95_plus,/mnt/team/simulation_science/pub/models/vivari...,AD,cause,2100,Averted DALYs Associated with AD,Male,alzheimers_mild_cognitive_impairment_state,499,bbbm_testing_and_treatment,Brazil,-5.937031,Number


## Do some quick checks

In [42]:
# Verify that DALYs == YLDs except in AD dementia state
# df1: non-averted DALYs outside the AD dementia state
# df2: YLDs outside the AD dementia state
# Columns that differ by construction (entity, measure, Metric) are
# dropped so that the two frames are comparable
df1 = dalys.query("sub_entity!='alzheimers_disease_state' and ~measure.str.contains('Averted')").drop(columns=['entity', 'measure', 'Metric'])
df2 = ylds.query("sub_entity!='alzheimers_disease_state'").drop(columns=['entity', 'measure'])
# NOTE(review): ops.compare_values presumably returns only the rows where
# the two frames disagree; the assert treats an empty result as "no
# differences" -- confirm against vivarium_helpers
temp = ops.compare_values(df1, df2)
assert len(temp) == 0, 'DALYs differ from YLDs in MCI or BBBM state!'
temp 


age_group,artifact_path,entity_type,event_year,input_draw,location,scenario,sex,sub_entity


In [43]:
# Check that DALYs are always greater than or equal to YLDs
# (entity, measure and Metric are dropped so the two frames can be
# subtracted from each other)
df1 = dalys.query("~measure.str.contains('Averted')").drop(columns=['entity', 'measure', 'Metric'])
df2 = ylds.drop(columns=['entity', 'measure'])
assert ((ops.value(df1) - ops.value(df2)) >= 0).value.all(), "DALYs are less than YLDs!"

## Summarize DALYs and save to file

In [44]:
dalys_output = summarize_and_beautify(dalys)
print_memory_usage(dalys_output)
print(len(dalys_output), 'rows')
dalys_output.tail()

2025-11-01 00:57:33.992076
2025-11-01 00:57:45.069457
2025-11-01 01:02:57.321016
34.986927 MB 
355500 rows


Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
355495,2100,United States of America,95_plus,Female,AD Dementia,BBBM Testing and Treatment,Averted DALYs Associated with AD,Number,-835.392332,-1889.98097,195.11613
355496,2100,United States of America,95_plus,Female,MCI due to AD,BBBM Testing and Treatment,Averted DALYs Associated with AD,Number,-9.988945,-30.675534,9.342266
355497,2100,United States of America,95_plus,Male,Preclinical AD,BBBM Testing and Treatment,Averted DALYs Associated with AD,Number,0.0,0.0,0.0
355498,2100,United States of America,95_plus,Male,AD Dementia,BBBM Testing and Treatment,Averted DALYs Associated with AD,Number,-266.617601,-680.819337,92.190045
355499,2100,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,Averted DALYs Associated with AD,Number,-3.376825,-11.222019,4.071602


In [45]:
dalys_output.to_csv(output_dir / 'dalys.csv', index=False)

# Print when notebook finished running

In [46]:
current_time()

!date

2025-11-01 01:03:00.286596
Sat Nov  1 01:03:00 PDT 2025
