In [1]:
import pandas as pd
import numpy as np, os
import matplotlib.pyplot as plt

from pathlib import Path
import yaml
import re
import datetime

import pyarrow as pa
import pyarrow.parquet as pq

import gbd_mapping as gm
from vivarium import Artifact

from db_queries import get_ids, get_outputs, get_population, get_covariate_estimates
from get_draws.api import get_draws

import vivarium_helpers as vh
import vivarium_helpers.id_helper as idh
from vivarium_helpers.vph_output.operations import VPHOperator
from vivarium_helpers.utils import convert_to_categorical, constant_categorical, print_memory_usage

!date
!whoami
!pwd

Thu Nov 13 00:21:08 PST 2025
ndbs
/mnt/share/code/ndbs/vivarium_research_alzheimers/results_tables


# Define data directories

In [2]:
# Project directory
project_dir = Path('/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/')

# Output directory to store final results for client
output_dir = Path(r"J:\Project\simulation_science\alzheimers\results_in_progress".replace('\\', '/').replace('J:', '/snfs1'))
print(output_dir.exists())

# Output from multistate life table
mslt_output_dir = output_dir.parent / 'results_10_31_2025_mslt'
print(mslt_output_dir.exists())
output_dir

True
True


PosixPath('/snfs1/Project/simulation_science/alzheimers/results_in_progress')

In [3]:
# For testing: Run directory containing model 8.3 results for all
# locations
model_run_subdir = 'results/abie_consistent_model_test/united_states_of_america/2025_10_28_08_55_05/'

# Results directory for model 8.3, for testing
results_dirs = project_dir / model_run_subdir / 'results/'

# Artifact for models 8.3 - 8.7
artifact_model_number = '8.3'


In [4]:
# Model 8.4 results (final run for 10/31 intermediate results) 
# Each batch run contains all locations, 100 random seeds, and 3 or 4 draws
batch_run_dirs_8_4 = [
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_39_18',
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_41_39', # deduplicated
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_29_20_45_13', # 4 draws
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_30_14_03_51',
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_30_16_32_03',
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_30_17_25_38',
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_31_01_03_40',
    '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.4/model_spec/2025_10_31_01_09_31',
]

# Model 8.7 results (updated final run completed on 11/10/2025)
model_run_dir_8_7 = Path('/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec')

# Batch runs are stored in "timestamp" subdirectories of the model run
# directory
with os.scandir(model_run_dir_8_7) as entries:
    batch_run_dirs_8_7 = [entry.path for entry in entries]

batch_run_dirs = batch_run_dirs_8_7[:] # filter for testing

batch_results_dirs = []
for run_dir in batch_run_dirs:
    if run_dir.endswith('2025_10_29_20_41_39'):
        # One batch had to be deduplicated
        results_dir = run_dir + '/deduplicated_results'
    else:
        results_dir = run_dir + '/results'
    batch_results_dirs.append(results_dir)
batch_results_dirs

batch_results_dirs


['/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_05_15_36_29/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_06_06_42_37/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_07_09_05_46/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_10_07_44_18/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_06_06_43_53/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_06_13_00_54/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_05_16_28_26/results',
 '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_07_13_14_27/results',
 '/mnt/t

In [5]:
locations = [
    'United States of America',
    'Brazil',
    'China',
    'Germany',
    'Israel',
    'Japan',
    'Spain',
    'Sweden',
    'Taiwan (Province of China)',
    'United Kingdom',
]

def get_results_and_artifact_dicts(
        locations, results_dirs, artifact_model_number, project_dir):

    match results_dirs:
        case str() | Path():
            # Option 1: All locations concatenated in one results
            # directory
            location_to_results_dir = {'all': results_dirs}
        case list():
            # Option 2: One results directory per location
            location_to_results_dir = {
                loc: path for loc, path in zip(locations, results_dirs)}

    location_to_artifact_subdir = {
        loc: loc.lower().replace(' ', '_') for loc in locations}
    artifact_subpaths = [
        f'artifacts/model{artifact_model_number}/' + subdir + '.hdf' 
        for subdir in location_to_artifact_subdir.values()]

    location_to_artifact_path = {
        # Make sure artifact directory is stored as a string, not a Path
        # object, since it'll be stored as a string in the simulation
        # output, and we'll need to reverse this dict to map from
        # directories to locations
        loc: str(project_dir / subpath) for loc, subpath
        in zip(locations, artifact_subpaths)}

    return location_to_results_dir, location_to_artifact_path

location_to_results_dir, location_to_artifact_path = get_results_and_artifact_dicts(
    locations, results_dirs, artifact_model_number, project_dir
)
# This is needed to assign locations in the model results
location_to_artifact_path

{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf',
 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf',
 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf',
 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf',
 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf',
 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf',
 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf',
 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/sweden.hdf',
 'Taiwan (Province of China)': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifact

# Define functions to load simulation results

### First, define some integer or ordered categorical data types for certain columns to save memory and make things work better

E.g., years work better as integers not strings.

In [6]:
# Order locations lexicographically
all_locations = [
    'United States of America',
    'Brazil',
    'China',
    'Germany',
    'Israel',
    'Japan',
    'Spain',
    'Sweden',
    'Taiwan (Province of China)',
    'United Kingdom',
]
all_locations_dtype = pd.CategoricalDtype(sorted(all_locations), ordered=True)

# int16 ranges from -32768 to 32767 (I think), which is sufficient to
# represent all years 2025-2100. uint8 only goes from 0 to 255, which is
# too small.
year_dtype = 'int16'

# Store draws as ints instead of categoricals since we'll be
# concatenating different draws from different results directories
input_draw_dtype = 'int16'

# Order age groups chronologically
age_groups = [f'{age}_to_{age + 4}' for age in range(25, 95, 5)] + ['95_plus']
age_group_dtype = pd.CategoricalDtype(age_groups, ordered=True)

# Order scenarios by complexity
scenarios = ['baseline', 'bbbm_testing', 'bbbm_testing_and_treatment']
scenario_dtype = pd.CategoricalDtype(scenarios, ordered=True)

colname_to_dtype = {
    'location': all_locations_dtype,
    'event_year': year_dtype,
    'age_group': age_group_dtype,
    'scenario': scenario_dtype,
    'input_draw': input_draw_dtype,
}

## Function to load results for all locations and aggregate random seeds

In [7]:
# Create an operator object - treat each random seed as a separate draw,
# and add location to the index
ops = VPHOperator(location_col=True)
# ops.index_cols.extend(['location', 'random_seed'])

def load_sim_output(
        measure,
        results_dict=location_to_results_dir,
        # Pass None to skip filtering locations (when None, must also
        # pass assign_location=False or raw=True)
        location_to_artifact_path=location_to_artifact_path,
        # specify dtypes of certain columns
        colname_to_dtype=colname_to_dtype,
        drop_superfluous_cols=True, # drop redundant or empty columns
        # Sets the 'read_dictionary' key of kwargs, which is passed to
        # pyarrow.parquet.read_table()
        force_parquet_dictionaries=True,
        force_pandas_categoricals=True,
        aggregate_seeds=True,
        assign_location=True,
        raw=False, # Overrides other parameters if True
        **kwargs, # keyword args to pass to .read_parquet
    ):
    """Load simulation output from .parquet files for all locations,
    optionally reducing the size of the data when possible. Returns
    concatenated outputs with a 'location' column added.
    """
    # Override optional transformations if raw=True
    if raw:
        drop_superfluous_cols = False
        force_parquet_dictionaries = False
        force_pandas_categoricals = False
        aggregate_seeds = False
        assign_location = False

    # Determine whether results for all locations are stored in same
    # directory, or if different locations have different results
    # directories
    match location_to_results_dir:
        case {'all': _}:
            all_locations_together = True
        case _:
            all_locations_together = False
    
    if all_locations_together and assign_location and location_to_artifact_path is None:
        raise ValueError(
            "Must provide mapping of artifacts to locations  when" \
            " assign_location=True and all locations are concatenated" \
            " in the simulation outputs."
        )

    dfs = []
    for location, directory in results_dict.items():

        parquet_file_path = Path(directory) / f'{measure}.parquet'
        # Read the Parquet file's schema to get column names and data types
        parquet_schema = pq.read_schema(parquet_file_path)

        if (
            all_locations_together
            and location_to_artifact_path is not None
        ):
            if 'artifact_path' in parquet_schema.names:
                # Filter to locations in list
                location_filter = (
                    'artifact_path',
                    'in',
                    list(location_to_artifact_path.values()),
                )
                user_filters = kwargs.get('filters') # Defaults to None
                kwargs['filters'] = add_parquet_AND_filter(
                    location_filter, user_filters)
                # TODO: Use logging not printing
                print(location_filter)
            else:
                print("'artifact_path' column missing from parquet file."
                      " Not filtering locations.")

        if force_parquet_dictionaries:
            # Read all columns as dictionaries except those containing 
            # floating point values
            kwargs['read_dictionary'] = [
                col.name for col in parquet_schema
                if not pa.types.is_floating(col.type)]

        # Read the parquet file
        df = pd.read_parquet(parquet_file_path, **kwargs)
        print_memory_usage(df, 'after read_parquet')

        if drop_superfluous_cols:
            # Drop redundant columns
            for col1, col2 in [
                ('input_draw', 'input_draw_number'),
                ('entity', 'sub_entity'),
            ]:
                if (col1 in df and col2 in df and df[col1].equals(df[col2])):
                    df.drop(columns=col2, inplace=True)
            # Drop empty columns (e.g., sub-entity)
            for col in df:
                if df[col].isna().all():
                    df.drop(columns=col, inplace=True)
        if colname_to_dtype is not None:
            df = df.astype(
                # Filter to avoid KeyError
                {c: dtype for c, dtype
                 in colname_to_dtype.items() if c in df},
                 # NOTE: If copy-on-write is enabled, copy keyword is
                 # ignored
                 copy=False)
        if force_pandas_categoricals:
            convert_to_categorical(
                df, exclude_cols=colname_to_dtype or (), inplace=True)
        if aggregate_seeds:
            # Use default index and value columns when aggregating
            df = vh.vph_output.operations.marginalize(df, 'random_seed')
        if assign_location:
            if all_locations_together:
                # NOTE: location_to_artifact_path is guaranteed not to
                # be None because assign_location and
                # all_locations_together are both True

                # Find or create a Categorical dtype with all locations
                location_dtype = colname_to_dtype.get(
                    'location',
                    pd.CategoricalDtype(
                        sorted(location_to_artifact_path.keys()), ordered=True)
                )
                # Invert the dictionary so we can map artifact paths to
                # locations
                artifact_path_to_location = {
                    path: loc for loc, path
                    in location_to_artifact_path.items()}
                if 'artifact_path' in df:
                    df['location'] = df['artifact_path'].map(
                        artifact_path_to_location).astype(location_dtype)
                else:
                    # In case the engineers change the DataFrame format
                    # on us...
                    print("'artifact_path' column missing from DataFrame."
                          " Not assigning locations.")
            else:
                # NOTE: location_to_results_dir contains actual
                # locations as keys (not 'all') since
                # all_locations_together is False

                # Find or create a Categorical dtype with all locations
                # to avoid converting back to object dtype.
                location_dtype = colname_to_dtype.get(
                    'location',
                    pd.CategoricalDtype(
                        sorted(location_to_results_dir.keys()), ordered=True)
                )
                df['location'] = location
                df['location'] = df['location'].astype(location_dtype)
        dfs.append(df)
    # TODO: Maybe if assign_location is False and all_locations_together
    # is also False (and there is more than one location?), we should
    # return a dict mapping locations to dataframes (or just a list of
    # dataframes?) instead of concatenating, since it won't be possible
    # to filter the resulting concatenated dataframe by location...
    df = pd.concat(dfs, ignore_index=True)
    return df
    
def add_parquet_AND_filter(new_filter, existing_filters):
    match existing_filters:
        case None:
            # No existing filters -- create a single AND group
            filters = [new_filter]
        case list([tuple((_, _, _)), *_]):
            # Existing filters consist of one AND group -- add the new filter
            filters = [new_filter, *existing_filters]
        case list([list([tuple((_, _, _)), *_]), *_]):
            # Add the filter to each AND group in the outer OR group
            filters = [[new_filter, *and_group] for and_group in existing_filters]
        case _:
            raise ValueError(f"Malformed parquet filter: {existing_filters}")
    return filters

def current_time():
    print(datetime.datetime.now())

class Timer:
    """Simple class to time code blocks using a context manager.
    Code modified from: https://stackoverflow.com/a/79354757/24446049
    """
    def __enter__(self):
        self._enter_time = datetime.datetime.now()

    def __exit__(self, *exc_args):
        self._exit_time = datetime.datetime.now()
        print(f"Elapsed time: {self._exit_time - self._enter_time}")

## Function to load and concatenate runs from multiple batches

In [8]:
def load_measure_from_batch_runs(
        measure,
        batch_results_dirs,
        locations=locations,
        # Allow loading locations in multiple groups to save memory
        n_location_groups=1,
        filter_burn_in_years=True,
        colname_to_dtype=colname_to_dtype,
        project_dir=project_dir,
        **kwargs
    ):
    """Load data from multiple batch runs, aggregate random seeds, and
    concatenate.
    """
    # aggregate seeds by default, and warn if False was passed
    if not kwargs.setdefault('aggregate_seeds', True):
        # Documentation for setdefault: If key is in the dictionary,
        # return its value. If not, insert key with a value of default
        # and return default.
        print("Warning: Not aggregating seeds, which may require lots of memory")
    if filter_burn_in_years:
        # Filter out years before 2025 because for model 8.4, years
        # 2022-2024 are for burn-in
        year_filter = ('event_year', '>=', '2025')
        # Add the year filter to the user filters
        user_filters = kwargs.get('filters') # Defaults to None
        kwargs['filters'] = add_parquet_AND_filter(year_filter, user_filters)

    dfs = []
    for results_dir in batch_results_dirs:
        print(results_dir)
        for i in range(n_location_groups):
            location_group = locations[i::n_location_groups]
            # print(location_group)
            location_to_results_dir, location_to_artifact_path = get_results_and_artifact_dicts(
                location_group, results_dir, artifact_model_number, project_dir
            )
            print(location_to_artifact_path)
            df = load_sim_output(
                measure, location_to_results_dir, location_to_artifact_path, colname_to_dtype, **kwargs
            )
            print_memory_usage(df, 'after aggregating seeds and converting dtypes')
            dfs.append(df)
    measure_df = pd.concat(dfs, ignore_index=True)
    print_memory_usage(measure_df, 'total')
    measure_df = measure_df.astype(colname_to_dtype)
    print_memory_usage(measure_df, 'after enforcing dtypes')
    return measure_df

# Load one artifact and define age bins

Looks like this is used in the model scale calculation, I'm not sure
whether it's necessary...

In [9]:
usa_artifact_path = location_to_artifact_path['United States of America']
usa_art = Artifact(usa_artifact_path)
print(usa_art.load('metadata.locations'))

# age_bins is an empty DataFrame with a MultiIndex storing age group data
age_bins = usa_art.load('population.age_bins')
age_dictionary = (
    age_bins
    .reset_index()
    .assign(age_group=lambda df: df['age_group_name'].str.replace(' ', '_'))
    # Filter to ages that actually appear in our sim
    .query("age_start >= 25")
)
# age_dictionary

['United States of America']


# Calculate model scale

In [10]:
start_year = 2022 # We changed from 2025 to 2022 in model 8.4
scale = pd.DataFrame()
for location in locations:
    artifact_path = location_to_artifact_path[location]
    art = Artifact(artifact_path)
    temp = art.load('population.structure').reset_index() 
    temp['location'] = location
    
    df_prev_pop = pd.merge(
        (
            art.load('population.scaling_factor').query("year_start == 2025")
            .rename({2025: start_year}, level='year_start')
            # NOTE: Only works if year_end = year_start + 1
            .rename({2026: start_year + 1}, level='year_end')
        ),
        art.load('population.structure').query("year_start==2025").droplevel(['year_start', 'year_end']),
        left_index=True,
        right_index=True,
        suffixes=['_prev', '_pop']
    )
    prev = ((df_prev_pop.filter(like='draw_').filter(like='_prev')
            * df_prev_pop.filter(like='draw_').filter(like='_pop').values).mean(axis=1)).sum(axis=0)
    # TODO: use draw-specific scale instead of mean
    
    # NOTE: Make sure population is correct! 100_000 for V&V runs, 20x
    # as large for final runs
    ratio = 20 * 100_000 / prev
    print(ratio, '\t', location)

    temp['ratio'] = ratio

    temp = temp.rename(columns={'year_start': 'event_year'})
    temp = temp.merge(age_dictionary, on=['age_start','age_end']) # Is this necessary?
    mini = temp.loc[temp['event_year'] == 2050]
    for year in range(2051, 2100):
        temp = pd.concat([temp, mini.assign(event_year=year)], ignore_index=True)
    scale = pd.concat([scale, temp], ignore_index=True)
scale.head()

0.413679834666799 	 United States of America
1.0255107074337457 	 Brazil
0.11064180130844312 	 China
0.8796601347851242 	 Germany
37.64460664479873 	 Israel
0.6024041920670191 	 Japan
3.134223403777501 	 Spain
12.492014266675683 	 Sweden
8.253950402276727 	 Taiwan (Province of China)
1.9844601234266048 	 United Kingdom


Unnamed: 0,location,sex,age_start,age_end,event_year,year_end,draw_0,draw_1,draw_2,draw_3,...,draw_494,draw_495,draw_496,draw_497,draw_498,draw_499,ratio,age_group_id,age_group_name,age_group
0,United States of America,Female,25.0,30.0,2021,2022,11211250.0,11669210.0,11351830.0,10881110.0,...,10920460.0,11323230.0,11662140.0,11038690.0,10522820.0,10899720.0,0.41368,10,25 to 29,25_to_29
1,United States of America,Female,25.0,30.0,2022,2023,11211130.0,11660510.0,11357430.0,10881320.0,...,10930500.0,11325370.0,11652530.0,11041420.0,10530330.0,10909250.0,0.41368,10,25 to 29,25_to_29
2,United States of America,Female,25.0,30.0,2023,2024,11213620.0,11653960.0,11366900.0,10885900.0,...,10940210.0,11329190.0,11643920.0,11047920.0,10544490.0,10922530.0,0.41368,10,25 to 29,25_to_29
3,United States of America,Female,25.0,30.0,2024,2025,11223640.0,11659970.0,11388270.0,10901210.0,...,10961140.0,11341260.0,11644920.0,11065020.0,10568790.0,10948600.0,0.41368,10,25 to 29,25_to_29
4,United States of America,Female,25.0,30.0,2025,2026,11250620.0,11682560.0,11429710.0,10935410.0,...,10999470.0,11372270.0,11666910.0,11101230.0,10608270.0,10993980.0,0.41368,10,25 to 29,25_to_29


# BBBM Test Counts

In [11]:
def dataframe_beutification_and_summarizing(df, measure_name):

    # Add in the scale factor multiplication
    df['event_year'] = df['event_year'].astype(int)
    df = df.merge(
        scale[['location','sex','age_group','ratio','event_year']],
        on=['location','sex','age_group','event_year'])
    df['value'] = df['value'] / df['ratio']    

    # Need to set this up for number and rate to be included 
    df['Metric'] = 'Number'
    # FIXME: Calculate rate for real
    # df_rate = df.copy()
    # df_rate['value'] = df_rate['value'] / 100_000
    # df_rate['Metric'] = 'Rate per 100,000'
    # df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex':'Sex',
                            'scenario':'Scenario',
                            'sub_entity':'Disease Stage'})
    df['Measure'] = measure_name
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment' : 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = 'Preclinical AD'

    # Now we summarize the data
    df = df.groupby(['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric', 'input_draw'], observed=True).value.sum().reset_index()
    df = df.groupby(['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric'], observed=True).value.describe(percentiles=[0.025,0.975]).reset_index()

    df = df.rename(columns={'mean': 'Mean',
                            '2.5%': '95% UI Lower',
                            '97.5%': '95% UI Upper'})

    #Reorder the columns in df 
    column_order = ['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric', 'Mean', '95% UI Lower', '95% UI Upper'] 
    df = df[column_order]

    return df

In [None]:
# # Load model 8.3 for testing
# bbbm_tests = load_sim_output(
#     'counts_bbbm_tests',
#     )

# Load model 8.4 in batches
# 43s to load 2 batches using 1 location group
# 3.4 GB maximum memory for reading parquet
# 30 MB final dataframe for 2 batches (7 draws)
# 2m 38s to load all 9 batches of model 8.7 (25 draws)
with Timer():
    bbbm_tests = load_measure_from_batch_runs(
        'counts_bbbm_tests', batch_results_dirs, locations, n_location_groups=1
    )
print(len(bbbm_tests), 'rows')
bbbm_tests.head()

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_05_15_36_29/results
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/swed

Unnamed: 0,age_group,artifact_path,bbbm_test_results,entity,entity_type,event_year,input_draw,measure,scenario,sex,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,not_tested,bbbm_testing,testing,2025,169,counts_bbbm_tests,baseline,Female,0.0,Japan
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,not_tested,bbbm_testing,testing,2025,169,counts_bbbm_tests,baseline,Male,0.0,Japan
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,not_tested,bbbm_testing,testing,2025,169,counts_bbbm_tests,bbbm_testing,Female,0.0,Japan
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,not_tested,bbbm_testing,testing,2025,169,counts_bbbm_tests,bbbm_testing,Male,0.0,Japan
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,not_tested,bbbm_testing,testing,2025,169,counts_bbbm_tests,bbbm_testing_and_treatment,Female,0.0,Japan


In [None]:
# 2m 44s for 2 batches (7 draws)
# 2m 52s for all batches (25 draws)
# 3m 48s for all 9 batches of model 8.7 (25 draws)
with Timer():
    bbbm_tests_final = dataframe_beutification_and_summarizing(bbbm_tests, 'BBBM Test Counts')

Elapsed time: 0:03:47.517755


In [14]:
bbbm_tests_final['Year ID'].unique()

array([2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035,
       2036, 2037, 2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046,
       2047, 2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057,
       2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068,
       2069, 2070, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079,
       2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090,
       2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099])

In [15]:
bbbm_tests_final.loc[(bbbm_tests_final['Year ID'] == 2050) & (bbbm_tests_final['Age'] == '65_to_69') & (bbbm_tests_final['Sex'] == 'Female') & (bbbm_tests_final['Metric'] == 'Number')]

## Need to find an old V&V value to compare to. Can't find one quickly.

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
22548,2050,Brazil,65_to_69,Female,Preclinical AD,Reference,BBBM Test Counts,Number,0.0,0.0,0.0
22549,2050,Brazil,65_to_69,Female,Preclinical AD,BBBM Testing Only,BBBM Test Counts,Number,23586.374891,12340.387973,38868.048584
22550,2050,Brazil,65_to_69,Female,Preclinical AD,BBBM Testing and Treatment,BBBM Test Counts,Number,23586.374891,12340.387973,38868.048584
22638,2050,China,65_to_69,Female,Preclinical AD,Reference,BBBM Test Counts,Number,0.0,0.0,0.0
22639,2050,China,65_to_69,Female,Preclinical AD,BBBM Testing Only,BBBM Test Counts,Number,165387.039831,67057.838107,297925.373685
22640,2050,China,65_to_69,Female,Preclinical AD,BBBM Testing and Treatment,BBBM Test Counts,Number,165387.039831,67057.838107,297925.373685
22728,2050,Germany,65_to_69,Female,Preclinical AD,Reference,BBBM Test Counts,Number,0.0,0.0,0.0
22729,2050,Germany,65_to_69,Female,Preclinical AD,BBBM Testing Only,BBBM Test Counts,Number,8485.186159,4544.482399,12364.320685
22730,2050,Germany,65_to_69,Female,Preclinical AD,BBBM Testing and Treatment,BBBM Test Counts,Number,8485.186159,4544.482399,12364.320685
22818,2050,Israel,65_to_69,Female,Preclinical AD,Reference,BBBM Test Counts,Number,0.0,0.0,0.0


In [16]:
# NEED TO ADD IN BBBM TESTS FROM MSLT HERE
!ls $mslt_output_dir

2025_10_31_false_positive_bbbm_tests_final.csv
2025_10_31_improper_medication_uses_final.csv
2025_10_31_susceptible_bbbm_tests_final.csv


In [17]:
bbbm_tests_susceptible = pd.read_csv(mslt_output_dir / '2025_10_31_susceptible_bbbm_tests_final.csv', index_col=0)
bbbm_tests_susceptible.tail()

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
6075,2100,United States of America,75_to_80,Female,Susceptible,BBBM Testing and Treatment,BBBM Tests,Number,1467749.0,1362363.0,1549757.0
6076,2100,United States of America,60_to_65,Male,Susceptible,BBBM Testing and Treatment,BBBM Tests,Number,2317253.0,2153712.0,2449146.0
6077,2100,United States of America,65_to_70,Male,Susceptible,BBBM Testing and Treatment,BBBM Tests,Number,1876975.0,1744506.0,1983808.0
6078,2100,United States of America,70_to_75,Male,Susceptible,BBBM Testing and Treatment,BBBM Tests,Number,800184.2,743710.6,845728.7
6079,2100,United States of America,75_to_80,Male,Susceptible,BBBM Testing and Treatment,BBBM Tests,Number,1368315.0,1271745.0,1446196.0


In [18]:
bbbm_tests_false_positive = pd.read_csv(mslt_output_dir / '2025_10_31_false_positive_bbbm_tests_final.csv', index_col=0)
bbbm_tests_false_positive.tail()

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
6075,2100,United States of America,75_to_80,Female,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,146774.863478,136236.289374,154975.73501
6076,2100,United States of America,60_to_65,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,231725.331175,215371.159718,244914.562719
6077,2100,United States of America,65_to_70,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,187697.518252,174450.639371,198380.795803
6078,2100,United States of America,70_to_75,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,80018.415676,74371.062048,84572.865579
6079,2100,United States of America,75_to_80,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,136831.490805,127174.516102,144619.60014


In [19]:
bbbm_tests_false_positive

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,60_to_65,Female,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,0.000000,0.000000,0.000000
1,2025,Brazil,65_to_70,Female,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,0.000000,0.000000,0.000000
2,2025,Brazil,70_to_75,Female,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,0.000000,0.000000,0.000000
3,2025,Brazil,75_to_80,Female,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,0.000000,0.000000,0.000000
4,2025,Brazil,60_to_65,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
6075,2100,United States of America,75_to_80,Female,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,146774.863478,136236.289374,154975.735010
6076,2100,United States of America,60_to_65,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,231725.331175,215371.159718,244914.562719
6077,2100,United States of America,65_to_70,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,187697.518252,174450.639371,198380.795803
6078,2100,United States of America,70_to_75,Male,Susceptible,BBBM Testing and Treatment,BBBM False Positive Tests,Number,80018.415676,74371.062048,84572.865579


In [20]:
# Do final processing before saving

def duplicate_in_testing_scenario(df):
    """Copy the testing results from the 'BBBM Testing and Treatment'
    scenario to the 'BBBM Testing Only' scenario.
    """
    return pd.concat([df.assign(Scenario='BBBM Testing Only'), df], ignore_index=True)

bbbm_tests_output = (
    bbbm_tests_final
    # .query("Metric=='Number'")
    .rename(columns={'Year ID': 'Year'})
    .assign(Measure="BBBM Tests")
    .pipe(lambda df: pd.concat(
        [df,
         duplicate_in_testing_scenario(bbbm_tests_susceptible),
         duplicate_in_testing_scenario(
             bbbm_tests_false_positive.assign(Measure='BBBM Positive Tests')),
         ],
        ignore_index=True)
    )
)
bbbm_tests_output

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,BBBM Tests,Number,0.000000,0.000000,0.000000
1,2025,Brazil,25_to_29,Female,Preclinical AD,BBBM Testing Only,BBBM Tests,Number,0.000000,0.000000,0.000000
2,2025,Brazil,25_to_29,Female,Preclinical AD,BBBM Testing and Treatment,BBBM Tests,Number,0.000000,0.000000,0.000000
3,2025,Brazil,25_to_29,Male,Preclinical AD,Reference,BBBM Tests,Number,0.000000,0.000000,0.000000
4,2025,Brazil,25_to_29,Male,Preclinical AD,BBBM Testing Only,BBBM Tests,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
91815,2100,United States of America,75_to_80,Female,Susceptible,BBBM Testing and Treatment,BBBM Positive Tests,Number,146774.863478,136236.289374,154975.735010
91816,2100,United States of America,60_to_65,Male,Susceptible,BBBM Testing and Treatment,BBBM Positive Tests,Number,231725.331175,215371.159718,244914.562719
91817,2100,United States of America,65_to_70,Male,Susceptible,BBBM Testing and Treatment,BBBM Positive Tests,Number,187697.518252,174450.639371,198380.795803
91818,2100,United States of America,70_to_75,Male,Susceptible,BBBM Testing and Treatment,BBBM Positive Tests,Number,80018.415676,74371.062048,84572.865579


In [21]:
# Save BBBM testing results
bbbm_tests_output.to_csv(output_dir / 'bbbm_tests.csv', index=False)

# CSF and PET Testing

In [22]:
# # Load model 8.3 for testing
# csf_pet_tests = load_sim_output(
#     'counts_baseline_tests_among_eligible',
#     )

# Load model 8.4 in batches
# 56s to load 2 batches using 1 location group
# 4.5 GB maximum memory to read parquet
# 40 MB final dataframe for 2 batches (7 draws)
# 3m 24s to load all batches using 1 location group
# 144 MB final dataframe for all batches (25 draws)
with Timer():
    csf_pet_tests = load_measure_from_batch_runs(
        'counts_baseline_tests_among_eligible', batch_results_dirs, locations, n_location_groups=1
    )
print(len(csf_pet_tests), 'rows')
csf_pet_tests.head()

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_05_15_36_29/results
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/swed

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,testing_state,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,baseline_testing,testing,2025,169,counts_baseline_tests_among_eligible,baseline,Female,not_tested,0.0,Japan
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,baseline_testing,testing,2025,169,counts_baseline_tests_among_eligible,baseline,Female,csf,0.0,Japan
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,baseline_testing,testing,2025,169,counts_baseline_tests_among_eligible,baseline,Female,pet,0.0,Japan
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,baseline_testing,testing,2025,169,counts_baseline_tests_among_eligible,baseline,Female,bbbm,0.0,Japan
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,baseline_testing,testing,2025,169,counts_baseline_tests_among_eligible,baseline,Male,not_tested,0.0,Japan


In [23]:
def dataframe_beutification_and_summarizing(df, measure_name):

    df = df.loc[df.testing_state.isin(['csf','pet'])]
    df['testing_state'] = df['testing_state'].cat.remove_unused_categories()
    df = df.drop(columns=['measure'])

    df = df.rename(columns={'testing_state':'Measure'})

    df['Measure'] = df['Measure'].cat.rename_categories({
        'csf': 'CSF Test Counts',
        'pet': 'PET Test Counts'
    })

    # Add code to make new measures called 'Averted CSF tests' and 'Averted PET tests' by subtracting from baseline
    df_baseline = df.loc[df['scenario'] == 'baseline']
    df_baseline = df_baseline.rename(columns={'value':'baseline_value'})

    df_averted = df.copy().rename(columns={'value':'all_value'})
    df_averted = df_averted.merge(
        df_baseline[['artifact_path', 'entity_type', 'age_group','event_year','location','sex','input_draw','Measure','baseline_value']],
        on=['artifact_path', 'entity_type', 'age_group','event_year','location','sex','input_draw','Measure'])
    df_averted['value'] = df_averted['baseline_value'] - df_averted['all_value']
    df_averted['Measure'] = df_averted['Measure'].cat.rename_categories({
        'CSF Test Counts': 'Averted CSF Test Counts',
        'PET Test Counts': 'Averted PET Test Counts'
    })
    df = pd.concat([df, df_averted], ignore_index=True)

    # Add in the scale factor multiplication
    df['event_year'] = df['event_year'].astype(int)
    df = df.merge(
        scale[['location','sex','age_group','ratio','event_year']],
        on=['location','sex','age_group','event_year'])
    df['value'] = df['value'] / df['ratio']    

    # Need to set this up for number and rate to be included 
    df['Metric'] = 'Number'

    # # FIXME: Calculate rate for real
    # df_rate = df.copy()
    # df_rate['value'] = df_rate['value'] / 100_000
    # df_rate['Metric'] = 'Rate per 100,000'
    # df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex':'Sex',
                            'scenario':'Scenario',
                            # 'sub_entity':'Disease Stage',
                            'testing_state':'Measure'})
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment' : 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = 'MCI due to AD'

    # Now we summarize the data
    df = df.groupby(['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric', 'input_draw'], observed=True).value.sum().reset_index()
    df = df.groupby(['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric'], observed=True).value.describe(percentiles=[0.025,0.975]).reset_index()

    df = df.rename(columns={'mean': 'Mean',
                            '2.5%': '95% UI Lower',
                            '97.5%': '95% UI Upper'})

    #Reorder the columns in df
    column_order = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage', 'Scenario', 'Measure', 'Metric', 'Mean', '95% UI Lower', '95% UI Upper']
    df = df[column_order]

    return df

In [None]:
# 11m 9s to run on 2 batches (7 draws)
# I don't know why this one takes so long...
# 15m 12s for all 9 batches of model 8.7 (25 draws)
with Timer():
    csf_pet_tests_final = dataframe_beutification_and_summarizing(csf_pet_tests, 'CSF and PET Test Counts')

Elapsed time: 0:15:11.560811


In [25]:
csf_pet_tests_final

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,Averted CSF Test Counts,Number,0.000000,0.000000,0.000000
1,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,Averted PET Test Counts,Number,0.000000,0.000000,0.000000
2,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,CSF Test Counts,Number,0.000000,0.000000,0.000000
3,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,PET Test Counts,Number,0.000000,0.000000,0.000000
4,2025,Brazil,25_to_29,Female,MCI due to AD,BBBM Testing Only,Averted CSF Test Counts,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
269995,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing Only,PET Test Counts,Number,772.094681,310.384963,1306.324251
269996,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,Averted CSF Test Counts,Number,15.954367,6.285054,32.392200
269997,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,Averted PET Test Counts,Number,22.722887,6.285054,39.644185
269998,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,CSF Test Counts,Number,529.394913,173.080711,946.625789


In [26]:
csf_pet_tests_final.loc[(csf_pet_tests_final['Year ID'] == 2050) & (csf_pet_tests_final['Age'] == '65_to_69') & (csf_pet_tests_final['Sex'] == 'Female') & (csf_pet_tests_final['Metric'] == 'Number')]

## Again, could not quickly locate an old V&V value to compare to. But appears reasonable?

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
90192,2050,Brazil,65_to_69,Female,MCI due to AD,Reference,Averted CSF Test Counts,Number,0.000000,0.000000,0.000000
90193,2050,Brazil,65_to_69,Female,MCI due to AD,Reference,Averted PET Test Counts,Number,0.000000,0.000000,0.000000
90194,2050,Brazil,65_to_69,Female,MCI due to AD,Reference,CSF Test Counts,Number,3218.025883,1713.682741,4724.280268
90195,2050,Brazil,65_to_69,Female,MCI due to AD,Reference,PET Test Counts,Number,3856.966067,2025.332339,6026.070674
90196,2050,Brazil,65_to_69,Female,MCI due to AD,BBBM Testing Only,Averted CSF Test Counts,Number,2346.031087,1104.815378,3405.132657
...,...,...,...,...,...,...,...,...,...,...,...
93439,2050,United States of America,65_to_69,Female,MCI due to AD,BBBM Testing Only,PET Test Counts,Number,1185.264446,436.086038,1967.705292
93440,2050,United States of America,65_to_69,Female,MCI due to AD,BBBM Testing and Treatment,Averted CSF Test Counts,Number,2012.087441,1004.158204,3013.925010
93441,2050,United States of America,65_to_69,Female,MCI due to AD,BBBM Testing and Treatment,Averted PET Test Counts,Number,2925.934258,1645.233688,4488.495315
93442,2050,United States of America,65_to_69,Female,MCI due to AD,BBBM Testing and Treatment,CSF Test Counts,Number,803.616643,279.926625,1471.669511


In [27]:
# Do final cleanup before saving
csf_pet_tests_output = (
    csf_pet_tests_final
    .rename(columns={'Year ID': 'Year'})
    .assign(Measure=lambda df:
            # Just say "Tests" instead of "Test Counts"
            df['Measure'].str.replace(' Counts' , 's')
    )
)
csf_pet_tests_output

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,Averted CSF Tests,Number,0.000000,0.000000,0.000000
1,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,Averted PET Tests,Number,0.000000,0.000000,0.000000
2,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,CSF Tests,Number,0.000000,0.000000,0.000000
3,2025,Brazil,25_to_29,Female,MCI due to AD,Reference,PET Tests,Number,0.000000,0.000000,0.000000
4,2025,Brazil,25_to_29,Female,MCI due to AD,BBBM Testing Only,Averted CSF Tests,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
269995,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing Only,PET Tests,Number,772.094681,310.384963,1306.324251
269996,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,Averted CSF Tests,Number,15.954367,6.285054,32.392200
269997,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,Averted PET Tests,Number,22.722887,6.285054,39.644185
269998,2099,United States of America,95_plus,Male,MCI due to AD,BBBM Testing and Treatment,CSF Tests,Number,529.394913,173.080711,946.625789


In [28]:
csf_pet_tests_output.Measure.unique()

array(['Averted CSF Tests', 'Averted PET Tests', 'CSF Tests', 'PET Tests'],
      dtype=object)

In [29]:
# Save CSF/PET testing results
csf_pet_tests_output.to_csv(output_dir / 'csf_pet_tests.csv', index=False)

# Medication Counts

Need initiation, discontinuation and completion

In [30]:
!ls /mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model7.4/united_states_of_america/2025_10_24_16_02_54/results/

counts_baseline_tests_among_eligible.parquet
counts_bbbm_tests.parquet
counts_new_simulants.parquet
counts_newly_eligible_for_bbbm_testing.parquet
deaths.parquet
person_time_alzheimers_disease_and_other_dementias.parquet
person_time_eligible_for_bbbm_testing.parquet
person_time_ever_eligible_for_bbbm_testing.parquet
person_time_treatment.parquet
transition_count_alzheimers_disease_and_other_dementias.parquet
transition_count_treatment.parquet
ylds.parquet
ylls.parquet


In [None]:
# # Load model 8.3 results for testing
# medication = load_sim_output(
#     'transition_count_treatment',
#     )

# Load model 8.4 in batches
# 1m 55s to load 2 batches using 1 location group
# 7.2 GB maximum memory to load parquet
# 80 MB final dataframe for 2 batches (7 draws)
# 7m 1s to load all 9 batches of model 8.7
with Timer():
    medication = load_measure_from_batch_runs(
        'transition_count_treatment', batch_results_dirs, locations, n_location_groups=1
    )
print(len(medication), 'rows')
medication.head()

/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/results/model8.7/model_spec/2025_11_05_15_36_29/results
{'United States of America': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', 'Brazil': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', 'China': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', 'Germany': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', 'Israel': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', 'Japan': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', 'Spain': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', 'Sweden': '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/swed

('artifact_path', 'in', ['/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/united_states_of_america.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/brazil.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/china.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/germany.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/israel.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/japan.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/spain.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/sweden.hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/artifacts/model8.3/taiwan_(province_of_china).hdf', '/mnt/team/simulation_science/pub/models/vivarium_csu_alzheimers/a

Unnamed: 0,age_group,artifact_path,entity,entity_type,event_year,input_draw,measure,scenario,sex,sub_entity,value,location
0,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,treatment,cause,2025,169,transition_count,baseline,Female,waiting_for_treatment_to_full_effect_long,0.0,Japan
1,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,treatment,cause,2025,169,transition_count,baseline,Female,waiting_for_treatment_to_full_effect_short,0.0,Japan
2,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,treatment,cause,2025,169,transition_count,baseline,Female,full_effect_long_to_waning_effect_long,0.0,Japan
3,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,treatment,cause,2025,169,transition_count,baseline,Female,full_effect_short_to_waning_effect_short,0.0,Japan
4,25_to_29,/mnt/team/simulation_science/pub/models/vivari...,treatment,cause,2025,169,transition_count,baseline,Female,waning_effect_long_to_no_effect_after_long,0.0,Japan


In [32]:
medication.sub_entity.unique()
# Medication initiation is "waiting_for_treatment_to_full_effect_long" and "waiting_for_treatment_to_full_effect_short"
# Medication discontinuation is "waiting_for_treatment_to_full_effect_short"
# Medication completion is "waiting_for_treatment_to_full_effect_long"

# Note: the discontinuation and completion perfectly sum to initiation,
# even though in reality some people may die while on treatment

['waiting_for_treatment_to_full_effect_long', 'waiting_for_treatment_to_full_effect_short', 'full_effect_long_to_waning_effect_long', 'full_effect_short_to_waning_effect_short', 'waning_effect_long_to_no_effect_after_long', 'waning_effect_short_to_no_effect_after_short', 'susceptible_to_treatment_to_waiting_for_treat..., 'susceptible_to_treatment_to_no_effect_never_t...]
Categories (8, object): ['full_effect_long_to_waning_effect_long', 'full_effect_short_to_waning_effect_short', 'susceptible_to_treatment_to_no_effect_never_t..., 'susceptible_to_treatment_to_waiting_for_treat..., 'waiting_for_treatment_to_full_effect_long', 'waiting_for_treatment_to_full_effect_short', 'waning_effect_long_to_no_effect_after_long', 'waning_effect_short_to_no_effect_after_short']

In [33]:
def dataframe_beutification_and_summarizing(df, measure_name):

    df = df.loc[df.sub_entity.isin(['waiting_for_treatment_to_full_effect_long','waiting_for_treatment_to_full_effect_short'])]
    df['sub_entity'] = df['sub_entity'].cat.remove_unused_categories()
    df = df.drop(columns=['measure'])

    df = df.rename(columns={'sub_entity':'Measure'})
 
    # # Syl's original code, using different definitions of completion and discontinuation:
    # # Combine the two initiation categories into one before renaming
    # df['Measure'] = df['Measure'].replace({
    #     'waiting_for_treatment_to_full_effect_long': 'Medication Initiation Counts',
    #     'waiting_for_treatment_to_full_effect_short': 'Medication Initiation Counts',
    #     'waning_effect_short_to_no_effect_after_short': 'Medication Discontinuation Counts',
    #     'waning_effect_long_to_no_effect_after_long': 'Medication Completion Counts'
    # })
    # df['Measure'] = df['Measure'].astype('category')

    # Define a Categorical dtype with all 3 categories to preserve
    # Categoricals upon concatenation
    medication_count_dtype = pd.CategoricalDtype(
        ['Medication Initiation Counts', 'Medication Completion Counts', 'Medication Discontinuation Counts'])
    df['Measure'] = df['Measure'].replace({
        'waiting_for_treatment_to_full_effect_long': 'Medication Completion Counts',
        'waiting_for_treatment_to_full_effect_short': 'Medication Discontinuation Counts',
    }).astype(medication_count_dtype)
    # initiation counts = completion counts + discontinuation counts, so
    # we copy the dataframe and rename these both 'initiation' so
    # they'll be added togeter when we do the groupby below
    df_initiation = df.replace(
        {'Measure': {'Medication Completion Counts': 'Medication Initiation Counts',
                     'Medication Discontinuation Counts': 'Medication Initiation Counts'}
        }).astype({'Measure': medication_count_dtype})
    df = pd.concat([df, df_initiation], ignore_index=True)

    # Add in the scale factor multiplication
    df['event_year'] = df['event_year'].astype(int)
    df = df.merge(
        scale[['location','sex','age_group','ratio','event_year']],
        on=['location','sex','age_group','event_year'])
    df['value'] = df['value'] / df['ratio']
    
    # Need to set this up for number and rate to be included 
    df['Metric'] = 'Number'

    # # FIXME: Calculate rate for real
    # df_rate = df.copy()
    # df_rate['value'] = df_rate['value'] / 100_000
    # df_rate['Metric'] = 'Rate per 100,000'
    # df = pd.concat([df, df_rate], ignore_index=True)

    # Renaming, dropping columns, and recategorising
    df = df.rename(columns={'event_year': 'Year ID',
                            'age_group': 'Age',
                            'location': 'Location',
                            'sex':'Sex',
                            'scenario':'Scenario'})
    df['Scenario'] = df['Scenario'].cat.rename_categories({
        'baseline': 'Reference',
        'bbbm_testing': 'BBBM Testing Only',
        'bbbm_testing_and_treatment' : 'BBBM Testing and Treatment'
    })
    df['Disease Stage'] = 'Preclinical AD'

    # Now we summarize the data
    df = df.groupby(['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric', 'input_draw'], observed=True).value.sum().reset_index()
    df = df.groupby(['Year ID', 'Location', 'Age', 'Sex' , 'Disease Stage' , 'Scenario', 'Measure', 'Metric'], observed=True).value.describe(percentiles=[0.025,0.975]).reset_index()

    df = df.rename(columns={'mean': 'Mean',
                            '2.5%': '95% UI Lower',
                            '97.5%': '95% UI Upper'})

    #Reorder the columns in df
    column_order = ['Year ID', 'Location', 'Age', 'Sex', 'Disease Stage', 'Scenario', 'Measure', 'Metric', 'Mean', '95% UI Lower', '95% UI Upper']
    df = df[column_order]

    return df

In [None]:
# 8m 28s for 2 batches (7 draws)
# 11m 20s for all 9 batches of 8.7 (25 draws)
with Timer():
    medication_final = dataframe_beutification_and_summarizing(medication, 'Medication Counts')

Elapsed time: 0:11:20.288171


In [35]:
medication_final.loc[(medication_final['Year ID'] == 2060) & (medication_final['Age'] == '80_to_84') & (medication_final['Sex'] == 'Female') & (medication_final['Metric'] == 'Number')]

## Haven't really validated these but look reasonable? I had to bump up the age group to get non-zero values for medication completion. We should confirm what qualifies as "completion" in the model. 

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
94698,2060,Brazil,80_to_84,Female,Preclinical AD,Reference,Medication Completion Counts,Number,0.000000,0.000000,0.000000
94699,2060,Brazil,80_to_84,Female,Preclinical AD,Reference,Medication Discontinuation Counts,Number,0.000000,0.000000,0.000000
94700,2060,Brazil,80_to_84,Female,Preclinical AD,Reference,Medication Initiation Counts,Number,0.000000,0.000000,0.000000
94701,2060,Brazil,80_to_84,Female,Preclinical AD,BBBM Testing Only,Medication Completion Counts,Number,0.000000,0.000000,0.000000
94702,2060,Brazil,80_to_84,Female,Preclinical AD,BBBM Testing Only,Medication Discontinuation Counts,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
97132,2060,United States of America,80_to_84,Female,Preclinical AD,BBBM Testing Only,Medication Discontinuation Counts,Number,0.000000,0.000000,0.000000
97133,2060,United States of America,80_to_84,Female,Preclinical AD,BBBM Testing Only,Medication Initiation Counts,Number,0.000000,0.000000,0.000000
97134,2060,United States of America,80_to_84,Female,Preclinical AD,BBBM Testing and Treatment,Medication Completion Counts,Number,2103.172374,916.650918,3730.421139
97135,2060,United States of America,80_to_84,Female,Preclinical AD,BBBM Testing and Treatment,Medication Discontinuation Counts,Number,227.712332,105.878983,394.991455


In [36]:
971.223404 + 109.213877	

1080.437281

In [37]:
medication_final.dtypes

Year ID             int64
Location           object
Age                object
Sex                object
Disease Stage      object
Scenario         category
Measure          category
Metric             object
Mean              float64
95% UI Lower      float64
95% UI Upper      float64
dtype: object

In [38]:
medication_final.loc[(medication_final['Year ID'] == 2060) & (medication_final['Age'] == '75_to_79') & (medication_final['Sex'] == 'Female') & (medication_final['Metric'] == 'Number')]

Unnamed: 0,Year ID,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
94680,2060,Brazil,75_to_79,Female,Preclinical AD,Reference,Medication Completion Counts,Number,0.000000,0.000000,0.000000
94681,2060,Brazil,75_to_79,Female,Preclinical AD,Reference,Medication Discontinuation Counts,Number,0.000000,0.000000,0.000000
94682,2060,Brazil,75_to_79,Female,Preclinical AD,Reference,Medication Initiation Counts,Number,0.000000,0.000000,0.000000
94683,2060,Brazil,75_to_79,Female,Preclinical AD,BBBM Testing Only,Medication Completion Counts,Number,0.000000,0.000000,0.000000
94684,2060,Brazil,75_to_79,Female,Preclinical AD,BBBM Testing Only,Medication Discontinuation Counts,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
97114,2060,United States of America,75_to_79,Female,Preclinical AD,BBBM Testing Only,Medication Discontinuation Counts,Number,0.000000,0.000000,0.000000
97115,2060,United States of America,75_to_79,Female,Preclinical AD,BBBM Testing Only,Medication Initiation Counts,Number,0.000000,0.000000,0.000000
97116,2060,United States of America,75_to_79,Female,Preclinical AD,BBBM Testing and Treatment,Medication Completion Counts,Number,19072.720831,13133.345028,27008.326401
97117,2060,United States of America,75_to_79,Female,Preclinical AD,BBBM Testing and Treatment,Medication Discontinuation Counts,Number,2136.918278,1473.119908,2911.430288


In [39]:
# APPEND MEDICATION USE AMONG SUSCEPTIBLE
!ls $mslt_output_dir

2025_10_31_false_positive_bbbm_tests_final.csv
2025_10_31_improper_medication_uses_final.csv
2025_10_31_susceptible_bbbm_tests_final.csv


In [40]:
medication_susceptible = pd.read_csv(mslt_output_dir / '2025_10_31_improper_medication_uses_final.csv', index_col=0)
medication_susceptible

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,60_to_65,Female,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,0.000000,0.000000,0.000000
1,2025,Brazil,65_to_70,Female,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,0.000000,0.000000,0.000000
2,2025,Brazil,70_to_75,Female,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,0.000000,0.000000,0.000000
3,2025,Brazil,75_to_80,Female,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,0.000000,0.000000,0.000000
4,2025,Brazil,60_to_65,Male,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
6075,2100,United States of America,75_to_80,Female,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,44032.459043,40870.886812,46492.720503
6076,2100,United States of America,60_to_65,Male,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,69517.599352,64611.347915,73474.368816
6077,2100,United States of America,65_to_70,Male,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,56309.255475,52335.191811,59514.238741
6078,2100,United States of America,70_to_75,Male,Susceptible,BBBM Testing and Treatment,Improper Medication Uses,Number,24005.524703,22311.318614,25371.859674


In [41]:
medication_susceptible.Measure.unique()

array(['Improper Medication Uses'], dtype=object)

In [42]:
# Do final cleanup before saving
medication_output = (
    medication_final
    .rename(columns={'Year ID': 'Year'})
    .assign(Measure=lambda df:
            # Remove the word "Counts" from measures
            df['Measure'].str.replace(' Counts' , '')
    )
    .pipe(lambda df: pd.concat(
        [df, medication_susceptible.assign(Measure='Medication Initiation')],
        ignore_index=True
        )
    )
)
medication_output

Unnamed: 0,Year,Location,Age,Sex,Disease Stage,Scenario,Measure,Metric,Mean,95% UI Lower,95% UI Upper
0,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,Medication Completion,Number,0.000000,0.000000,0.000000
1,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,Medication Discontinuation,Number,0.000000,0.000000,0.000000
2,2025,Brazil,25_to_29,Female,Preclinical AD,Reference,Medication Initiation,Number,0.000000,0.000000,0.000000
3,2025,Brazil,25_to_29,Female,Preclinical AD,BBBM Testing Only,Medication Completion,Number,0.000000,0.000000,0.000000
4,2025,Brazil,25_to_29,Female,Preclinical AD,BBBM Testing Only,Medication Discontinuation,Number,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
208575,2100,United States of America,75_to_80,Female,Susceptible,BBBM Testing and Treatment,Medication Initiation,Number,44032.459043,40870.886812,46492.720503
208576,2100,United States of America,60_to_65,Male,Susceptible,BBBM Testing and Treatment,Medication Initiation,Number,69517.599352,64611.347915,73474.368816
208577,2100,United States of America,65_to_70,Male,Susceptible,BBBM Testing and Treatment,Medication Initiation,Number,56309.255475,52335.191811,59514.238741
208578,2100,United States of America,70_to_75,Male,Susceptible,BBBM Testing and Treatment,Medication Initiation,Number,24005.524703,22311.318614,25371.859674


In [43]:
medication_output.Measure.unique()

array(['Medication Completion', 'Medication Discontinuation',
       'Medication Initiation'], dtype=object)

In [44]:
medication_output.Metric.unique()

array(['Number'], dtype=object)

In [45]:
# Save output
medication_output.to_csv(output_dir / 'medication.csv', index=False)

# Print time when notebook finishes running

It took about 45 minutes to run on all 9 batches of model 8.7.

In [46]:
!date

Thu Nov 13 01:04:59 PST 2025
