In [1]:
import pandas as pd, numpy as np
import vivarium_output_loader as vol
import lsff_output_processing as lop
import lsff_summarizer
from lsff_summarizer import LSFFOutputSummarizer

!whoami
!date

ndbs
Thu Apr 23 14:55:52 PDT 2020


In [2]:
# Reload imported modules automatically before each cell is executed, so edits
# to the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Check test runs that added state person time for VAD and NTDs

### Main findings

1. It looks like we have all the expected state person time columns, including for VAD and NTDs. Woo hoo!

2. The `output.hdf` files include the year 2025, but the count space data only has 2020 - 2024.

3. It looks like the vitamin A scaleup decreased vitamin A deficiency, but the folic acid scaleup had no effect on neural tube defects.

4. There is a consistent problem (at least in the count space data), where the prevalences of the different states of a disease add up to more than 1. More details:

    a. For example, in draw 357 in the baseline scenario, the prevalence of `susceptible_to_vitamin_a_deficiency` in 2020 is 0.736274, and the prevalence of `vitamin_a_deficiency` in 2020 is 0.290475. These two prevalences should add up to 1, but 0.736274 + 0.290475 ≈ 1.0267.
    
    b. The problem appears in both test runs and for several draws.
    
    c. It definitely happens for vitamin A deficiency, diarrheal diseases, and neural tube defects. I didn't check other causes.
   
    d. There was at least one draw where the single state `susceptible_to_neural_tube_defects` had prevalence greater than 1. Namely, in draw 357 in the baseline scenario, the prevalence of this state in 2020 is 1.026232. This should be impossible.
   
    e. The next thing to do would be to check the `output.hdf` files to see if the corresponding state person time columns add up to the total person time columns. I have not done this yet.
    
    f. I checked the sum of prevalences for two different diseases and got 1.026744 and 1.026748. If we are always getting (nearly) the same number, that could be another clue, but I haven't checked whether the sum is always close to this value.
   
    g. This seems like it is probably a bug, though it's possible there could be something unintuitive happening either with the aggregation over strata or the way we compute prevalence from person time. I have not yet thoroughly considered this possibility.


## 1. Define directories

In [22]:
# Root folder holding the bug-fix test runs; the trailing slash is relied on
# when the run sub-directories are appended below.
base_directory = '/share/costeffectiveness/results/vivarium_conic_lsff/00_bugfix/'

# Each run lives at <base_directory><model name>/<model run>.
vad_path, ntd_path = (
    f'{base_directory}{run_subdirectory}'
    for run_subdirectory in (
        'vad_prevalence_in_outputs/india_4_22',
        'ntd_state_persontime_in_results/india_4_23',
    )
)

# Alternative kept for reference: the same two runs as a {model: model run} mapping.
# models_modelruns = {
#     'vad_prevalence_in_outputs': 'india_4_22',
#     'ntd_state_persontime_in_results': 'india_4_23'
# }

In [17]:
# vad_directory = ('/share/costeffectiveness/results/vivarium_conic_lsff/00_bugfix/'
#                 + 'vad_prevalence_in_outputs/india_4_22'
#                  )
# ntd_directory = ('/share/costeffectiveness/results/vivarium_conic_lsff/00_bugfix/'
#                  'ntd_state_persontime_in_results/india_4_23'
#                 )
#     

# Who knew? Adjacent string literals are concatenated automatically,
# so no + is needed between them.
x = 'a' 'b'
x

'ab'

In [18]:
# But the following doesn't work: implicit concatenation applies only to
# adjacent string literals, not to variables.
# y = x 'c'

## 2. Load `output.hdf` files

In [23]:
# Read the raw simulation output of the VAD test run and preview the first rows.
vad_raw_output = pd.read_hdf(vad_path + '/output.hdf')
vad_raw_output.head()
# Alternative loader, not used here:
# raw_output = vol.load_by_location_and_rundate(base_directory, models_modelruns)
# raw_output.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_population_untracked,total_population_tracked,total_population,diarrheal_diseases_prevalent_cases_at_sim_end,susceptible_to_diarrheal_diseases_event_count,diarrheal_diseases_event_count,measles_prevalent_cases_at_sim_end,susceptible_to_measles_event_count,measles_event_count,recovered_from_measles_event_count,...,ylls_due_to_diarrheal_diseases_in_2023_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_lower_respiratory_infections_in_2023_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_other_causes_in_2023_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_diarrheal_diseases_in_2024_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_diarrheal_diseases_in_2024_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_diarrheal_diseases_in_2024_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_measles_in_2024_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_lower_respiratory_infections_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,years_of_life_lost,fortification_intervention.scenario
input_draw_number,random_seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
946.0,35.0,10510.0,10048.0,20558.0,2330.0,9990.0,12967.0,786.0,0.0,882.0,664.0,...,87.402629,87.30598,87.209331,87.358697,174.251724,86.381678,86.113577,169.198603,97257.562387,baseline
357.0,35.0,10446.0,9847.0,20293.0,2192.0,9713.0,12624.0,345.0,0.0,396.0,278.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105026.21539,baseline
946.0,21.0,10478.0,9771.0,20249.0,2190.0,9727.0,12555.0,733.0,0.0,849.0,625.0,...,87.428987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99802.431329,baseline
357.0,45.0,10526.0,9882.0,20408.0,2209.0,9830.0,12723.0,312.0,0.0,357.0,252.0,...,0.0,174.559243,0.0,87.314766,87.332339,0.0,0.0,85.180188,102628.097364,folic_acid_fortification_scale_up
357.0,69.0,10492.0,9907.0,20399.0,2078.0,9248.0,11947.0,272.0,0.0,312.0,241.0,...,0.0,174.506525,0.0,87.279621,0.0,86.232733,0.0,170.737703,91328.468433,vitamin_a_fortification_scale_up


In [24]:
# Read the raw simulation output of the NTD test run and preview the first rows.
ntd_raw_output = pd.read_hdf(ntd_path + '/output.hdf')
ntd_raw_output.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_population_untracked,total_population_tracked,total_population,diarrheal_diseases_prevalent_cases_at_sim_end,susceptible_to_diarrheal_diseases_event_count,diarrheal_diseases_event_count,measles_prevalent_cases_at_sim_end,susceptible_to_measles_event_count,measles_event_count,recovered_from_measles_event_count,...,ylls_due_to_lower_respiratory_infections_in_2022_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_diarrheal_diseases_in_2023_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_lower_respiratory_infections_in_2023_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_lower_respiratory_infections_in_2023_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_diarrheal_diseases_in_2024_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_lower_respiratory_infections_in_2024_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_measles_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,ylls_due_to_lower_respiratory_infections_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,years_of_life_lost,fortification_intervention.scenario
input_draw_number,random_seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
357.0,44.0,10430.0,9923.0,20353.0,2214.0,9617.0,12481.0,332.0,0.0,378.0,271.0,...,87.323553,87.323553,87.182973,87.270835,84.594337,87.314766,86.987389,86.03414,100283.891415,baseline
946.0,35.0,10510.0,10048.0,20558.0,2330.0,9990.0,12967.0,786.0,0.0,882.0,664.0,...,0.0,0.0,0.0,87.30598,0.0,87.016034,0.0,255.560422,97257.562387,folic_acid_fortification_scale_up
357.0,69.0,10471.0,9928.0,20399.0,2260.0,9872.0,12780.0,329.0,0.0,375.0,293.0,...,174.585601,0.0,0.0,0.0,0.0,174.453808,0.0,85.835546,93962.083616,folic_acid_fortification_scale_up
602.0,21.0,10440.0,9901.0,20341.0,2094.0,9463.0,12305.0,1520.0,0.0,1713.0,1284.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105893.227429,baseline
602.0,69.0,10511.0,10000.0,20511.0,2239.0,9642.0,12558.0,1567.0,0.0,1755.0,1324.0,...,87.332339,0.0,0.0,0.0,84.435462,0.0,0.0,0.0,102365.399697,baseline


## 3. Load count space data

In [26]:
# Load the transformed ("count space") tables of the VAD run and list the
# table names that are available.
vad_data = vol.load_transformed_count_data(vad_path + '/count_data')
vad_data.keys()

dict_keys(['gestational_age', 'transition_count', 'deaths', 'state_person_time', 'births_with_ntd', 'population', 'person_time', 'ylls', 'ylds', 'births', 'birth_weight'])

In [27]:
# Load the transformed ("count space") tables of the NTD run and list the
# table names that are available.
ntd_data = vol.load_transformed_count_data(ntd_path + '/count_data')
ntd_data.keys()

dict_keys(['gestational_age', 'transition_count', 'deaths', 'state_person_time', 'births_with_ntd', 'population', 'person_time', 'ylls', 'ylds', 'births', 'birth_weight'])

## 4. Print column reports for raw output

In [29]:
# Wrap the raw VAD output in the summarizer and print how many of its columns
# fall into each column category.
vad_output = LSFFOutputSummarizer(vad_raw_output)
vad_output.print_column_report()

Number of data columns in output: 14501
Total number of columns captured in categories: 14501

Number of columns in each category:
 {'input_draw': 1, 'random_seed': 1, 'location': 0, 'intervention': 1, 'run_time': 0, 'diseases_at_end': 4, 'transition_count': 3465, 'population': 5, 'person_time': 432, 'state_person_time': 3456, 'treated_days': 0, 'mortality': 2160, 'total_daly': 2, 'yld': 2592, 'yll': 2160, 'categorical_risk': 0, 'birth_prevalence': 108, 'live_births': 108, 'proportion': 2, 'distribution_mean': 2, 'distribution_std_dev': 2, 'prevalence': 0} 

Missing (0 data column(s) not captured in a category):
 set()

Repeated (0 data column(s) appearing in more than one category):
 {}

Empty categories (5 categories with no matching data columns):
 ['location', 'run_time', 'treated_days', 'categorical_risk', 'prevalence']


In [30]:
# Wrap the raw NTD output in the summarizer and print how many of its columns
# fall into each column category.
ntd_output = LSFFOutputSummarizer(ntd_raw_output)
ntd_output.print_column_report()

Number of data columns in output: 15365
Total number of columns captured in categories: 15365

Number of columns in each category:
 {'input_draw': 1, 'random_seed': 1, 'location': 0, 'intervention': 1, 'run_time': 0, 'diseases_at_end': 4, 'transition_count': 3465, 'population': 5, 'person_time': 432, 'state_person_time': 4320, 'treated_days': 0, 'mortality': 2160, 'total_daly': 2, 'yld': 2592, 'yll': 2160, 'categorical_risk': 0, 'birth_prevalence': 108, 'live_births': 108, 'proportion': 2, 'distribution_mean': 2, 'distribution_std_dev': 2, 'prevalence': 0} 

Missing (0 data column(s) not captured in a category):
 set()

Repeated (0 data column(s) appearing in more than one category):
 {}

Empty categories (5 categories with no matching data columns):
 ['location', 'run_time', 'treated_days', 'categorical_risk', 'prevalence']


## 5. See what the scenario columns look like

In [36]:
# Preview the scenario recorded for each (input draw, random seed) row of the VAD run.
vad_output.subdata('intervention').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,fortification_intervention.scenario
input_draw_number,random_seed,Unnamed: 2_level_1
946.0,35.0,baseline
357.0,35.0,baseline
946.0,21.0,baseline
357.0,45.0,folic_acid_fortification_scale_up
357.0,69.0,vitamin_a_fortification_scale_up
946.0,69.0,vitamin_a_fortification_scale_up
357.0,12.0,baseline
602.0,35.0,folic_acid_fortification_scale_up
946.0,12.0,vitamin_a_fortification_scale_up
946.0,22.0,folic_acid_fortification_scale_up


In [37]:
# Preview the scenario recorded for each (input draw, random seed) row of the NTD run.
ntd_output.subdata('intervention').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,fortification_intervention.scenario
input_draw_number,random_seed,Unnamed: 2_level_1
357.0,44.0,baseline
946.0,35.0,folic_acid_fortification_scale_up
357.0,69.0,folic_acid_fortification_scale_up
602.0,21.0,baseline
602.0,69.0,baseline
946.0,22.0,folic_acid_fortification_scale_up
946.0,22.0,vitamin_a_fortification_scale_up
602.0,22.0,folic_acid_fortification_scale_up
357.0,45.0,baseline
946.0,45.0,baseline


## 6. Check state person time columns for both runs

State person time appears to be the only column category whose column count differs between the two runs.

In [31]:
# Numbers are copied by hand from the two column reports above
# (total columns minus 'state_person_time' columns, NTD run vs. VAD run).
# The total number of non-state-person-time columns is equal for the two models.
(15365 - 4320) == (14501 - 3456)

True

In [45]:
# 'state_person_time' column counts: 4320 (NTD run) - 3456 (VAD run).
# There are 864 more columns in the NTD output (all in state person time).
4320 - 3456

864

In [63]:
# Factor the difference: 864 = 2^5 * 3^3, so dividing by
# 4 * 4 * 2 * 3 * 3 * 3 (= 2^5 * 3^3) leaves exactly 1.
864/4/4/2/3/3/3

1.0

In [67]:
# Confirm the factorization: 2^5 * 3^3 should give back 864.
2**5 * 3**3

864

In [80]:
# Expected number of new NTD columns, assuming 5 years:
# 2 NTD states x 5 years x 2 sexes x 4 age groups x 3 FA coverage groups x 3 VA coverage groups
2*5*2*4*3*3

720

So there are 144 extra columns...

In [76]:
# Columns still unaccounted for: 864 observed - 720 expected with 5 years.
864-720

144

### Try to find the 144 columns unaccounted for in `ntd_output`

Ahh, there are 6 years, not 5. But the count space data only has 5 years (2020 - 2024) -- see below.

In [78]:
# Keep only the state-person-time columns whose names contain
# 'neural_tube_defects', and preview the first rows.
(
    ntd_output
    .subdata('state_person_time')
    .filter(like='neural_tube_defects')
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,susceptible_to_neural_tube_defects_person_time_in_2020_among_male_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_female_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_male_in_age_group_late_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_female_in_age_group_late_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_male_in_age_group_post_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_female_in_age_group_post_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_male_in_age_group_1_to_4_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_neural_tube_defects_person_time_in_2020_among_female_in_age_group_1_to_4_folic_acid_unknown_vitamin_a_uncovered,neural_tube_defects_person_time_in_2020_among_male_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,neural_tube_defects_person_time_in_2020_among_female_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,...,susceptible_to_neural_tube_defects_person_time_in_2025_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,susceptible_to_neural_tube_defects_person_time_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_male_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_female_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_male_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered,neural_tube
_defects_person_time_in_2025_among_female_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,neural_tube_defects_person_time_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered
input_draw_number,random_seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
357.0,44.0,4.982888,5.481177,14.450376,17.938398,499.78371,466.896646,3254.822724,2934.422998,0.0,0.0,...,26.907598,17.44011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
946.0,35.0,2.989733,3.986311,24.416153,19.931554,514.234086,468.889802,3211.471595,2886.587269,0.0,0.0,...,45.842574,40.859685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
357.0,69.0,4.982888,4.4846,23.419576,22.422998,488.323066,476.862423,3240.870637,2880.607803,0.0,0.0,...,38.86653,31.890486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
602.0,21.0,5.481177,4.4846,19.433265,19.433265,538.65024,485.333333,3217.451061,2846.724162,0.0,0.0,...,24.914442,20.429843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
602.0,69.0,6.976044,3.986311,24.416153,18.934976,530.677618,457.429158,3190.543463,2917.481177,0.0,0.0,...,29.897331,24.416153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Aha! The raw output includes 2025, so there are 6 years, not 5 like in the count space data.

In [81]:
# Same count with 6 years (2020 - 2025) instead of 5; this matches the 864 extra columns:
# 2 NTD states x 6 years x 2 sexes x 4 age groups x 3 FA coverage groups x 3 VA coverage groups
2*6*2*4*3*3

864

In [33]:
# Names of the last 10 state-person-time columns in the VAD output.
vad_output.columns('state_person_time')[-10:]

['susceptible_to_vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered',
 'susceptible_to_vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered',
 'vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'vitamin_a_deficiency_person_time_in_2025_among_fem

In [34]:
# Names of the last 10 state-person-time columns in the NTD output.
ntd_output.columns('state_person_time')[-10:]

['susceptible_to_neural_tube_defects_person_time_in_2025_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered',
 'susceptible_to_neural_tube_defects_person_time_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered',
 'neural_tube_defects_person_time_in_2025_among_male_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'neural_tube_defects_person_time_in_2025_among_female_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'neural_tube_defects_person_time_in_2025_among_male_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'neural_tube_defects_person_time_in_2025_among_female_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'neural_tube_defects_person_time_in_2025_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered',
 'neural_tube_defects_person_time_in_2025_among_female_in_a

In [42]:
# Preview all state-person-time columns of the raw VAD output.
vad_output.subdata('state_person_time').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,susceptible_to_diarrheal_diseases_person_time_in_2020_among_male_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_female_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_male_in_age_group_late_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_female_in_age_group_late_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_male_in_age_group_post_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_female_in_age_group_post_neonatal_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_male_in_age_group_1_to_4_folic_acid_unknown_vitamin_a_uncovered,susceptible_to_diarrheal_diseases_person_time_in_2020_among_female_in_age_group_1_to_4_folic_acid_unknown_vitamin_a_uncovered,diarrheal_diseases_person_time_in_2020_among_male_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,diarrheal_diseases_person_time_in_2020_among_female_in_age_group_early_neonatal_folic_acid_unknown_vitamin_a_uncovered,...,susceptible_to_vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,susceptible_to_vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_early_neonatal_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_defici
ency_person_time_in_2025_among_female_in_age_group_late_neonatal_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_post_neonatal_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_male_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered,vitamin_a_deficiency_person_time_in_2025_among_female_in_age_group_1_to_4_folic_acid_covered_vitamin_a_effectively_covered
input_draw_number,random_seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
946.0,35.0,2.989733,3.488022,24.416153,19.931554,423.047228,385.675565,2617.013005,2325.514031,0.0,0.498289,...,25.91102,19.433265,0.0,0.0,0.0,0.0,1.494867,1.993155,3.488022,5.481177
357.0,35.0,7.474333,11.958932,20.429843,21.924709,424.043806,386.173854,2549.245722,2417.199179,0.0,0.0,...,18.934976,27.405886,0.0,0.0,0.0,0.0,0.498289,0.996578,2.491444,1.494867
946.0,21.0,9.467488,5.481177,23.917864,14.948665,427.033539,376.706366,2622.992471,2268.210815,0.0,0.0,...,26.907598,21.42642,0.0,0.0,0.0,0.0,1.993155,3.488022,4.4846,4.4846
357.0,45.0,7.474333,9.965777,22.422998,17.938398,432.016427,359.266256,2592.596851,2368.366872,0.0,0.0,...,34.880219,34.880219,0.0,0.0,0.0,0.0,5.481177,4.4846,4.4846,2.491444
357.0,69.0,0.996578,0.498289,3.986311,1.993155,53.815195,55.80835,389.661875,343.819302,0.0,0.0,...,86.702259,79.227926,0.0,0.0,0.0,0.0,4.4846,6.477755,5.979466,5.979466


## 7. Load state person time from count data

Ok, so it looks like the difference is that state person time for neural tube defects has been added in the second model, which is what we expect.

In [40]:
# State person time table of the VAD run from the count space data:
# one row per combination of the identifier columns, with the amount in `value`.
vad_spt = vad_data['state_person_time']
vad_spt.head()

Unnamed: 0,year,age_group,sex,cause,folic_acid_fortification_group,vitamin_a_fortification_group,measure,input_draw,scenario,value
0,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,baseline,0.0
1,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,folic_acid_fortification_scale_up,0.0
2,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,vitamin_a_fortification_scale_up,0.0
3,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,602,baseline,0.0
4,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,602,folic_acid_fortification_scale_up,0.0


In [64]:
# (number of rows, number of columns) of the VAD state person time table.
vad_spt.shape

(25920, 10)

In [83]:
# Number of distinct values in each column, in column order.
# Note that the count space data has only 5 years, while the raw output has 6.
print(vad_spt.nunique().tolist())

[5, 4, 2, 8, 3, 3, 1, 3, 3, 4025]


In [86]:
# Check that the product of the distinct-value counts of the identifier columns
# (every column except the last one, `value`) equals the number of rows, which is
# what we expect if every combination of identifiers appears exactly once.
prod = int(vad_spt.iloc[:, :-1].nunique().prod())
# Fail loudly instead of relying on a visual comparison with vad_spt.shape.
assert prod == len(vad_spt), f'expected {prod} rows, found {len(vad_spt)}'
prod

25920

In [73]:
# Same product written out by hand, in column order:
# 5 years x 4 age groups x 2 sexes x 8 cause states x 3 FA groups x 3 VA groups
# x 1 measure x 3 input draws x 3 scenarios
5*4*2*8*3*3*1*3*3

25920

In [41]:
# State person time table of the NTD run from the count space data
# (same layout as the VAD table).
ntd_spt = ntd_data['state_person_time']
ntd_spt.head()

Unnamed: 0,year,age_group,sex,cause,folic_acid_fortification_group,vitamin_a_fortification_group,measure,input_draw,scenario,value
0,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,baseline,0.0
1,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,folic_acid_fortification_scale_up,0.0
2,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,vitamin_a_fortification_scale_up,0.0
3,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,602,baseline,0.0
4,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,602,folic_acid_fortification_scale_up,0.0


In [65]:
# (number of rows, number of columns) of the NTD state person time table.
ntd_spt.shape

(32400, 10)

In [72]:
# Expected row count for the NTD table, in column order:
# 5 years x 4 age groups x 2 sexes x 10 cause states x 3 FA groups x 3 VA groups
# x 1 measure x 3 input draws x 3 scenarios
5*4*2*10*3*3*1*3*3

32400

In [75]:
# Extra strata per (input draw, scenario) contributed by the 2 added NTD states
# over 5 years: 10 vs. 8 cause states, all other factors equal.
5*4*2*10*3*3 - 5*4*2*8*3*3

720

In [43]:
# Cause states present in the VAD run's state person time table.
vad_spt['cause'].unique()

array(['diarrheal_diseases', 'lower_respiratory_infections', 'measles',
       'susceptible_to_diarrheal_diseases',
       'susceptible_to_lower_respiratory_infections',
       'susceptible_to_measles', 'susceptible_to_vitamin_a_deficiency',
       'vitamin_a_deficiency'], dtype=object)

In [44]:
# Cause states present in the NTD run's state person time table.
ntd_spt['cause'].unique()

array(['diarrheal_diseases', 'lower_respiratory_infections', 'measles',
       'neural_tube_defects', 'susceptible_to_diarrheal_diseases',
       'susceptible_to_lower_respiratory_infections',
       'susceptible_to_measles', 'susceptible_to_neural_tube_defects',
       'susceptible_to_vitamin_a_deficiency', 'vitamin_a_deficiency'],
      dtype=object)

In [50]:
# Vitamin A fortification groups used as strata.
ntd_spt['vitamin_a_fortification_group'].unique()

array(['covered', 'effectively_covered', 'uncovered'], dtype=object)

In [51]:
# Folic acid fortification groups used as strata.
ntd_spt['folic_acid_fortification_group'].unique()

array(['covered', 'uncovered', 'unknown'], dtype=object)

In [52]:
# Age groups used as strata.
ntd_spt['age_group'].unique()

array(['1_to_4', 'early_neonatal', 'late_neonatal', 'post_neonatal'],
      dtype=object)

In [54]:
# Years present in the count space data (stored as strings).
ntd_spt['year'].unique()

array(['2020', '2021', '2022', '2023', '2024'], dtype=object)

In [57]:
# Cause states that have rows in the "unknown" folic acid fortification group.
ntd_spt.loc[ntd_spt['folic_acid_fortification_group'] == 'unknown', 'cause'].unique()

array(['diarrheal_diseases', 'lower_respiratory_infections', 'measles',
       'neural_tube_defects', 'susceptible_to_diarrheal_diseases',
       'susceptible_to_lower_respiratory_infections',
       'susceptible_to_measles', 'susceptible_to_neural_tube_defects',
       'susceptible_to_vitamin_a_deficiency', 'vitamin_a_deficiency'],
      dtype=object)

## 8. Compute VAD prevalence

There seems to be a problem where the prevalences of all states for a given disease add up to more than 1.

In [89]:
# Total person time table of the VAD run; it has the same identifier columns as
# the state person time table except for `cause`. Used as the prevalence denominator.
vad_pt = vad_data['person_time']
vad_pt.head()

Unnamed: 0,year,age_group,sex,folic_acid_fortification_group,vitamin_a_fortification_group,measure,input_draw,scenario,value
0,2020,1_to_4,female,covered,covered,person_time,357,baseline,0.0
1,2020,1_to_4,female,covered,covered,person_time,357,folic_acid_fortification_scale_up,0.0
2,2020,1_to_4,female,covered,covered,person_time,357,vitamin_a_fortification_scale_up,0.0
3,2020,1_to_4,female,covered,covered,person_time,602,baseline,0.0
4,2020,1_to_4,female,covered,covered,person_time,602,folic_acid_fortification_scale_up,0.0


### Divide state person time by person time to get prevalence

Stratify only by year in both numerator and denominator (i.e. aggregate over all other stratification variables), and broadcast over cause states to compute all prevalences at once. Use a multiplier of 1 to just get ratios in [0,1].

In [95]:
# Prevalence = state person time / total person time for the VAD run.
# Positional arguments, following the description in the text above: numerator
# table, denominator table, numerator strata (['year']), denominator strata
# (['year']), multiplier (1, i.e. plain ratios), and the columns to broadcast
# over (['cause']).
# NOTE(review): the argument meanings are taken from the markdown description,
# not from the lsff_output_processing source -- confirm against rate_or_ratio.
prevalence = lop.rate_or_ratio(vad_spt, vad_pt, ['year'], ['year'], 1, ['cause'])
prevalence.head()

Unnamed: 0,year,input_draw,scenario,cause,value
0,2020,357,baseline,diarrheal_diseases,0.174887
1,2020,357,baseline,lower_respiratory_infections,0.029063
2,2020,357,baseline,measles,0.004404
3,2020,357,baseline,susceptible_to_diarrheal_diseases,0.851862
4,2020,357,baseline,susceptible_to_lower_respiratory_infections,0.997686


In [107]:
# Years for which a prevalence was computed.
prevalence['year'].unique()

array(['2020', '2021', '2022', '2023', '2024'], dtype=object)

### The prevalence of VAD in the "susceptible" and "with condition" groups adds to more than 1

I think that shouldn't happen, unless there's something unintuitive happening either (1) with the aggregation over strata or (2) with the fact that we're using ratios of person time to get average point prevalences. I don't think either of those things should change the fact that the prevalences should add up to 1, but I'd have to think about it more carefully to be sure. 

In [103]:
# Preview the rows for both VAD states: the pattern matches 'vitamin_a_deficiency'
# as well as 'susceptible_to_vitamin_a_deficiency'.
prevalence.loc[prevalence['cause'].str.contains('vitamin_a_deficiency')].head()

Unnamed: 0,year,input_draw,scenario,cause,value
6,2020,357,baseline,susceptible_to_vitamin_a_deficiency,0.736274
7,2020,357,baseline,vitamin_a_deficiency,0.290475
14,2020,357,folic_acid_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.736274
15,2020,357,folic_acid_fortification_scale_up,vitamin_a_deficiency,0.290475
22,2020,357,vitamin_a_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.863951


In [132]:
# Keep the rows for both VAD states: the pattern matches 'vitamin_a_deficiency'
# as well as 'susceptible_to_vitamin_a_deficiency'.
# NOTE(review): the row labels in the saved output (8, 9, 18, 19, ...) differ from
# those produced by the same filter in In [103] (6, 7, 14, 15, ...), so `prevalence`
# was presumably rebound to a table with 10 cause states (the NTD run) before this
# cell was last executed. Restart and run all to confirm which run this reflects.
vad_prevalence = prevalence[prevalence.cause.str.contains('vitamin_a_deficiency')]
vad_prevalence.head()

Unnamed: 0,year,input_draw,scenario,cause,value
8,2020,357,baseline,susceptible_to_vitamin_a_deficiency,0.736274
9,2020,357,baseline,vitamin_a_deficiency,0.290475
18,2020,357,folic_acid_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.736274
19,2020,357,folic_acid_fortification_scale_up,vitamin_a_deficiency,0.290475
28,2020,357,vitamin_a_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.863951


### Since there are only 3 draws, we can unstack them to see all the values

In [135]:
# Reshape so that each input draw becomes its own column, with rows indexed by
# (year, scenario, cause).
(
    vad_prevalence
    .set_index(['year', 'input_draw', 'scenario', 'cause'])
    .unstack(level='input_draw')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,input_draw,357,602,946
year,scenario,cause,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020,baseline,susceptible_to_vitamin_a_deficiency,0.736274,0.701943,0.716862
2020,baseline,vitamin_a_deficiency,0.290475,0.325787,0.309182
2020,folic_acid_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.736274,0.701943,0.716862
2020,folic_acid_fortification_scale_up,vitamin_a_deficiency,0.290475,0.325787,0.309182
2020,vitamin_a_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.863951,0.812798,0.76418
2020,vitamin_a_fortification_scale_up,vitamin_a_deficiency,0.16249,0.214571,0.261765
2021,baseline,susceptible_to_vitamin_a_deficiency,0.700229,0.662786,0.68244
2021,baseline,vitamin_a_deficiency,0.291927,0.329272,0.310616
2021,folic_acid_fortification_scale_up,susceptible_to_vitamin_a_deficiency,0.700229,0.662786,0.68244
2021,folic_acid_fortification_scale_up,vitamin_a_deficiency,0.291927,0.329272,0.310616


In [112]:
# Years covered by the VAD prevalence table.
vad_prevalence['year'].unique()

array(['2020', '2021', '2022', '2023', '2024'], dtype=object)

### Using describe, the min, median, and max will be the values of the 3 draws

It looks like the VA scaleup decreased prevalence of VAD, which is good.

In [133]:
# Summarize across draws for each (year, scenario, cause). With percentiles=[]
# only the median (50%) is reported between min and max.
(
    vad_prevalence
    .groupby(['year', 'scenario', 'cause'])['value']
    .describe(percentiles=[])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,50%,max
year,scenario,cause,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,baseline,susceptible_to_vitamin_a_deficiency,3.0,0.71836,0.017214,0.701943,0.716862,0.736274
2020,baseline,vitamin_a_deficiency,3.0,0.308481,0.017667,0.290475,0.309182,0.325787
2020,folic_acid_fortification_scale_up,susceptible_to_vitamin_a_deficiency,3.0,0.71836,0.017214,0.701943,0.716862,0.736274
2020,folic_acid_fortification_scale_up,vitamin_a_deficiency,3.0,0.308481,0.017667,0.290475,0.309182,0.325787
2020,vitamin_a_fortification_scale_up,susceptible_to_vitamin_a_deficiency,3.0,0.813643,0.049891,0.76418,0.812798,0.863951
2020,vitamin_a_fortification_scale_up,vitamin_a_deficiency,3.0,0.212942,0.049657,0.16249,0.214571,0.261765
2021,baseline,susceptible_to_vitamin_a_deficiency,3.0,0.681818,0.018729,0.662786,0.68244,0.700229
2021,baseline,vitamin_a_deficiency,3.0,0.310605,0.018673,0.291927,0.310616,0.329272
2021,folic_acid_fortification_scale_up,susceptible_to_vitamin_a_deficiency,3.0,0.681818,0.018729,0.662786,0.68244,0.700229
2021,folic_acid_fortification_scale_up,vitamin_a_deficiency,3.0,0.310605,0.018673,0.291927,0.310616,0.329272


In [121]:
# List the distinct cause states present in the VAD prevalence table.
# NOTE(review): the saved output for this cell lists only 'vitamin_a_deficiency', while the
# tables above also contain 'susceptible_to_vitamin_a_deficiency' rows -- the output looks
# stale (execution counts are out of order); re-run the notebook top to bottom to confirm.
vad_prevalence['cause'].unique()

array(['vitamin_a_deficiency'], dtype=object)

## 9. Compute prevalence of neural tube defects

1. As in the VAD output, there seems to be a problem where the prevalences of all states for a given disease add up to more than 1.
2. Also, there is at least one state whose prevalence all by itself is greater than 1.
3. It appears that FA fortification scaleup didn't have an effect - the numbers are the same in the baseline and intervention scenarios.

In [126]:
# Preview the state person time table (it carries a `cause` column in addition to the strata).
ntd_spt.head(n=5)

Unnamed: 0,year,age_group,sex,cause,folic_acid_fortification_group,vitamin_a_fortification_group,measure,input_draw,scenario,value
0,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,baseline,0.0
1,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,folic_acid_fortification_scale_up,0.0
2,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,357,vitamin_a_fortification_scale_up,0.0
3,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,602,baseline,0.0
4,2020,1_to_4,female,diarrheal_diseases,covered,covered,person_time,602,folic_acid_fortification_scale_up,0.0


In [125]:
# Total person time table: the denominator for the prevalence calculation below.
ntd_pt = ntd_data['person_time']
# Preview the first rows; unlike the state person time table, there is no `cause` column here.
ntd_pt.head(n=5)

Unnamed: 0,year,age_group,sex,folic_acid_fortification_group,vitamin_a_fortification_group,measure,input_draw,scenario,value
0,2020,1_to_4,female,covered,covered,person_time,357,baseline,0.0
1,2020,1_to_4,female,covered,covered,person_time,357,folic_acid_fortification_scale_up,0.0
2,2020,1_to_4,female,covered,covered,person_time,357,vitamin_a_fortification_scale_up,0.0
3,2020,1_to_4,female,covered,covered,person_time,602,baseline,0.0
4,2020,1_to_4,female,covered,covered,person_time,602,folic_acid_fortification_scale_up,0.0


### Divide state person time by person time to get prevalence

Stratify only by year in both numerator and denominator (i.e. aggregate over all other stratification variables), and broadcast over cause states to compute all prevalences at once. Use a multiplier of 1 to just get ratios in [0,1].

In [117]:
# Prevalence = state person time / total person time, computed for every cause state at once.
# Argument roles are annotated from the markdown description of this step.
prevalence = lop.rate_or_ratio(
    ntd_spt,    # numerator: state person time
    ntd_pt,     # denominator: total person time
    ['year'],   # numerator stratification (aggregate over everything else)
    ['year'],   # denominator stratification (aggregate over everything else)
    1,          # multiplier of 1, i.e. a plain ratio
    ['cause'],  # broadcast over cause states
)
prevalence.head(n=5)

Unnamed: 0,year,input_draw,scenario,cause,value
0,2020,357,baseline,diarrheal_diseases,0.174887
1,2020,357,baseline,lower_respiratory_infections,0.029063
2,2020,357,baseline,measles,0.004404
3,2020,357,baseline,neural_tube_defects,0.000516
4,2020,357,baseline,susceptible_to_diarrheal_diseases,0.851862


### Prevalences of "with condition" and "susceptible" add up to more than 1 for diarrheal diseases

Same problem we had with VAD above. This looks like a consistent pattern.

In [128]:
# Keep every state whose name mentions diarrheal diseases (both "with condition" and "susceptible").
prevalence.loc[prevalence['cause'].str.contains('diarrheal_diseases')].head(n=5)

Unnamed: 0,year,input_draw,scenario,cause,value
0,2020,357,baseline,diarrheal_diseases,0.174887
4,2020,357,baseline,susceptible_to_diarrheal_diseases,0.851862
10,2020,357,folic_acid_fortification_scale_up,diarrheal_diseases,0.174887
14,2020,357,folic_acid_fortification_scale_up,susceptible_to_diarrheal_diseases,0.851862
20,2020,357,vitamin_a_fortification_scale_up,diarrheal_diseases,0.158537


### Prevalence of "susceptible to neural tube defects" by itself is greater than 1 in draw 357 -- that's no good

In [122]:
# Keep every state whose name mentions neural tube defects (both "with condition" and "susceptible").
ntd_prevalence = prevalence.loc[prevalence['cause'].str.contains('neural_tube_defects')]
ntd_prevalence.head(n=5)

Unnamed: 0,year,input_draw,scenario,cause,value
3,2020,357,baseline,neural_tube_defects,0.000516
7,2020,357,baseline,susceptible_to_neural_tube_defects,1.026232
13,2020,357,folic_acid_fortification_scale_up,neural_tube_defects,0.000516
17,2020,357,folic_acid_fortification_scale_up,susceptible_to_neural_tube_defects,1.026232
23,2020,357,vitamin_a_fortification_scale_up,neural_tube_defects,0.000516


In [131]:
# Summarize across draws for each (year, scenario, cause).
# Request the median explicitly instead of passing an empty percentile list and relying on
# describe() to add the 50% column implicitly (newer pandas releases may not add it).
# With 3 draws per group, min / 50% / max are exactly the three draw values.
(
    ntd_prevalence
    .groupby(['year', 'scenario', 'cause'])['value']
    .describe(percentiles=[0.5])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,50%,max
year,scenario,cause,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,baseline,neural_tube_defects,3.0,0.000519,3.4e-05,0.000487,0.000516,0.000554
2020,baseline,susceptible_to_neural_tube_defects,3.0,1.026322,0.000814,1.025557,1.026232,1.027177
2020,folic_acid_fortification_scale_up,neural_tube_defects,3.0,0.000519,3.4e-05,0.000487,0.000516,0.000554
2020,folic_acid_fortification_scale_up,susceptible_to_neural_tube_defects,3.0,1.026322,0.000814,1.025557,1.026232,1.027177
2020,vitamin_a_fortification_scale_up,neural_tube_defects,3.0,0.000519,3.4e-05,0.000487,0.000516,0.000554
2020,vitamin_a_fortification_scale_up,susceptible_to_neural_tube_defects,3.0,1.026066,0.000689,1.025458,1.025925,1.026815
2021,baseline,neural_tube_defects,3.0,0.000451,2.7e-05,0.000428,0.000444,0.00048
2021,baseline,susceptible_to_neural_tube_defects,3.0,0.991972,0.000571,0.991578,0.991712,0.992627
2021,folic_acid_fortification_scale_up,neural_tube_defects,3.0,0.000451,2.7e-05,0.000428,0.000444,0.00048
2021,folic_acid_fortification_scale_up,susceptible_to_neural_tube_defects,3.0,0.991972,0.000571,0.991578,0.991712,0.992627
