In [1]:
import pandas as pd, numpy as np
# import matplotlib.pyplot as plt
import lsff_output_processing as lop

!whoami
!date

ndbs
Fri Mar 27 14:53:51 PDT 2020


In [2]:
%load_ext autoreload
%autoreload 2

## 1. Load all count space data tables and see what they are

In [3]:
# Directory that the loader reads the v4.0 folic acid fortification results from
base_directory = '/share/costeffectiveness/results/vivarium_conic_lsff/v4.0_folic_acid_fortification'

# Run-date string to use for each location (handed to the loader together with base_directory)
locations_rundates = dict(
    Ethiopia='2020_03_18_23_04_26',
    India='2020_03_18_23_04_36',
    Nigeria='2020_03_18_23_04_42',
)

In [5]:
# Load every transformed count-space table for the configured locations and run dates.
# Tables are looked up by (location, table_name) tuples, e.g. ('ethiopia', 'births').
data = lop.load_all_transformed_count_data(base_directory, locations_rundates)

In [6]:
# List the (location, table) keys whose location is Ethiopia
list(filter(lambda key: key[0] == 'ethiopia', data.keys()))

[('ethiopia', 'gestational_age'),
 ('ethiopia', 'transition_count'),
 ('ethiopia', 'deaths'),
 ('ethiopia', 'state_person_time'),
 ('ethiopia', 'births_with_ntd'),
 ('ethiopia', 'population'),
 ('ethiopia', 'person_time'),
 ('ethiopia', 'ylls'),
 ('ethiopia', 'ylds'),
 ('ethiopia', 'births'),
 ('ethiopia', 'birth_weight')]

## 2. Get data to compute NTD birth prevalence in Ethiopia

### First load NTD births and do a quick data check

In [8]:
data[('ethiopia', 'births_with_ntd')].head()

Unnamed: 0,year,sex,fortification_group,measure,input_draw,scenario,value
0,2020,female,False,live_births_with_ntds,21,baseline,42.0
1,2020,female,False,live_births_with_ntds,21,folic_acid_fortification_scale_up,40.0
2,2020,female,False,live_births_with_ntds,29,baseline,38.0
3,2020,female,False,live_births_with_ntds,29,folic_acid_fortification_scale_up,38.0
4,2020,female,False,live_births_with_ntds,55,baseline,27.0


In [71]:
data[('ethiopia', 'births_with_ntd')].groupby(['scenario']).value.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
baseline,750.0,24.166667,36.209673,0.0,0.0,0.0,58.0,121.0
folic_acid_fortification_scale_up,750.0,22.569333,29.991763,0.0,0.0,5.0,46.0,107.0


### Now load all live births and do a quick data check

In [9]:
data[('ethiopia', 'births')].head()

Unnamed: 0,year,sex,fortification_group,measure,input_draw,scenario,value
0,2020,female,False,live_births,21,baseline,21508.0
1,2020,female,False,live_births,21,folic_acid_fortification_scale_up,20577.0
2,2020,female,False,live_births,29,baseline,21827.0
3,2020,female,False,live_births,29,folic_acid_fortification_scale_up,20978.0
4,2020,female,False,live_births,55,baseline,21611.0


In [72]:
data[('ethiopia', 'births')].groupby(['scenario']).value.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
baseline,750.0,13401.716,19519.8375,0.0,0.0,1.0,42783.5,46484.0
folic_acid_fortification_scale_up,750.0,13401.716,16103.652463,0.0,0.0,4788.0,29566.5,46042.0


### See how many draws there are (25)

In [67]:
data[('ethiopia', 'births')].input_draw.unique()

array([ 21,  29,  55,  78, 155, 223, 232, 254, 307, 357, 394, 417, 482,
       514, 524, 533, 602, 620, 629, 650, 674, 680, 733, 829, 946])

In [68]:
data[('ethiopia', 'births')].input_draw.nunique()

25

### These rows have no births, hence will result in NaN when computing prevalence

Note that there will be zero births with unknown fortification status since `'unknown'` only gets assigned to simulants alive at the beginning.

Sometimes there are also zero births with fortification status = `'true'`. This makes sense because there are fewer people who are fortified than unfortified.

### Ahh, I realized later that I should be dividing by the total population, not the stratified population, so this becomes irrelevant, as there shouldn't be any draws with zero births overall

In [42]:
data[('ethiopia', 'births')].query('value == 0').head()

Unnamed: 0,year,sex,fortification_group,measure,input_draw,scenario,value
50,2020,female,True,live_births,21,baseline,0.0
52,2020,female,True,live_births,29,baseline,0.0
60,2020,female,True,live_births,223,baseline,0.0
64,2020,female,True,live_births,254,baseline,0.0
66,2020,female,True,live_births,307,baseline,0.0


In [43]:
data[('ethiopia', 'births')].query('value == 0').shape

(612, 7)

In [52]:
data[('ethiopia', 'births')].query('value == 0').fortification_group.unique()

array(['true', 'unknown'], dtype=object)

## 3. Divide NTD births by all live births to get NTD birth prevalence (per 1000 live births), take 1

#### This is the wrong way to do this -- the denominator is the stratified population, but it should be the total population. See below.

### Define index columns to line up data and divide

Omit the `'measure'` column because this just identifies the table; we need the values in all index columns to be the same between tables.

In [45]:
index_columns = ['year', 'sex', 'fortification_group', 'input_draw', 'scenario']

In [57]:
# Columns that identify a row in both tables. 'measure' is left out because it
# only names the table and therefore never matches between the two.
# (Defined again here so this cell can be re-run on its own.)
index_columns = ['year', 'sex', 'fortification_group', 'input_draw', 'scenario']

# Standard column order for the output table
all_columns_in_order = ['year', 'age_group', 'sex',
                        'cause', 'fortification_group', 'measure',
                        'input_draw', 'scenario', 'value'
                       ]

# Pull the NTD-birth and live-birth counts, indexed identically so the division
# below lines rows up by label rather than by position
ntd_births = data[('ethiopia', 'births_with_ntd')].set_index(index_columns).value
live_births = data[('ethiopia', 'births')].set_index(index_columns).value

# NTD births per 1000 live births within each stratum
ntd_birth_prevalence = 1000 * ntd_births / live_births

# Strata with no live births give 0/0 = NaN (or +/-inf when the numerator is nonzero).
# Neither is a meaningful prevalence, so drop both. Reassign instead of using
# inplace=True so the cell does not mutate an object in place.
ntd_birth_prevalence = ntd_birth_prevalence.replace([np.inf, -np.inf], np.nan).dropna()

# Put data back in standard form: add the descriptive columns and fix the column order
ntd_birth_prevalence = (
    ntd_birth_prevalence
    .reset_index()
    .assign(age_group='birth', measure='prevalence', cause='neural_tube_defects')
    .loc[:, all_columns_in_order]
)

ntd_birth_prevalence.head()

Unnamed: 0,year,age_group,sex,cause,fortification_group,measure,input_draw,scenario,value
0,2020,birth,female,neural_tube_defects,False,prevalence,21,baseline,1.952762
1,2020,birth,female,neural_tube_defects,False,prevalence,21,folic_acid_fortification_scale_up,1.943918
2,2020,birth,female,neural_tube_defects,False,prevalence,29,baseline,1.740963
3,2020,birth,female,neural_tube_defects,False,prevalence,29,folic_acid_fortification_scale_up,1.811421
4,2020,birth,female,neural_tube_defects,False,prevalence,55,baseline,1.249364


In [58]:
ntd_birth_prevalence.shape

(888, 9)

## 4. Get separate dataframes for baseline and intervention, take 1

In [59]:
# Split the prevalence table into one frame per scenario using boolean masks
is_baseline = ntd_birth_prevalence['scenario'] == 'baseline'
is_intervention = ntd_birth_prevalence['scenario'] == 'folic_acid_fortification_scale_up'

baseline_ntd_birth_prevalence = ntd_birth_prevalence[is_baseline]
intervention_ntd_birth_prevalence = ntd_birth_prevalence[is_intervention]

intervention_ntd_birth_prevalence.head()

Unnamed: 0,year,age_group,sex,cause,fortification_group,measure,input_draw,scenario,value
1,2020,birth,female,neural_tube_defects,False,prevalence,21,folic_acid_fortification_scale_up,1.943918
3,2020,birth,female,neural_tube_defects,False,prevalence,29,folic_acid_fortification_scale_up,1.811421
5,2020,birth,female,neural_tube_defects,False,prevalence,55,folic_acid_fortification_scale_up,1.203979
7,2020,birth,female,neural_tube_defects,False,prevalence,78,folic_acid_fortification_scale_up,1.736955
9,2020,birth,female,neural_tube_defects,False,prevalence,155,folic_acid_fortification_scale_up,1.877982


## 5. Calculate reduction in NTDs due to FA fortification, take 1

### This is no good... birth prevalence *increases* on average in the intervention scenario

In [61]:
# Match baseline and intervention rows on everything except the scenario itself
index_columns = ['year', 'sex', 'fortification_group', 'input_draw']

baseline = baseline_ntd_birth_prevalence.set_index(index_columns)['value']
intervention = intervention_ntd_birth_prevalence.set_index(index_columns)['value']

# Baseline minus intervention: a positive value means prevalence fell under the intervention.
# Rows present in only one scenario come out as NaN.
delta_ntd_birth_prevalence = baseline.sub(intervention).reset_index()
delta_ntd_birth_prevalence.head()

Unnamed: 0,year,sex,fortification_group,input_draw,value
0,2020,female,False,21,0.008844
1,2020,female,False,29,-0.070458
2,2020,female,False,55,0.045384
3,2020,female,False,78,0.114725
4,2020,female,False,155,-0.094309


### Change in birth prevalence, take 1

In [62]:
delta_ntd_birth_prevalence.groupby(['year', 'fortification_group']).value.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
year,fortification_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020,False,50.0,-0.002132,0.152236,-0.340863,-0.071459,-0.00453,0.072496,0.392787
2020,True,28.0,-0.777194,1.089774,-2.711864,-1.371841,-0.860357,-0.413497,1.948959
2021,False,50.0,-0.041211,0.162825,-0.936257,-0.052708,0.00092,0.02463,0.120622
2021,True,29.0,-0.499957,0.958946,-1.605652,-1.110289,-0.74549,-0.387597,2.342993
2022,False,50.0,-0.008183,0.150442,-0.863992,-0.048877,0.001708,0.047203,0.324996
2022,True,28.0,-0.358286,1.594296,-1.670379,-1.269272,-1.037058,0.000277,4.387192
2023,False,50.0,0.03607,0.12242,-0.20787,-0.02245,0.013145,0.064808,0.573462
2023,True,26.0,-0.805746,0.54442,-1.745581,-1.160619,-0.827244,-0.503961,0.353953
2024,False,50.0,0.001474,0.083385,-0.280974,-0.049146,0.000151,0.05038,0.256157
2024,True,27.0,-0.316726,2.222321,-1.74482,-1.313443,-0.847314,-0.471629,9.961712


## 6. Check birth prevalence in baseline and intervention scenarios, take 1

### The prevalence is *higher* in the 'true' groups in intervention vs. baseline

Why in the world would that happen???

Note that in baseline, about 45% of the rows with `fortification_group=true` have been dropped because there were no births in this group.

### Hmm, could this be an example of Simpson's paradox? I think I should probably be aggregating over fortification group before computing the prevalence...

### Baseline, take 1:

In [63]:
baseline_ntd_birth_prevalence.groupby(['year', 'fortification_group']).value.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
year,fortification_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020,False,50.0,1.770523,0.348001,0.785256,1.578786,1.784775,1.98564,2.642659
2020,True,28.0,0.372384,0.975955,0.0,0.0,0.0,0.0,3.577818
2021,False,50.0,1.807914,0.290128,1.155588,1.636544,1.840816,1.987157,2.311483
2021,True,29.0,0.474882,0.985602,0.0,0.0,0.0,0.0,3.427592
2022,False,50.0,1.810242,0.321526,1.112218,1.618111,1.787337,2.017057,2.521432
2022,True,28.0,0.729153,1.550346,0.0,0.0,0.0,0.236295,5.586592
2023,False,50.0,1.839144,0.324154,1.182197,1.658862,1.78883,1.980955,2.6372
2023,True,26.0,0.264758,0.568303,0.0,0.0,0.0,0.0,1.712329
2024,False,50.0,1.768315,0.295951,1.133892,1.567911,1.712076,1.900107,2.561272
2024,True,27.0,0.798858,2.184697,0.0,0.0,0.0,0.407498,10.869565


### Intervention, take 1

In [64]:
intervention_ntd_birth_prevalence.groupby(['year', 'fortification_group']).value.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
year,fortification_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020,False,50.0,1.772656,0.380281,0.789633,1.516286,1.738711,2.072419,2.623754
2020,True,50.0,1.00927,0.893106,0.0,0.340287,0.934473,1.378793,3.802281
2021,False,50.0,1.849125,0.362352,1.101857,1.616221,1.84212,2.03296,3.059976
2021,True,50.0,0.844087,0.541967,0.0,0.578941,0.891288,1.149049,2.75634
2022,False,50.0,1.818425,0.359713,1.131222,1.532991,1.832713,1.991313,3.090447
2022,True,50.0,1.060309,0.558955,0.0,0.908697,1.106737,1.411865,2.318393
2023,False,50.0,1.803074,0.362444,1.098724,1.602626,1.776263,1.988668,2.751072
2023,True,50.0,1.131808,0.64881,0.0,0.783071,1.112779,1.464758,3.584229
2024,False,50.0,1.766841,0.29662,1.048312,1.578355,1.733595,1.963915,2.542573
2024,True,50.0,1.187002,0.915963,0.0,0.686636,1.009296,1.468426,5.277045


In [70]:
# Share of the 50 possible baseline rows (25 draws x 2 sexes) with fortification_group=true
# that were dropped in 2021: only 29 rows remain in the baseline table for that year,
# versus all 50 in the intervention table.
(50-29)/50

0.42

## 7. Recompute NTD prevalence, aggregating over fortification group and sex first

In [75]:
data[('ethiopia', 'births_with_ntd')].head()

Unnamed: 0,year,sex,fortification_group,measure,input_draw,scenario,value
0,2020,female,False,live_births_with_ntds,21,baseline,42.0
1,2020,female,False,live_births_with_ntds,21,folic_acid_fortification_scale_up,40.0
2,2020,female,False,live_births_with_ntds,29,baseline,38.0
3,2020,female,False,live_births_with_ntds,29,folic_acid_fortification_scale_up,38.0
4,2020,female,False,live_births_with_ntds,55,baseline,27.0


In [83]:
# Test aggregation:
# summing within each (year, scenario, input_draw) collapses sex and fortification group
groupby_cols = ['year', 'scenario', 'input_draw']
(
    data[('ethiopia', 'births_with_ntd')]
    .groupby(groupby_cols)['value']
    .sum()
    .head(30)
)

year  scenario                           input_draw
2020  baseline                           21             81.0
                                         29             75.0
                                         55             72.0
                                         78             89.0
                                         155            86.0
                                         223            87.0
                                         232            77.0
                                         254            49.0
                                         307            79.0
                                         357           100.0
                                         394            78.0
                                         417            72.0
                                         482            88.0
                                         514            86.0
                                         524            82.0
                                 

### Do the division after aggregating

In [102]:
# Columns kept as the index; everything else (sex, fortification_group) is summed away
groupby_cols = ['year', 'scenario', 'input_draw']

# Total NTD births and total live births per (year, scenario, draw)
ntd_births = data[('ethiopia', 'births_with_ntd')].groupby(groupby_cols)['value'].sum()
live_births = data[('ethiopia', 'births')].groupby(groupby_cols)['value'].sum()

# NTD births per 1000 live births, returned as a dataframe.
# The multiplication is applied before the division, matching 1000 * ntd / live.
ntd_birth_prevalence = ntd_births.mul(1000).div(live_births).reset_index()

# See what we got
ntd_birth_prevalence.head()

Unnamed: 0,year,scenario,input_draw,value
0,2020,baseline,21,1.823626
1,2020,baseline,29,1.673509
2,2020,baseline,55,1.612361
3,2020,baseline,78,1.983685
4,2020,baseline,155,1.901645


### Check shape and check that we have no NaN's this time

In [103]:
ntd_birth_prevalence.shape

(250, 4)

In [86]:
# Show any rows whose prevalence is NaN.
# An empty frame means there are no NaN's this time -- good
ntd_birth_prevalence[ntd_birth_prevalence['value'].isna()]

Unnamed: 0,year,scenario,input_draw,value


## 8. Get separate baseline and intervention dataframes, take 2

In [88]:
# One frame per scenario, selected with boolean masks on the 'scenario' column
baseline_ntd_birth_prevalence = ntd_birth_prevalence.loc[
    ntd_birth_prevalence['scenario'].eq('baseline')
]
intervention_ntd_birth_prevalence = ntd_birth_prevalence.loc[
    ntd_birth_prevalence['scenario'].eq('folic_acid_fortification_scale_up')
]

intervention_ntd_birth_prevalence.head()

Unnamed: 0,year,scenario,input_draw,value
25,2020,folic_acid_fortification_scale_up,21,1.801112
26,2020,folic_acid_fortification_scale_up,29,1.673509
27,2020,folic_acid_fortification_scale_up,55,1.500392
28,2020,folic_acid_fortification_scale_up,78,1.805376
29,2020,folic_acid_fortification_scale_up,155,1.857421


## 9. Get stats across draws for baseline and intervention, take 2

In [89]:
baseline_ntd_birth_prevalence.groupby('year').value.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,25.0,1.771674,0.240507,1.092311,1.617868,1.770308,1.920715,2.234237
2021,25.0,1.808447,0.185874,1.415476,1.692275,1.7994,1.91333,2.153081
2022,25.0,1.813058,0.193964,1.483713,1.673304,1.85728,1.929657,2.185241
2023,25.0,1.838943,0.208964,1.361003,1.713825,1.827088,1.94533,2.190962
2024,25.0,1.768331,0.208949,1.356274,1.612632,1.758197,1.817837,2.290938


In [90]:
intervention_ntd_birth_prevalence.groupby('year').value.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,25.0,1.659874,0.235887,1.092311,1.500392,1.673509,1.853713,2.122526
2021,25.0,1.696948,0.177045,1.246967,1.573661,1.674398,1.802145,2.063834
2022,25.0,1.695644,0.172731,1.350773,1.578497,1.727871,1.843484,1.975353
2023,25.0,1.706696,0.250388,1.124796,1.607428,1.715016,1.855889,2.146019
2024,25.0,1.648974,0.241684,1.060895,1.525392,1.661704,1.762738,2.16801


## 10. Compute difference between baseline and intervention, take 2

In [91]:
# Match baseline and intervention rows by year and draw
index_columns = ['year', 'input_draw']

baseline = baseline_ntd_birth_prevalence.set_index(index_columns)['value']
intervention = intervention_ntd_birth_prevalence.set_index(index_columns)['value']

# Baseline minus intervention: a positive value is a drop in NTD birth prevalence
delta_ntd_birth_prevalence = baseline.sub(intervention).reset_index()
delta_ntd_birth_prevalence.head()

Unnamed: 0,year,input_draw,value
0,2020,21,0.022514
1,2020,29,0.0
2,2020,55,0.11197
3,2020,78,0.178309
4,2020,155,0.044224


## 11. Get stats by year for the difference, take 2

### That's weird -- there's a decrease in 2020, but that year should be the same as baseline

With common random numbers, shouldn't the intervention scenario in 2020 be exactly the same as the baseline scenario, so the difference should be identically zero at the draw level? 

### But at least now the intervention has a positive effect -- that's a relief

### Change in birth prevalence, take 2:

In [92]:
delta_ntd_birth_prevalence.groupby('year').value.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,25.0,0.111799,0.126056,0.0,0.022514,0.067092,0.155424,0.583117
2021,25.0,0.111498,0.117017,0.0,0.03358,0.078783,0.156778,0.491082
2022,25.0,0.117413,0.13473,0.0,0.033349,0.067432,0.123408,0.581747
2023,25.0,0.132248,0.133108,0.011249,0.033627,0.08944,0.179043,0.593691
2024,25.0,0.119356,0.128751,0.0,0.033265,0.066805,0.156956,0.569533


## 12. Try recomputing stratified results, but with correct denominator - this is take 3!

Woo hoo! Broadcasting!

In [94]:
data[('ethiopia', 'births_with_ntd')].head()

Unnamed: 0,year,sex,fortification_group,measure,input_draw,scenario,value
0,2020,female,False,live_births_with_ntds,21,baseline,42.0
1,2020,female,False,live_births_with_ntds,21,folic_acid_fortification_scale_up,40.0
2,2020,female,False,live_births_with_ntds,29,baseline,38.0
3,2020,female,False,live_births_with_ntds,29,folic_acid_fortification_scale_up,38.0
4,2020,female,False,live_births_with_ntds,55,baseline,27.0


In [104]:
# Index levels shared by the numerator and the denominator
groupby_cols = ['year', 'scenario', 'input_draw']

# Extra levels kept only on the numerator (NTD births); the division
# broadcasts the total live births across them
broadcast_cols = ['sex', 'fortification_group']

# Denominator: total live births per (year, scenario, draw),
# summed over sex and fortification_group
live_births = data[('ethiopia', 'births')].groupby(groupby_cols)['value'].sum()

# Numerator: NTD births, left stratified by sex and fortification_group
ntd_births = (
    data[('ethiopia', 'births_with_ntd')]
    .set_index([*groupby_cols, *broadcast_cols])['value']
)

# NTD births per 1000 total live births, as a dataframe
ntd_birth_prevalence = (1000 * ntd_births / live_births).reset_index()

# See what we got
ntd_birth_prevalence.head()

Unnamed: 0,year,scenario,input_draw,sex,fortification_group,value
0,2020,baseline,21,female,false,0.945584
1,2020,baseline,21,female,true,0.0
2,2020,baseline,21,female,unknown,0.0
3,2020,baseline,21,male,false,0.878042
4,2020,baseline,21,male,true,0.0


### Check the shape and check that we have no NaN's

In [100]:
ntd_birth_prevalence.shape

(1500, 6)

In [101]:
ntd_birth_prevalence.query('value != value')

Unnamed: 0,year,scenario,input_draw,sex,fortification_group,value


## 13. There are zero births with fortification group unknown, so get rid of these rows

In [107]:
# Total prevalence contributed by the 'unknown' fortification group
unknown_rows = ntd_birth_prevalence['fortification_group'] == 'unknown'
ntd_birth_prevalence.loc[unknown_rows, 'value'].sum()

0.0

In [109]:
# Keep only the rows whose fortification group is not 'unknown'
known_status = ntd_birth_prevalence['fortification_group'] != 'unknown'
ntd_birth_prevalence = ntd_birth_prevalence[known_status]
ntd_birth_prevalence.shape

(1000, 6)

## 14. Get separate baseline and intervention dataframes, take 3

In [110]:
# Separate the stratified prevalence table by scenario
scenario_column = ntd_birth_prevalence['scenario']

baseline_ntd_birth_prevalence = ntd_birth_prevalence[scenario_column == 'baseline']
intervention_ntd_birth_prevalence = ntd_birth_prevalence[
    scenario_column == 'folic_acid_fortification_scale_up'
]

intervention_ntd_birth_prevalence.head()

Unnamed: 0,year,scenario,input_draw,sex,fortification_group,value
150,2020,folic_acid_fortification_scale_up,21,female,False,0.900556
151,2020,folic_acid_fortification_scale_up,21,female,True,0.022514
153,2020,folic_acid_fortification_scale_up,21,male,False,0.855528
154,2020,folic_acid_fortification_scale_up,21,male,True,0.022514
156,2020,folic_acid_fortification_scale_up,29,female,False,0.847911


## 15. View stratified stats for baseline and intervention, take 3

### Baseline, take 3:

In [111]:
# Summary statistics across draws for each (year, sex, fortification group) in baseline
(
    baseline_ntd_birth_prevalence
    .groupby(['year', 'sex', 'fortification_group'])['value']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
year,sex,fortification_group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020,female,False,25.0,0.768716,0.152546,0.378965,0.694678,0.784965,0.873852,1.027749
2020,female,True,25.0,0.005384,0.016237,0.0,0.0,0.0,0.0,0.067283
2020,male,False,25.0,0.996672,0.139714,0.713346,0.878042,1.007726,1.073465,1.328649
2020,male,True,25.0,0.000901,0.004504,0.0,0.0,0.0,0.0,0.022519
2021,female,False,25.0,0.803094,0.131255,0.561697,0.672714,0.796545,0.918274,1.007016
2021,female,True,25.0,0.003125,0.009936,0.0,0.0,0.0,0.0,0.044608
2021,male,False,25.0,0.998206,0.111258,0.794628,0.93249,0.996217,1.081744,1.187037
2021,male,True,25.0,0.004022,0.012008,0.0,0.0,0.0,0.0,0.055805
2022,female,False,25.0,0.77537,0.123204,0.540309,0.711435,0.785326,0.844262,1.098224
2022,female,True,25.0,0.00493,0.01484,0.0,0.0,0.0,0.0,0.067125


### Intervention, take 3:

In [112]:
# Summary statistics across draws for each (year, sex, fortification group) in the intervention
(
    intervention_ntd_birth_prevalence
    .groupby(['year', 'sex', 'fortification_group'])['value']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
year,sex,fortification_group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020,female,False,25.0,0.643533,0.176533,0.336413,0.532611,0.58146,0.81815,0.919674
2020,female,True,25.0,0.07962,0.087945,0.0,0.022232,0.044476,0.133153,0.358841
2020,male,False,25.0,0.850853,0.17391,0.403696,0.760405,0.849827,0.972935,1.140327
2020,male,True,25.0,0.085869,0.09219,0.0,0.0,0.067027,0.133153,0.31204
2021,female,False,25.0,0.69384,0.13525,0.381954,0.615502,0.707794,0.772348,0.921915
2021,female,True,25.0,0.062675,0.069578,0.0,0.011228,0.044623,0.101105,0.301346
2021,male,False,25.0,0.852232,0.175039,0.479921,0.736607,0.828185,0.976048,1.154825
2021,male,True,25.0,0.088201,0.091979,0.0,0.011207,0.056141,0.157275,0.357151
2022,female,False,25.0,0.65756,0.113326,0.393976,0.615392,0.668993,0.720153,0.821693
2022,female,True,25.0,0.074813,0.076461,0.0,0.022377,0.044894,0.112142,0.302061


## 16. Compute the difference between baseline and intervention, take 3

In [113]:
# Match baseline and intervention rows by year, draw, sex and fortification group
index_columns = ['year', 'input_draw', 'sex', 'fortification_group']

baseline = baseline_ntd_birth_prevalence.set_index(index_columns)['value']
intervention = intervention_ntd_birth_prevalence.set_index(index_columns)['value']

# Baseline minus intervention within each stratum
delta_ntd_birth_prevalence = baseline.sub(intervention).reset_index()
delta_ntd_birth_prevalence.head()

Unnamed: 0,year,input_draw,sex,fortification_group,value
0,2020,21,female,False,0.045028
1,2020,21,female,True,-0.022514
2,2020,21,male,False,0.022514
3,2020,21,male,True,-0.022514
4,2020,29,female,False,0.0


### View stratified stats for difference, take 3

Interesting... all of the mean differences in the fortified groups are negative, whereas when we aggregate (see below), the overall differences are positive.

In [114]:
# Summary statistics across draws of the stratified baseline-minus-intervention difference
(
    delta_ntd_birth_prevalence
    .groupby(['year', 'sex', 'fortification_group'])['value']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
year,sex,fortification_group,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020,female,False,25.0,0.125183,0.130505,0.0,0.022364,0.066714,0.201284,0.448551
2020,female,True,25.0,-0.074236,0.075801,-0.291558,-0.133153,-0.044476,-0.022232,0.0
2020,male,False,25.0,0.14582,0.158761,0.0,0.044674,0.089738,0.223639,0.717682
2020,male,True,25.0,-0.084968,0.090873,-0.31204,-0.133153,-0.067027,0.0,0.0
2021,female,False,25.0,0.109253,0.106384,0.0,0.022627,0.078125,0.179175,0.457599
2021,female,True,25.0,-0.059549,0.064855,-0.279024,-0.089216,-0.044623,-0.011228,0.0
2021,male,False,25.0,0.145973,0.147871,0.0,0.04443,0.089548,0.247147,0.613853
2021,male,True,25.0,-0.084179,0.082628,-0.301346,-0.157275,-0.056141,-0.011207,0.0
2022,female,False,25.0,0.11781,0.124428,0.0,0.03345,0.067432,0.146334,0.503435
2022,female,True,25.0,-0.069884,0.067824,-0.234936,-0.100747,-0.044894,-0.022377,0.0


## 17. Check that if we aggregate the stratified difference df (take 3), we get the same stats as before (take 2)

Yes, it looks the same.

In [129]:
# Add the stratified differences back up within each (year, draw) ...
delta_per_draw = (
    delta_ntd_birth_prevalence
    .groupby(['year', 'input_draw'])['value']
    .sum()
    .reset_index()
)

# ... then summarise across draws by year, for comparison with the take 2 table
delta_per_draw.groupby('year')['value'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,25.0,0.111799,0.126056,-7.285839000000001e-17,0.022514,0.067092,0.155424,0.583117
2021,25.0,0.111498,0.117017,0.0,0.03358,0.078783,0.156778,0.491082
2022,25.0,0.117413,0.13473,-6.245005e-17,0.033349,0.067432,0.123408,0.581747
2023,25.0,0.132248,0.133108,0.01124897,0.033627,0.08944,0.179043,0.593691
2024,25.0,0.119356,0.128751,2.0816680000000002e-17,0.033265,0.066805,0.156956,0.569533
