In [22]:
import pandas as pd, numpy as np
from db_queries import get_ids, get_outputs, get_location_metadata
from get_draws.api import get_draws
import scipy.stats as sp

# Notebook overview

This notebook attempts to recreate the methodology used to calculate the iron deficiency risk factor exposure, as well as the population attributable fraction (PAF) of maternal disorders attributable to the iron deficiency risk factor.

The main source of this methodology was the code repository here: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse

Specific files and functions from this repo are referenced when relevant

This notebook has the following sections:

1. What is the "normal" hemoglobin value?
    - This needs to be investigated before exposure or PAF recreation
2. Exposure recreation
3. PAF recreation

In [6]:
# define relevant age group and location ids
ages = list(range(7,16))  # age_group_ids 7 through 15 inclusive
locs = 161  # a single location_id; see the candidate list below

# just bangladesh for now so code is faster
#[165, 161, 214, 182] # pakistan, bangladesh, nigeria, malawi

# note: always want sex_id=2

# 1. What is the "normal" hemoglobin value? 

This section is intended to investigate the "normal_hgb" values that are used in the GBD iron deficiency code that is then used in the recreation of the exposure

See the relevant scripts here:

- Calculate normal HGB: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/calculate_normal_hgb.py

- Compile normal HGB: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/compile_normal_hgb.py

In [40]:
# define the hgb file path that is used at the beginning of calculate_normal_hgb.py
# NOTE: this file is a recreation of the mean population hemoglobin value
    # see https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/make_new_hgb_file.py

hgb_file = '/ihme/scratch/users/jab0412/anemia_causal_attribution/gbd_round6/step4/runV2/hgb_d.h5'
# identifier columns that get_normal_hgb keeps alongside the draw columns
idcols = ['location_id', 'year_id', 'age_group_id', 'sex_id']

In [41]:
# this function comes from calculate_normal_hgb

def get_normal_hgb(year_id, sex_id, age_group_id):
    """Return the "normal" (counterfactual) hemoglobin draws for one year/sex/age.

    Adapted from calculate_normal_hgb.py. For each of the 1000 draws, the 95th
    percentile of ``hgb_<d>`` is taken across every row returned by the query
    (the query has no location filter), and each row's draw is then floored at
    that percentile: ``draw_<d> = max(hgb_<d>, 95th percentile of hgb_<d>)``.

    Parameters
    ----------
    year_id, sex_id, age_group_id
        Values interpolated into the ``where`` clause used to read ``hgb_file``.

    Returns
    -------
    pandas.DataFrame
        Columns ``draw_0`` .. ``draw_999`` followed by the ``idcols`` columns,
        one row per row read from ``hgb_file``, with the original index.

    Notes
    -----
    The upstream script writes this frame to a CSV in the run's ``cf_files``
    directory; here it is returned instead so it can be examined in the notebook.
    """
    hb_est = pd.read_hdf(
        hgb_file,
        where="year_id==%s & sex_id==%s & age_group_id==%s" % (year_id, sex_id, age_group_id))

    # Collect the draws in a dict and assemble the frame once. Inserting 2000
    # columns one by one into an already-wide DataFrame fragments it and is slow.
    draws = {}
    for d in range(1000):
        hgb = hb_est['hgb_{}'.format(d)]
        hgb_pop_normal = sp.scoreatpercentile(hgb, 95)
        # np.fmax ignores NaN on either side, matching the NaN-skipping
        # behaviour of the row-wise DataFrame.max(axis=1) used upstream.
        draws['draw_{}'.format(d)] = np.fmax(hgb, hgb_pop_normal)

    cf_hb = pd.concat(
        [pd.DataFrame(draws, index=hb_est.index), hb_est[idcols]],
        axis=1)

    return cf_hb

In [42]:
# get normal hgb for ages and years of interest using JAB's function
# Build every per-age frame first and concatenate once: DataFrame.append in a
# loop re-copies the accumulated frame each time and was removed in pandas 2.0.
normal_hgb = pd.concat([get_normal_hgb(2019, 2, a) for a in ages])
normal_hgb.head()

Unnamed: 0,draw_0,draw_1,draw_2,draw_3,draw_4,draw_5,draw_6,draw_7,draw_8,draw_9,...,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999,location_id,year_id,age_group_id,sex_id
1941136,144.414119,141.557344,144.895842,144.192577,144.455934,142.553863,142.629199,145.196461,143.542125,143.47654,...,143.085189,144.359127,142.752477,143.047014,143.170331,142.887473,1,2019,7,2
1941182,144.414119,141.557344,144.895842,144.192577,144.455934,142.553863,142.629199,145.196461,143.542125,143.47654,...,143.085189,144.359127,142.752477,143.047014,143.170331,142.887473,10,2019,7,2
1941228,144.414119,141.557344,144.895842,144.192577,144.455934,142.553863,142.629199,145.196461,143.542125,143.47654,...,143.085189,144.359127,142.752477,143.047014,143.170331,142.887473,100,2019,7,2
1941274,144.414119,143.406398,144.895842,144.192577,144.455934,142.553863,142.629199,145.196461,143.542125,143.47654,...,143.085189,145.862413,142.752477,143.047014,143.170331,142.887473,101,2019,7,2
1941320,144.414119,141.557344,144.895842,144.192577,144.455934,142.553863,142.629199,145.196461,143.542125,143.47654,...,143.085189,144.359127,142.752477,143.047014,143.170331,142.887473,102,2019,7,2


In [43]:
# select locations of interest and display mean values
# `locs` may be a single location_id or a list of them, so use isin rather than ==
normal = normal_hgb.loc[normal_hgb.location_id.isin(np.atleast_1d(locs))]
# everything that is not a draw column becomes part of the index
normal = normal.set_index([c for c in normal.columns if 'draw' not in c])
pd.DataFrame(normal.mean(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
location_id,year_id,age_group_id,sex_id,Unnamed: 4_level_1
161,2019,7,2,143.428059
161,2019,8,2,144.06506
161,2019,9,2,141.501402
161,2019,10,2,140.452812
161,2019,11,2,140.091244
161,2019,12,2,140.00771
161,2019,13,2,139.799311
161,2019,14,2,141.222556
161,2019,15,2,141.149991


In [44]:
# Pull the published TMREL draws for the iron deficiency risk factor (rei_id 95)
# so they can be compared with the recomputed "normal" hemoglobin above.
tmrel_prepped = (
    get_draws(
        'rei_id',
        95,
        source='tmrel',
        location_id=locs,
        age_group_id=ages,
        sex_id=2,
        year_id=2019,
        gbd_round_id=6,
        decomp_step='step4',
    )
    .set_index(['location_id', 'age_group_id'])
    .sort_index()
)
# keep only the draw columns
tmrel = tmrel_prepped[[c for c in tmrel_prepped.columns if 'draw' in c]]
tmrel.mean(axis=1).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,143.428059
161,8,144.06506
161,9,141.501402
161,10,140.452812
161,11,140.091244
161,12,140.00771
161,13,139.799311
161,14,141.222556
161,15,141.149991


In [45]:
# Draw-level difference between the recomputed "normal" hemoglobin and the TMREL.
# `normal` is indexed by location/year/age/sex and `tmrel` by location/age only,
# so this relies on pandas aligning the two frames on the index levels they share.
normal_error = normal.sub(tmrel)
normal_error.mean(axis=1).to_frame()

# approx. equal!

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
location_id,age_group_id,sex_id,year_id,Unnamed: 4_level_1
161,7,2,2019,4.547474e-16
161,8,2,2019,-1.98952e-16
161,9,2,2019,-1.705303e-16
161,10,2,2019,3.979039e-16
161,11,2,2019,-6.536993e-16
161,12,2,2019,-8.81073e-16
161,13,2,2019,-1.136868e-16
161,14,2,2019,-4.831691e-16
161,15,2,2019,-5.400125e-16


In [47]:
# for reference
def load_raw_hgb_file(year_id, sex_id, age_group_id):
    """Read the unmodified rows of ``hgb_file`` for one year/sex/age combination.

    The selection has no location condition, so every location stored in the
    file for that combination is returned.
    """
    selection = "year_id==%s & sex_id==%s & age_group_id==%s" % (year_id, sex_id, age_group_id)
    return pd.read_hdf(hgb_file, where=selection)

In [48]:
# load the raw hemoglobin rows for year_id 2019, sex_id 2, age_group_id 8
hbg = load_raw_hgb_file(2019, 2, 8)
hbg.head()

# returns all locations!!!
# (the query filters on year, sex and age only, so the percentile in
# get_normal_hgb is taken across every location in the file)

Unnamed: 0,age_group_id,hgb_0,hgb_1,hgb_10,hgb_100,hgb_101,hgb_102,hgb_103,hgb_104,hgb_105,...,hgb_996,hgb_997,hgb_998,hgb_999,location_id,metric_id,sex_id,year_id,hgb_mean,mean_hgb
1941137,8,126.800671,127.296138,127.294612,127.28686,127.12499,127.63418,127.424384,127.083429,127.078875,...,127.50901,127.530651,127.424075,127.593885,1,3,2,2019,127.311548,127.311548
1941183,8,120.951892,122.223625,120.958392,128.113481,124.901631,124.136395,119.230332,121.671543,124.942115,...,122.218205,122.134043,124.703358,125.395539,10,3,2,2019,121.786976,121.786976
1941229,8,136.43558,133.856935,134.95411,132.921559,134.299958,136.399689,137.867238,132.320584,133.720063,...,136.435643,131.998176,133.98472,137.358236,100,3,2,2019,134.240382,134.240382
1941275,8,136.702814,141.891961,136.650539,134.501396,134.149488,139.448067,134.97164,132.548972,137.61212,...,138.959322,141.552781,137.186665,139.402321,101,3,2,2019,137.747121,137.747121
1941321,8,136.411482,133.106478,134.796504,132.773303,134.314453,136.114885,138.137918,132.298567,133.356822,...,136.200921,131.105321,133.685276,137.167807,102,3,2,2019,133.913007,133.913007


In [50]:
# number of distinct locations returned by the unfiltered read
np.unique(hbg['location_id']).size

1082

## 1. Conclusions

The TMREL appears to be defined in a different way than expected based on the GBD 2019 methods appendix.

The methods appendix implies that the TMREL is the counterfactual mean hemoglobin value in the absence of iron deficiency (iron responsive causes). It is implied that this could be calculated by adding the cause-specific shifts for each cause multiplied by the prevalence of that cause to the population mean hemoglobin.

    Direct text from methods appendix: "The implied mean haemoglobin in the absence of iron deficiency is the theoretical minimum risk exposure level. This was calculated by summing cause-specific haemoglobin shift times prevalence for all causes classified as iron-responsive and then adding that sum to the the observed haemoglobin concentration for each population group."

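However, the code implies that, for each draw, the TMREL is the 95th percentile of the population mean hemoglobin estimates across ALL locations (i.e. GLOBAL) for a given year, sex, and age group. A location whose own mean hemoglobin is above that percentile keeps its own mean instead, because the code takes the maximum of the two.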

# 2. Exposure value recreation

The main resource for this section can be found here:

https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/nutrition_iron_exposure.py


In [7]:
# these are the output MEIDs for all of the post-cause attribution process EXCEPT nutrition_iron
# NOTE: both iron responsive and iron non-responsive are included

# this is based on the code used here: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/nutrition_iron_exposure.py
# meid list = all_but_ni

# meids are found here: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/in_out_meid_map.xlsx

out_meids = [23376,23381,23386,23377,23382,23387,23378,23383,23388,23379,23384,23389,1929,1930,1931,1925,1926,1927,2131,2132,2133,2065,2066,2067,2082,2083,2084,2113,2114,2115,2506,2507,2508,2119,2120,2121,2492,2493,2494,2495,2496,2497,2498,2499,2500,2502,2503,2504,2475,2476,2477,2478,2479,2480,2481,2482,2483,2485,2486,2487,2489,2490,2491,1666,1667,1668,19391,19392,19393,19395,19396,19397,19399,19400,19401,16314,16315,16316,1538,1539,1540,1522,1523,1524,1532,1533,1534,1476,1477,1478,19737,19738,19739,19798,19799,19800,18864,18865,18866,18861,18862,18863]

In [8]:
# this function takes > one hour
# consider adding "n_draws=100, downsample=True" (although it will still take a long time)

# get prevalence for the listed meids
# one row per modelable_entity_id x age group is returned, with the draws in wide format
# NOTE(review): measure_id=5 / metric_id=3 are presumably prevalence expressed as a rate -- confirm against the GBD shared ids

prev_out_meids = get_draws('modelable_entity_id',
                out_meids,
                source='epi',
                location_id=locs,
                age_group_id=ages,
                measure_id=5,
                metric_id=3,
                sex_id=2,
                year_id=2019,
                gbd_round_id=6,
                decomp_step='step4')
prev_out_meids.head()

Unnamed: 0,age_group_id,draw_0,draw_1,draw_10,draw_100,draw_101,draw_102,draw_103,draw_104,draw_105,...,draw_997,draw_998,draw_999,location_id,measure_id,metric_id,modelable_entity_id,sex_id,year_id,model_version_id
0,7,0.000119,0.000161,0.000233,0.000195,0.000126,0.000137,0.000182,0.000187,0.000228,...,0.000135,0.000162,0.000194,161,5,3,23376,2,2019,477179
1,8,0.001254,0.002088,0.002528,0.001723,0.002338,0.002029,0.002096,0.002056,0.00214,...,0.001576,0.001681,0.002354,161,5,3,23376,2,2019,477179
2,9,0.001761,0.001499,0.001818,0.001598,0.001864,0.001806,0.001756,0.001684,0.002094,...,0.001615,0.00164,0.002358,161,5,3,23376,2,2019,477179
3,10,0.002497,0.0019,0.002039,0.002067,0.002307,0.002056,0.001768,0.002176,0.002752,...,0.002137,0.001811,0.002537,161,5,3,23376,2,2019,477179
4,11,0.003643,0.001885,0.003595,0.003778,0.003384,0.003501,0.00116,0.002128,0.003261,...,0.003197,0.002392,0.002014,161,5,3,23376,2,2019,477179


In [10]:
# Sum the prevalence draws over every modelable entity in out_meids,
# leaving one row per location / age group.
prev_out_meids_prepped = (
    prev_out_meids
    .set_index(['location_id', 'age_group_id'])
    .filter(like='draw', axis='columns')
    .groupby(level=['location_id', 'age_group_id'])
    .sum()
)
prev_out_meids_prepped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,draw_0,draw_1,draw_10,draw_100,draw_101,draw_102,draw_103,draw_104,draw_105,draw_106,...,draw_990,draw_991,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999
location_id,age_group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
161,7,0.033643,0.030452,0.052822,0.040661,0.017504,0.020042,0.030547,0.046441,0.043081,0.013264,...,0.01276,0.040225,0.032009,0.039011,0.029637,0.026765,0.011013,0.026279,0.039433,0.030318
161,8,0.258993,0.175318,0.203707,0.195563,0.201525,0.221054,0.199382,0.212728,0.193495,0.148996,...,0.128737,0.184472,0.169736,0.132544,0.229969,0.184178,0.210443,0.222125,0.214408,0.165868
161,9,0.258855,0.263475,0.227584,0.224026,0.270875,0.21261,0.211004,0.237706,0.238578,0.265792,...,0.181383,0.224206,0.230482,0.231394,0.223914,0.181894,0.192972,0.246317,0.242226,0.197816
161,10,0.198068,0.197792,0.266979,0.22452,0.204767,0.237897,0.234428,0.205364,0.197251,0.262212,...,0.242865,0.239037,0.196298,0.200839,0.226485,0.215439,0.20748,0.202299,0.248412,0.222066
161,11,0.200105,0.220156,0.204572,0.142023,0.168391,0.170253,0.183018,0.212872,0.227689,0.212666,...,0.268122,0.186408,0.285276,0.1908,0.232297,0.201967,0.154692,0.189098,0.218735,0.242239


In [28]:
# mean across draws of the summed prevalence
prev_out_meids_prepped.mean(axis=1).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,0.036926
161,8,0.191546
161,9,0.215245
161,10,0.21828
161,11,0.216189
161,12,0.125846
161,13,0.111519
161,14,0.101191
161,15,0.108894


In [12]:
# TMREL draws (hemoglobin conc. if no iron deficiency) for rei_id 95.
# NOTE: this is the same as "normal_df" in the iron deficiency risk factor GBD code
    # this was verified in a different notebook (check_normal_hbg)
tmrel_prepped = (
    get_draws(
        'rei_id',
        95,
        source='tmrel',
        location_id=locs,
        age_group_id=ages,
        sex_id=2,
        year_id=2019,
        gbd_round_id=6,
        decomp_step='step4',
    )
    .set_index(['location_id', 'age_group_id'])
    .sort_index()
)

# keep only the draw columns
tmrel = tmrel_prepped[[c for c in tmrel_prepped.columns if 'draw' in c]]
tmrel.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,draw_0,draw_1,draw_10,draw_100,draw_101,draw_102,draw_103,draw_104,draw_105,draw_106,...,draw_990,draw_991,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999
location_id,age_group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
161,7,144.414119,141.557344,145.150825,143.170003,142.252572,143.046162,143.043219,147.584485,143.504062,144.839369,...,141.496118,142.479357,142.900909,140.68638,143.085189,144.359127,142.752477,143.047014,143.170331,142.887473
161,8,145.51931,142.765863,145.116936,144.808136,147.506752,141.992046,142.584145,145.361724,146.345165,143.43925,...,145.509728,144.052294,145.569535,145.244469,143.900673,141.421127,143.35802,143.996489,142.57622,146.605979
161,9,142.20924,144.935832,140.522295,141.898514,140.761631,140.618769,141.514814,141.979202,139.51829,142.129912,...,141.093082,142.405715,139.90229,140.186409,141.068784,143.335679,141.243691,140.599777,142.148803,143.324369
161,10,140.81609,139.762565,141.157402,139.750991,140.932748,140.087369,142.222926,140.120962,141.237806,140.623411,...,140.39152,141.562939,139.188997,140.43934,139.669666,139.610988,139.62617,140.55709,140.091003,139.838226
161,11,139.838347,140.038937,139.140928,140.215377,139.830406,140.095205,138.567371,140.506307,141.829433,139.924257,...,140.023174,140.478885,139.647963,140.504109,139.492966,139.609484,141.526073,140.466932,140.819622,139.995558


In [13]:
# Published risk exposure draws for rei_id 95.
# This is the target that the recreated exposure is validated against.
exposure_prepped = (
    get_draws(
        'rei_id',
        95,
        source='exposure',
        location_id=locs,
        age_group_id=ages,
        sex_id=2,
        year_id=2019,
        gbd_round_id=6,
        decomp_step='step4',
    )
    .set_index(['location_id', 'age_group_id'])
    .sort_index()
)
# keep only the draw columns
exposure = exposure_prepped[[c for c in exposure_prepped.columns if 'draw' in c]]
exposure.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,draw_0,draw_1,draw_10,draw_100,draw_101,draw_102,draw_103,draw_104,draw_105,draw_106,...,draw_990,draw_991,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999
location_id,age_group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
161,7,143.982226,141.205611,144.179062,142.553917,142.095402,142.807047,142.661232,146.665152,142.831306,144.699168,...,141.382981,141.879356,142.53796,140.210165,142.704937,143.978017,142.644507,142.748659,142.622869,142.470406
161,8,138.631006,139.865357,140.683744,141.181216,142.957374,137.697072,139.02502,140.76773,142.489192,141.191115,...,143.38225,140.689612,142.636452,143.312671,138.380608,138.85126,139.416112,139.429611,138.499544,143.599224
161,9,136.316704,138.298638,136.68551,137.952227,134.75311,137.259368,137.791383,137.102845,135.257369,136.019157,...,138.571655,137.76316,135.479329,135.782263,136.658099,140.122772,138.080186,135.739872,136.898334,139.86442
161,10,137.739625,136.844456,135.347847,135.9491,137.438926,135.660773,137.486898,136.927426,137.984428,135.047767,...,135.722447,136.358267,136.416512,137.215901,135.491371,135.631481,136.39507,137.502184,135.104909,136.031311
161,11,136.89363,135.879841,136.143282,138.580494,137.808347,137.624536,135.687205,137.015912,137.498561,136.49226,...,133.270147,137.742625,132.85256,137.560807,135.16427,136.472958,139.311471,137.755533,136.841027,135.321542


In [14]:
# Population mean hemoglobin draws (modelable entity 10487, best model).
# These feed the exposure calculation below.
pop_hb = get_draws(
    'modelable_entity_id',
    10487,
    source='epi',
    location_id=locs,
    age_group_id=ages,
    sex_id=2,
    year_id=2019,
    gbd_round_id=6,
    decomp_step='step4',
    status='best',
)
hb_prepped = pop_hb.set_index(['location_id', 'age_group_id']).sort_index()
# keep only the draw columns
hb = hb_prepped[[c for c in hb_prepped.columns if 'draw' in c]]
hb.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,draw_0,draw_1,draw_10,draw_100,draw_101,draw_102,draw_103,draw_104,draw_105,draw_106,...,draw_990,draw_991,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999
location_id,age_group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
161,7,131.576669,130.006851,126.753823,128.018065,133.273591,131.115555,130.538169,127.788556,127.888119,134.269665,...,132.629892,127.563386,131.561974,128.47908,130.254982,130.120093,132.948881,131.693836,129.286983,129.131233
161,8,118.922773,126.221582,123.35438,126.262134,124.932018,122.562494,124.733401,123.76613,126.417171,128.350657,...,128.983925,125.823625,128.28924,130.669737,119.89713,127.467974,124.626547,123.436502,123.562551,128.478574
161,9,119.445416,119.744885,123.663548,124.283216,118.579743,124.817976,123.868556,121.465009,121.658608,119.139191,...,127.191962,121.699027,120.712246,121.153322,121.37069,125.672017,124.850055,120.869506,120.472913,125.833589
161,10,125.283728,125.009121,119.397039,122.817539,123.870315,121.480207,122.020435,124.570332,124.744182,119.359498,...,121.166555,119.789437,125.065134,124.389494,121.221193,121.139373,124.053072,125.456108,120.019112,122.695076
161,11,125.122521,121.147369,124.487696,128.703954,127.822311,125.583439,122.830292,124.109611,122.808416,123.786324,...,114.836797,125.800045,115.827505,125.078032,120.85868,124.07956,127.209902,126.128345,122.630509,120.700463


In [15]:
# calculate shiftprev as they did in the "make_shiftprev_draws" function
# located here: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/nutrition_iron_exposure.py
    # note: normal_df confirmed to be tmrel in separate notebook

# all three frames are indexed by (location_id, age_group_id) with draw columns,
# so the arithmetic is element-wise per location / age / draw
shift_prev = prev_out_meids_prepped * (tmrel - hb)
pd.DataFrame(shift_prev.mean(axis=1))


# this is the summed prevalence of all out_meids (all anemia except dietary iron deficiency anemia)
# multiplied by the gap between the TMREL (95th percentile hgb) and the population mean hemoglobin

# what could this mean???

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,0.549923
161,8,3.633803
161,9,3.996935
161,10,3.860055
161,11,3.813474
161,12,2.375671
161,13,1.972167
161,14,1.768849
161,15,1.990915


In [16]:
# Recreate the exposure the way the "subtract from counterfactual" function does:
# take the TMREL and remove the prevalence-weighted hemoglobin shift.
# source: https://stash.ihme.washington.edu/projects/MNCH/repos/anemia_causal_attribution/browse/iron_deficiency/nutrition_iron_exposure.py

exp_test = tmrel.sub(shift_prev)
exp_test.mean(axis=1).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,142.878136
161,8,140.431257
161,9,137.504466
161,10,136.592757
161,11,136.27777
161,12,137.632038
161,13,137.827145
161,14,139.453708
161,15,139.159076


In [17]:
# mean across draws of the published exposure, for side-by-side comparison
exposure.mean(axis=1).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,142.878136
161,8,140.431257
161,9,137.504466
161,10,136.592757
161,11,136.27777
161,12,137.632038
161,13,137.827145
161,14,139.453708
161,15,139.159076


In [18]:
# draw-level difference between the published exposure and the recreated one
error = exposure - exp_test

# Make the validation explicit instead of relying on eyeballing the summaries:
# every location / age / draw must align, and differences must be float noise only.
assert error.notna().all().all(), "exposure and exp_test do not align on every location/age/draw"
assert error.abs().max().max() < 1e-8, "recreated exposure differs from the published exposure"

In [19]:
# most negative draw-level error
error.min().min()

-5.684341886080802e-14

In [20]:
# most positive draw-level error
error.max().max()

5.684341886080802e-14

In [21]:
# mean error across draws, per location / age group
error.mean(axis=1).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,3.694822e-16
161,8,-2.273737e-16
161,9,7.389644e-16
161,10,5.400125e-16
161,11,3.126388e-16
161,12,-1.421085e-16
161,13,8.526513e-17
161,14,-7.105427e-16
161,15,-7.389644e-16


## Conclusion: this method successfully recreates the iron deficiency risk factor exposure :) 

NOTES:

- This method is different than expected based on the risk factor methods appendix in the following ways:

    - The TMREL appears to be the 95th percentile of the global population hemoglobin mean distribution (covered in section #1)
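    - The exposure value does not appear to be specific to iron-responsive causes. It appears to relate to dietary iron deficiency only: the prevalence used is the summed prevalence of everything BUT dietary iron deficiency (`out_meids`, which includes both iron-responsive and iron non-responsive causes).
    
Based on the methods appendix, I would have expected

$$\text{exposure} = \text{TMREL} - p_{\text{iron-responsive causes}} \times \left(\text{TMREL} - \overline{\text{Hb}}_{\text{population}}\right)$$

whereas the calculation that reproduces the published exposure is

$$\text{exposure} = \text{TMREL} - p_{\text{all causes except dietary iron deficiency}} \times \left(\text{TMREL} - \overline{\text{Hb}}_{\text{population}}\right)$$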

# 3. PAF recreation: iron deficiency risk factor PAF for maternal disorders

In [24]:
# Relative risk draws for rei_id 95, pulled at location_id 1 and restricted to cause_id 367.
# (these should be defined in terms of unit of iron def. exposure away from TMREL)
    # per iron deficiency risk factor modeler
rr_prepped = (
    get_draws(
        'rei_id',
        95,
        source='rr',
        location_id=1,
        age_group_id=ages,
        sex_id=2,
        year_id=2019,
        gbd_round_id=6,
        decomp_step='step4',
    )
    .loc[lambda draws: draws.cause_id == 367]
    .set_index(['age_group_id'])
    .sort_index()
)
# keep only the draw columns
rr = rr_prepped[[c for c in rr_prepped.columns if 'draw' in c]]
rr.head()

# note, this dataframe is age- and cause-specific, but values are
# identical across all ages and causes
# (all causes are most detailed within maternal disorders)

Unnamed: 0_level_0,draw_0,draw_1,draw_10,draw_100,draw_101,draw_102,draw_103,draw_104,draw_105,draw_106,...,draw_990,draw_991,draw_992,draw_993,draw_994,draw_995,draw_996,draw_997,draw_998,draw_999
age_group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,1.212408,1.350076,1.334512,1.358255,1.197099,1.351358,1.230164,1.1808,1.221371,1.355961,...,1.189681,1.315712,1.28604,1.257695,1.216908,1.177559,1.315347,1.221176,1.276526,1.243538
8,1.212408,1.350076,1.334512,1.358255,1.197099,1.351358,1.230164,1.1808,1.221371,1.355961,...,1.189681,1.315712,1.28604,1.257695,1.216908,1.177559,1.315347,1.221176,1.276526,1.243538
9,1.212408,1.350076,1.334512,1.358255,1.197099,1.351358,1.230164,1.1808,1.221371,1.355961,...,1.189681,1.315712,1.28604,1.257695,1.216908,1.177559,1.315347,1.221176,1.276526,1.243538
10,1.212408,1.350076,1.334512,1.358255,1.197099,1.351358,1.230164,1.1808,1.221371,1.355961,...,1.189681,1.315712,1.28604,1.257695,1.216908,1.177559,1.315347,1.221176,1.276526,1.243538
11,1.212408,1.350076,1.334512,1.358255,1.197099,1.351358,1.230164,1.1808,1.221371,1.355961,...,1.189681,1.315712,1.28604,1.257695,1.216908,1.177559,1.315347,1.221176,1.276526,1.243538


In [25]:
# adjust RRs to location-specific difference between exposure and TMREL
# exp(log(rr) * gap) is rr raised to the power of the gap (tmrel - exposure).
# rr is indexed by age_group_id only while the gap is indexed by
# (location_id, age_group_id), so pandas aligns the two on age_group_id.
# NOTE(review): this assumes the RR is expressed per 1 unit of the exposure scale;
# the unit is not established anywhere in this notebook -- confirm with the modeler.
rr_adj = np.exp(np.log(rr) * (tmrel - exposure))
pd.DataFrame(rr_adj.mean(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,1.133594
161,8,2.418258
161,9,2.674185
161,10,2.54305
161,11,2.538962
161,12,1.757137
161,13,1.580633
161,14,1.510475
161,15,1.599625


In [30]:
# pull paf from burdenator -- what we are trying to validate to
# (cause_id 367 attributable to rei_id 95)
# NOTE(review): this pull uses decomp_step='step5' while the exposure, TMREL and RR
# pulls in this notebook use 'step4' -- confirm that this is intended.
# NOTE(review): no sex_id or age_group_id filter is passed; the indexing below assumes
# a single row per location / age group comes back -- confirm.
pafs = get_draws(['cause_id','rei_id'],
                            [367,95],
                            source='burdenator',
                            location_id=locs,
                            measure_id=[3],
                            metric_id=2,
                            year_id=[2019],
                            gbd_round_id=6,
                            decomp_step='step5',
                            status='best')

pafs = pafs.set_index(['location_id','age_group_id'])
# keep only the draw columns
pafs = pafs.drop(columns=[c for c in pafs.columns if 'draw' not in c])
pd.DataFrame(pafs.mean(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,0.116808
161,8,0.162377
161,9,0.172552
161,10,0.171709
161,11,0.178884
161,12,0.159704
161,13,0.156818
161,14,0.156102
161,15,0.160445


In [31]:
# back calculate exposure prevalence using the PAF equation

# PAF = ((p*RR) + (1-p) - 1)/(p*RR + 1 - p)
# rearranges to p = PAF / ((1 - PAF) * (RR - 1)), which is what is coded below:
# the denominator PAF*RR - PAF - RR + 1 factors as (PAF - 1) * (RR - 1)
# NOTE: nothing constrains the result to [0, 1]

p_back_calc = -pafs / (pafs * rr_adj - pafs - rr_adj + 1)
pd.DataFrame(p_back_calc.mean(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,1.436391
161,8,0.175975
161,9,0.159439
161,10,0.166866
161,11,0.180016
161,12,0.303911
161,13,0.379749
161,14,0.432714
161,15,0.390465


In [39]:
# calculate PAF from exposure prevalence and relative risks for iron deficiency
# NOTE: p was back-solved from the burdenator PAF with this same equation and the
# same rr_adj, so paf_test reproduces pafs by construction. This confirms the
# algebra only; it is not an independent recreation of the PAF.

p = p_back_calc
rrs = rr_adj

paf_test = (p * rrs + (1 - p) - 1) / (p * rrs + (1 - p))
pd.DataFrame(paf_test.mean(axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,0.116808
161,8,0.162377
161,9,0.172552
161,10,0.171709
161,11,0.178884
161,12,0.159704
161,13,0.156818
161,14,0.156102
161,15,0.160445


In [33]:
# calculate error in our calculated PAF relative to burdenator PAF
# (paf_test was derived from pafs itself, so only floating-point noise is expected here)
paf_error = pafs - paf_test

In [34]:
# most negative draw-level PAF error
paf_error.min().min()

-5.828670879282072e-16

In [35]:
# most positive draw-level PAF error
paf_error.max().max()

1.27675647831893e-15

In [36]:
# mean PAF error across draws, per location / age group
paf_error.mean(axis=1).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
location_id,age_group_id,Unnamed: 2_level_1
161,7,-1.595946e-18
161,8,2.775558e-18
161,9,3.295975e-19
161,10,1.589007e-18
161,11,-1.373901e-18
161,12,-1.3808400000000001e-18
161,13,-1.0408339999999999e-20
161,14,-2.775558e-20
161,15,-7.02563e-19


## Conclusion:

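- Need to look further into recreation of the PAF: the check above is circular (the prevalence was back-solved from the burdenator PAF and then substituted back into the same equation), so the near-zero error confirms only the algebra, not the method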
- Were the RRs adjusted correctly (does this need to be done among pregnant women specifically?)
- Why is the back-solved prevalence of iron def. > 1 for age_group_id==7?
- What does the back-solved prevalence represent? It is NOT the same prevalence that is used in calculating the exposure

NOTE: I have not yet found code specific to this. Maybe this is done by central comp?