In [2]:
from db_queries import get_population, get_ids
from db_queries import get_location_metadata as get_locs

In [3]:
import pandas as pd, numpy as np

# LSFF: choose population coverage data by hand for tier 5 countries

## vehicles: Wheat flour, maize flour, oil

## countries: 	Angola, China, Ghana, Niger, Egypt, Sudan, Madagascar

In [4]:
## load legal combos
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_vehicle_country_pairs.pickle', 'rb') as handle:
    vehicle_country_map = pickle.load(handle)

In [5]:
nutrients = ['iron','zinc','folic acid','vitamin a']

In [6]:
data_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/extraction_sheet_lsff_03_24_2021.3.csv'
assm_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/extraction_sheet_lsff_assumed_coverage_03_24_2021.csv'

df = pd.read_csv(data_path)

In [7]:
df.loc[df.nutrient=="folic acid, folate, b9",'nutrient']= 'folic acid'

In [8]:
assum = pd.read_csv(assm_path)

In [9]:
#these don't apply this time
assum.location_name.unique()

array(['Ethiopia', 'Myanmar', 'India'], dtype=object)

In [10]:
assum.loc[assum.nutrient=="folic acid, folate, b9",'nutrient']= 'folic acid'

In [11]:
df['estimation_status'] = 'na'

In [12]:
df['data_choice_notes'] = ""

In [13]:
mult_estimates_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_data_estimated_03_26_2021.4.csv'

mult_estimates = pd.read_csv(mult_estimates_path)

In [14]:
#reformat
mult_estimates.loc[(mult_estimates.B_estimate!=1.0),'B'] = np.nan
mult_estimates.loc[(mult_estimates.C_estimate!=1.0),'C'] = np.nan

mult_estimates = pd.melt(mult_estimates,
                         id_vars = ['location_name','vehicle','nutrient','standard'],
                         value_vars = ['B','C'], var_name = 'value_description', value_name = 'value_mean').dropna()

mult_estimates.loc[(mult_estimates.value_description=="B"),'nutrient'] = 'NA'

mult_estimates = mult_estimates.drop_duplicates()

mult_estimates.value_description = mult_estimates.value_description.map({
    'B':'percent of population eating industrially produced vehicle',
    'C':'percent of population eating fortified vehicle'
})

In [15]:
mult_estimates['estimation_status'] = 'multiplicative'

In [16]:
def prep_reg_estimates(path):
    draws = [f'draw_{i}' for i in range(500)]

    df = pd.read_csv(path)
    df = df.groupby(['location_name','vehicle']).mean().reset_index()
    df['value_mean'] = df[draws].mean(axis=1)
    df['value_025_percentile'] = df[draws].quantile(.025, axis=1)
    df['value_975_percentile'] = df[draws].quantile(.975, axis=1)

    return df[['location_name','vehicle','value_mean']]

In [19]:
output_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/'

reg_fort_oil_path = output_dir + 'pct_eating_fortified_oil_regression_estimates_3_22_2021.csv'
reg_fort_wheat_path = output_dir + 'pct_eating_fortified_wheat_regression_estimates_3_22_2021.csv'
reg_fort_maize_path = output_dir + 'pct_eating_fortified_maize_regression_estimates_3_22_2021.csv'

est_fortified = pd.concat([prep_reg_estimates(path) for path in [reg_fort_oil_path,reg_fort_wheat_path,reg_fort_maize_path]])
est_fortified['value_description'] = "percent of population eating fortified vehicle"
est_fortified['estimation_status'] = "regression"

In [20]:
vn_pairs = pd.DataFrame([(v,n) for v in ['oil','wheat flour','maize flour'] for n in vehicle_nutrient_map[v]],
            columns=['vehicle','nutrient'])

In [21]:
est_fortified = est_fortified.merge(vn_pairs, on = 'vehicle', how = 'outer')[['location_name','vehicle','value_description','nutrient','value_mean','estimation_status']]

In [22]:
ind_prod_oil_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortifiable_oil_regression_estimates_3_26_2021.csv'
ind_prod_wheat_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortifiable_wheat_regression_estimates_3_26_2021.csv'
ind_prod_maize_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortifiable_maize_regression_estimates_3_26_2021.csv'

est_fortifiable = pd.concat([prep_reg_estimates(path) for path in [ind_prod_oil_path,ind_prod_wheat_path,ind_prod_maize_path]])
est_fortifiable['value_description'] = "percent of population eating industrially produced vehicle"
est_fortifiable['estimation_status'] = "regression"
est_fortifiable['nutrient'] = "na"

In [23]:
eating_oil_path = output_dir + 'pct_eating_oil_regression_estimates_3_23_2021.csv'
eating_wheat_path = output_dir + 'pct_eating_wheat_regression_estimates_3_22_2021.csv'
eating_maize_path = output_dir + 'pct_eating_maize_regression_estimates_3_22_2021.csv'

est_eating = pd.concat([prep_reg_estimates(path) for path in [eating_oil_path,eating_wheat_path,eating_maize_path]])
est_eating['value_description'] = "percent of population eating vehicle"
est_eating['estimation_status'] = "regression"
est_eating['nutrient'] = "na"

In [24]:
reg_estimates = pd.concat([est_eating, est_fortifiable, est_fortified])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [27]:
location_names = ['Angola', 'China', 'Ghana', 'Niger', 'Egypt', 'Sudan', 'Madagascar']
vehicles = ['maize flour','wheat flour','oil']
nutrients = ['folic acid','iron','zinc','vitamin a']

In [28]:
# these are the vehicles per country we need
target_high_level = pd.DataFrame([(loc,v) for loc in location_names for v in country_vehicle_map[loc]],
            columns=['location_name','vehicle']).sort_values(['location_name','vehicle'])

target_high_level = target_high_level[target_high_level.vehicle.isin(vehicles)].set_index(['location_name','vehicle'])

target_high_level

location_name,vehicle
Angola,maize flour
Angola,oil
Angola,wheat flour
China,oil
China,wheat flour
Egypt,maize flour
Egypt,oil
Egypt,wheat flour
Ghana,maize flour
Ghana,oil


In [29]:
target_a = pd.DataFrame([(loc,vehicle,nutrient,'percent of population eating fortified vehicle') for loc in location_names
                       for vehicle in country_vehicle_map[loc]
                      for nutrient in vehicle_nutrient_map[vehicle]],
            columns=['location_name','vehicle','nutrient','value_description']).sort_values(['location_name','vehicle','nutrient'])
target_a = target_a[(target_a.nutrient.isin(nutrients))]

target_b = pd.DataFrame([(loc,vehicle,'na',val) for loc in location_names
                       for vehicle in country_vehicle_map[loc]
                      for val in ['percent of population eating industrially produced vehicle',
       'percent of population eating vehicle']],
            columns=['location_name','vehicle','nutrient','value_description'])

In [30]:
sortvars = ['location_name','vehicle','value_description','nutrient']
target = target_a.append(target_b)
target = target[(target.vehicle.isin(vehicles)) & (target.nutrient.isin(nutrients + ['na']))].sort_values(sortvars).set_index(sortvars)

In [31]:
target = target.reset_index()
rcols = target.columns.tolist()

In [32]:
check_cols = ['location_id','location_name','urbanicity','subnational_name','vehicle','value_description','nutrient','value_mean','value_025_percentile',
       'value_975_percentile','sub_population','source_year','notes','source_citation','source_link','inclusion_justification','included','data_choice_notes']

def filter_data(country, vehicle, val):    
    output = df.loc[(df.location_name==country)
           & (df.vehicle==vehicle)
           & (df.value_description==val)
           & (df.value_mean.notna()),check_cols]
    
    return output

In [33]:
def check_one_country(country):
    vehicles = ['oil', 'wheat flour', 'salt', 'maize flour', 'rice', 'bouillon']
    values_gold = ['percent of population eating fortified vehicle',
               'percent of population eating industrially produced vehicle',
               'percent of population eating vehicle']
    return pd.concat([filter_data(country, vehicle, val) for vehicle in vehicles for val in values_gold])

In [34]:
usecols = ['location_id','location_name','subnational_name','vehicle','value_description','nutrient','value_mean', 'value_025_percentile',
       'value_975_percentile']
subset_data = {}

In [35]:
for i in location_names:
    subset_data[i] = pd.DataFrame()

In [36]:
location_names

['Angola', 'China', 'Ghana', 'Niger', 'Egypt', 'Sudan', 'Madagascar']

## Angola

In [37]:
angola = check_one_country("Angola")

angola[(angola.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1


In [39]:
mult_estimates[(mult_estimates.location_name=="Angola")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


In [40]:
# wow, yikes

## China

In [41]:
china = check_one_country("China")

china[(china.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1


In [42]:
mult_estimates[(mult_estimates.location_name=="China")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


In [43]:
# oh gosh...

## Ghana

In [44]:
ghana = check_one_country("Ghana")

ghana[(ghana.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
oil,percent of population eating fortified vehicle,
oil,percent of population eating vehicle,


In [45]:
ghana.loc[(ghana.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
890,,Ghana,urban,na,oil,percent of population eating fortified vehicle,vitamin a,34.9,,,total population,2014,,Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,
891,,Ghana,rural,na,oil,percent of population eating fortified vehicle,vitamin a,19.4,,,total population,2014,,Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,
892,,Ghana,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,32.9,,,total population,2014,,Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,


In [47]:
ghana.loc[(ghana.urbanicity=="mixed/both") & (ghana.value_description=="percent of population eating fortified vehicle"),
         "data_choice_notes"] = "Only one source; discarded urban and rural specific for nationally representative"

subset_data['Ghana'] = subset_data['Ghana'].append(
    ghana.loc[(ghana.urbanicity=="mixed/both") & (ghana.value_description=="percent of population eating fortified vehicle")]
)

ghana.loc[(ghana.urbanicity=="mixed/both") & (ghana.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
892,,Ghana,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,32.9,,,total population,2014,,Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,Only one source; discarded urban and rural spe...


In [55]:
ghana.loc[(ghana.value_description=="percent of population eating vehicle") & (ghana.vehicle=="oil")]


Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
889,,Ghana,mixed/both,na,oil,percent of population eating vehicle,na,98.0,,,total population,2004,"The study cites this fact to this source, whic...","Nyumuah RO, Hoang TC, Amoaful EF, Agble R, Mey...",https://journals.sagepub.com/doi/10.1177/15648...,,,
893,,Ghana,urban,na,oil,percent of population eating vehicle,na,94.7,,,total population,2014,"DHS survey. question = ""what type of oil does ...",Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,
894,,Ghana,rural,na,oil,percent of population eating vehicle,na,96.8,,,total population,2014,"DHS survey. question = ""what type of oil does ...",Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,
895,,Ghana,mixed/both,na,oil,percent of population eating vehicle,na,95.6,,,total population,2014,"DHS survey. question = ""what type of oil does ...",Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,


In [56]:
ghana.loc[(ghana.urbanicity=="mixed/both") & (ghana.source_year=="2014") & (ghana.value_description=="percent of population eating vehicle") & (ghana.vehicle=="oil"),
         "data_choice_notes"] = "Discarded 2004 Nyumuah number (98%) in lieu of 2014 DHS number (95.6%)"

subset_data['Ghana'] = subset_data['Ghana'].append(
    ghana.loc[(ghana.urbanicity=="mixed/both") & (ghana.source_year=="2014") & (ghana.value_description=="percent of population eating vehicle") & (ghana.vehicle=="oil")]
)

ghana.loc[(ghana.urbanicity=="mixed/both") & (ghana.source_year=="2014") & (ghana.value_description=="percent of population eating vehicle") & (ghana.vehicle=="oil")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
895,,Ghana,mixed/both,na,oil,percent of population eating vehicle,na,95.6,,,total population,2014,"DHS survey. question = ""what type of oil does ...",Ghana and Orc Macro. “Ghana Demographic and He...,https://dhsprogram.com/pubs/pdf/FR307/FR307.pdf,,,Discarded 2004 Nyumuah number (98%) in lieu of...


In [58]:
mult_estimates[(mult_estimates.location_name=="Ghana")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
475,Ghana,oil,vitamin d,,percent of population eating fortified vehicle,0.0,multiplicative


In [59]:
subset_data['Ghana'] = subset_data['Ghana'].append(
    mult_estimates[(mult_estimates.location_name=="Ghana")]
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


## Niger

In [61]:
niger = check_one_country("Niger")

niger[(niger.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
oil,percent of population eating fortified vehicle,
oil,percent of population eating vehicle,
wheat flour,percent of population eating fortified vehicle,


In [62]:
niger[(niger.vehicle=="oil") & (niger.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
286,,Niger,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,55,na,na,total population,2010,"""estimated coverage""","Corner, S. S. Nutrition and Business.",http://www.unscn.org/files/Publications/SCN_Ne...,,,
616,,Niger,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,55,na,na,total population,2010,TABLE 2. Estimated coverage of vitamin A–forti...,"Sablah M, Klopp J, Steinberg D, Touaoro Z, Lai...",https://pubmed.ncbi.nlm.nih.gov/23444712/,,,
617,,Niger,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,75,na,na,total population,2013,TABLE 2. Estimated coverage of vitamin A–forti...,"Sablah M, Klopp J, Steinberg D, Touaoro Z, Lai...",https://pubmed.ncbi.nlm.nih.gov/23444712/,,,


In [64]:
niger.loc[(niger.source_year=="2013") & (niger.vehicle=="oil") & (niger.value_description=="percent of population eating fortified vehicle"),
         "data_choice_notes"] = "From two sources, we had an estimate of 55% in 2010. The latter source additionally projected 75% by 2013, which we are using here."

subset_data['Niger'] = subset_data['Niger'].append(
    niger.loc[(niger.source_year=="2013") & (niger.vehicle=="oil") & (niger.value_description=="percent of population eating fortified vehicle")]
)

niger.loc[(niger.source_year=="2013") & (niger.vehicle=="oil") & (niger.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
617,,Niger,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,75,na,na,total population,2013,TABLE 2. Estimated coverage of vitamin A–forti...,"Sablah M, Klopp J, Steinberg D, Touaoro Z, Lai...",https://pubmed.ncbi.nlm.nih.gov/23444712/,,,"From two sources, we had an estimate of 55% in..."


In [65]:
niger[(niger.vehicle=="oil") & (niger.value_description=="percent of population eating vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
289,,Niger,mixed/both,na,oil,percent of population eating vehicle,na,85,na,na,women of reproductive age,2001,proportion of women who consumed vehicle in th...,"Hess, S. Y., Brown, K. H., Sablah, M., Engle-S...",https://journals.sagepub.com/doi/pdf/10.1177/1...,,,
298,,Niger,urban,na,oil,percent of population eating vehicle,na,98,na,na,women of reproductive age,2001,proportion of women who consumed vehicle in th...,"Hess, S. Y., Brown, K. H., Sablah, M., Engle-S...",https://journals.sagepub.com/doi/pdf/10.1177/1...,,,
299,,Niger,rural,na,oil,percent of population eating vehicle,na,76,na,na,women of reproductive age,2001,proportion of women who consumed vehicle in th...,"Hess, S. Y., Brown, K. H., Sablah, M., Engle-S...",https://journals.sagepub.com/doi/pdf/10.1177/1...,,,


In [67]:
niger.loc[(niger.urbanicity=="mixed/both") & (niger.vehicle=="oil") & (niger.value_description=="percent of population eating vehicle"),
         "data_choice_notes"] = "Only one source. Discarded urban and rural specific estimates in lieu of the total estimate"

subset_data['Niger'] = subset_data['Niger'].append(
    niger.loc[(niger.urbanicity=="mixed/both") & (niger.vehicle=="oil") & (niger.value_description=="percent of population eating vehicle")]
)

niger.loc[(niger.urbanicity=="mixed/both") & (niger.vehicle=="oil") & (niger.value_description=="percent of population eating vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
289,,Niger,mixed/both,na,oil,percent of population eating vehicle,na,85,na,na,women of reproductive age,2001,proportion of women who consumed vehicle in th...,"Hess, S. Y., Brown, K. H., Sablah, M., Engle-S...",https://journals.sagepub.com/doi/pdf/10.1177/1...,,,Only one source. Discarded urban and rural spe...


In [68]:
niger[(niger.vehicle=="wheat flour") & (niger.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
287,,Niger,mixed/both,na,wheat flour,percent of population eating fortified vehicle,iron,30,na,na,total population,2010,"""estimated coverage"". obtained fortificants fr...","Corner, S. S. Nutrition and Business.",http://www.unscn.org/files/Publications/SCN_Ne...,,,
288,,Niger,mixed/both,na,wheat flour,percent of population eating fortified vehicle,folic acid,30,na,na,total population,2010,"""estimated coverage"". obtained fortificants fr...","Corner, S. S. Nutrition and Business.",http://www.unscn.org/files/Publications/SCN_Ne...,,,


In [69]:
niger.loc[(niger.vehicle=="wheat flour") & (niger.value_description=="percent of population eating fortified vehicle"),
         "data_choice_notes"] = "Only one source."

subset_data['Niger'] = subset_data['Niger'].append(
    niger[(niger.vehicle=="wheat flour") & (niger.value_description=="percent of population eating fortified vehicle")]
)

niger[(niger.vehicle=="wheat flour") & (niger.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
287,,Niger,mixed/both,na,wheat flour,percent of population eating fortified vehicle,iron,30,na,na,total population,2010,"""estimated coverage"". obtained fortificants fr...","Corner, S. S. Nutrition and Business.",http://www.unscn.org/files/Publications/SCN_Ne...,,,Only one source.
288,,Niger,mixed/both,na,wheat flour,percent of population eating fortified vehicle,folic acid,30,na,na,total population,2010,"""estimated coverage"". obtained fortificants fr...","Corner, S. S. Nutrition and Business.",http://www.unscn.org/files/Publications/SCN_Ne...,,,Only one source.


In [72]:
mult_estimates[(mult_estimates.location_name=="Niger") 
               & (mult_estimates.vehicle.isin(vehicles)) 
               & (mult_estimates.nutrient.isin(nutrients))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


## Egypt

In [74]:
egypt = check_one_country("Egypt")

egypt[(egypt.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1


In [83]:
mult_estimates[(mult_estimates.location_name=="Egypt") 
               & (mult_estimates.vehicle.isin(vehicles)) 
               & (mult_estimates.nutrient.isin(nutrients))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


## Sudan

In [78]:
sudan = check_one_country("Sudan")

sudan[(sudan.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
oil,percent of population eating fortified vehicle,
oil,percent of population eating industrially produced vehicle,


In [80]:
sudan.loc[(sudan.value_description=="percent of population eating fortified vehicle"),
         "data_choice_notes"] = "Only one source."

subset_data['Sudan'] = subset_data['Sudan'].append(
    sudan.loc[(sudan.value_description=="percent of population eating fortified vehicle")]
)

sudan.loc[(sudan.value_description=="percent of population eating fortified vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
319,,Sudan,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,80,na,na,total population,2019,,GFDx,https://fortificationdata.org/country-fortific...,,,Only one source.


In [82]:
sudan.loc[(sudan.value_description=="percent of population eating industrially produced vehicle"),
         "data_choice_notes"] = "Only one source."

subset_data['Sudan'] = subset_data['Sudan'].append(
    sudan.loc[(sudan.value_description=="percent of population eating industrially produced vehicle")]
)

sudan.loc[(sudan.value_description=="percent of population eating industrially produced vehicle")]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
849,,Sudan,mixed/both,na,oil,percent of population eating industrially prod...,na,80,na,na,total population,2019,Proportion of population consuming industriall...,GFDx,https://fortificationdata.org/country-fortific...,,,Only one source.


In [84]:
mult_estimates[(mult_estimates.location_name=="Sudan") 
               & (mult_estimates.vehicle.isin(vehicles)) 
               & (mult_estimates.nutrient.isin(nutrients))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


## Madagascar

In [86]:
mada = check_one_country("Madagascar")

mada[(mada.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1


In [87]:
mult_estimates[(mult_estimates.location_name=="Madagascar") 
               & (mult_estimates.vehicle.isin(vehicles)) 
               & (mult_estimates.nutrient.isin(nutrients))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


## pop-weight subnationals

In [89]:
#these are the subnats we have to weight

checkout = pd.concat(list(subset_data.values()))
checkout.loc[(checkout.subnational_name.notna()) & (checkout.subnational_name!='na'),
         ['location_name','urbanicity','subnational_name','source_link']].drop_duplicates()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,location_name,urbanicity,subnational_name,source_link


## Check for missingness

In [90]:
all_data = pd.concat(list(subset_data.values()))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [91]:
all_data.data_choice_notes.unique()

array(['Only one source; discarded urban and rural specific for nationally representative',
       'Discarded 2004 Nyumuah number (98%) in lieu of 2014 DHS number (95.6%)',
       nan,
       'From two sources, we had an estimate of 55% in 2010. The latter source additionally projected 75% by 2013, which we are using here.',
       'Only one source. Discarded urban and rural specific estimates in lieu of the total estimate',
       'Only one source.'], dtype=object)

In [92]:
all_data.loc[~(all_data.nutrient.isin(['vitamin a','iron','zinc','folic acid'])),'nutrient'] = 'na'

In [93]:
all_data[rcols + ['value_mean','value_025_percentile','value_975_percentile','sub_population']]

Unnamed: 0,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population
892,Ghana,oil,percent of population eating fortified vehicle,vitamin a,32.9,,,total population
895,Ghana,oil,percent of population eating vehicle,na,95.6,,,total population
475,Ghana,oil,percent of population eating fortified vehicle,na,0.0,,,
617,Niger,oil,percent of population eating fortified vehicle,vitamin a,75.0,na,na,total population
289,Niger,oil,percent of population eating vehicle,na,85.0,na,na,women of reproductive age
287,Niger,wheat flour,percent of population eating fortified vehicle,iron,30.0,na,na,total population
288,Niger,wheat flour,percent of population eating fortified vehicle,folic acid,30.0,na,na,total population
319,Sudan,oil,percent of population eating fortified vehicle,vitamin a,80.0,na,na,total population
849,Sudan,oil,percent of population eating industrially prod...,na,80.0,na,na,total population


In [94]:
check = target.merge(all_data[rcols + ['value_mean']], on = rcols, how = 'left')

In [95]:
check

Unnamed: 0,location_name,vehicle,value_description,nutrient,value_mean
0,Angola,maize flour,percent of population eating fortified vehicle,folic acid,
1,Angola,maize flour,percent of population eating fortified vehicle,iron,
2,Angola,maize flour,percent of population eating fortified vehicle,vitamin a,
3,Angola,maize flour,percent of population eating fortified vehicle,zinc,
4,Angola,maize flour,percent of population eating industrially prod...,na,
...,...,...,...,...,...
94,Sudan,wheat flour,percent of population eating fortified vehicle,iron,
95,Sudan,wheat flour,percent of population eating fortified vehicle,vitamin a,
96,Sudan,wheat flour,percent of population eating fortified vehicle,zinc,
97,Sudan,wheat flour,percent of population eating industrially prod...,na,


In [96]:
need_reg = check.loc[check.value_mean.isna(),['value_description','vehicle','location_name','nutrient']]

In [97]:
need_reg = need_reg.merge(reg_estimates, on = ['value_description','vehicle','location_name','nutrient'], how = 'left')

In [98]:
need_reg

Unnamed: 0,value_description,vehicle,location_name,nutrient,estimation_status,value_mean
0,percent of population eating fortified vehicle,maize flour,Angola,folic acid,regression,22.485675
1,percent of population eating fortified vehicle,maize flour,Angola,iron,regression,22.485675
2,percent of population eating fortified vehicle,maize flour,Angola,vitamin a,regression,22.485675
3,percent of population eating fortified vehicle,maize flour,Angola,zinc,regression,22.485675
4,percent of population eating industrially prod...,maize flour,Angola,na,regression,5.550861
...,...,...,...,...,...,...
86,percent of population eating fortified vehicle,wheat flour,Sudan,iron,regression,1.760469
87,percent of population eating fortified vehicle,wheat flour,Sudan,vitamin a,regression,1.760469
88,percent of population eating fortified vehicle,wheat flour,Sudan,zinc,regression,1.760469
89,percent of population eating industrially prod...,wheat flour,Sudan,na,regression,35.365713


In [99]:
all_data = all_data.append(need_reg)

In [100]:
all_data.value_mean = all_data.value_mean.astype(float)

In [101]:
fort = all_data[(all_data.value_description=="percent of population eating fortified vehicle")]
other = all_data[(all_data.value_description!="percent of population eating fortified vehicle")]

In [102]:
fort = pd.pivot_table(fort, index=['location_name','vehicle'],values = 'value_mean', columns = 'value_description')

In [103]:
other = pd.pivot_table(other, index=['location_name','vehicle'],values = 'value_mean', columns = 'value_description')

In [104]:
validate = fort.reset_index().merge(other.reset_index(), on = ['location_name','vehicle'], how = 'outer').set_index(['location_name','vehicle'])

In [105]:
validate.columns = [i.replace(" ","_") for i in validate.columns]

In [106]:
validate

Unnamed: 0_level_0,Unnamed: 1_level_0,percent_of_population_eating_fortified_vehicle,percent_of_population_eating_industrially_produced_vehicle,percent_of_population_eating_vehicle
location_name,vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Angola,maize flour,22.485675,5.550861,51.172808
Angola,oil,42.655477,35.63543,90.384708
Angola,wheat flour,20.670828,23.271861,27.183242
China,oil,21.773873,72.47479,57.651373
China,wheat flour,3.575856,25.309156,81.858276
Egypt,maize flour,0.015533,11.97932,74.329176
Egypt,oil,23.617015,88.579676,54.973297
Egypt,wheat flour,22.463039,47.003704,88.474975
Ghana,maize flour,6.207181,8.454993,82.285414
Ghana,oil,16.45,89.345779,95.6


In [107]:
validate.loc[(validate.percent_of_population_eating_fortified_vehicle > validate.percent_of_population_eating_industrially_produced_vehicle) | (validate.percent_of_population_eating_industrially_produced_vehicle > validate.percent_of_population_eating_vehicle)]

Unnamed: 0_level_0,Unnamed: 1_level_0,percent_of_population_eating_fortified_vehicle,percent_of_population_eating_industrially_produced_vehicle,percent_of_population_eating_vehicle
location_name,vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Angola,maize flour,22.485675,5.550861,51.172808
Angola,oil,42.655477,35.63543,90.384708
China,oil,21.773873,72.47479,57.651373
Egypt,oil,23.617015,88.579676,54.973297
Madagascar,maize flour,6.362469,4.485707,92.959155
Madagascar,oil,21.445559,85.031868,75.034608
Madagascar,wheat flour,32.625994,25.987012,45.221202
Niger,maize flour,6.220914,3.830473,70.42632
Niger,oil,75.0,54.403014,85.0
Niger,wheat flour,30.0,44.595706,41.464396


In [108]:
check = target.merge(all_data[rcols + ['value_mean']], on = rcols, how = 'left')

In [109]:
assert(len(check[check.value_mean.isna()])==0), "there are target loc/vehicle/val/nutrient combos youre missing"

In [110]:
rcols

['location_name', 'vehicle', 'value_description', 'nutrient']

In [111]:
output = all_data[rcols + ['value_mean','value_025_percentile', 'value_975_percentile','sub_population','estimation_status','source_citation','source_link','data_choice_notes']].sort_values(rcols).set_index(rcols)

In [112]:
##impute all missing CIs

# clean value_mean
output.loc[output.value_mean=='na','value_mean'] = np.nan
output.value_mean = output.value_mean.astype(float)

# clean 2.5th %ile
output.loc[output.value_025_percentile=='na','value_025_percentile'] = np.nan
output.value_025_percentile = output.value_025_percentile.astype(float)

# clean 97.5th %ile
output.loc[output.value_975_percentile=='na','value_975_percentile'] = np.nan
output.value_975_percentile = output.value_975_percentile.astype(float)

  result = method(y)


In [113]:
output.loc[(output.value_mean > output.value_975_percentile),'value_975_percentile'] = np.nan

In [114]:
output.loc[(output.value_mean < output.value_025_percentile)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,value_mean,value_025_percentile,value_975_percentile,sub_population,estimation_status,source_citation,source_link,data_choice_notes
location_name,vehicle,value_description,nutrient,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [115]:
output.loc[(output.value_mean == output.value_025_percentile)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,value_mean,value_025_percentile,value_975_percentile,sub_population,estimation_status,source_citation,source_link,data_choice_notes
location_name,vehicle,value_description,nutrient,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [116]:
output['scale_over_mean'] = (output.value_975_percentile - output.value_025_percentile) / output.value_mean

In [117]:
output = output.reset_index()

In [118]:
r = output.loc[(output.scale_over_mean!=np.inf),['vehicle','scale_over_mean']]
# .groupby('vehicle').mean().dropna().rename(columns={'scale_over_mean':'r'}).reset_index()

In [119]:
r_mean = r.scale_over_mean.mean()

In [120]:
r.loc[r.vehicle=="maize flour",'scale_over_mean'] = r_mean

In [121]:
r = r.groupby('vehicle').mean().dropna().rename(columns={'scale_over_mean':'r'}).reset_index()

In [122]:
# add uncertainty
output = output.merge(r, on = 'vehicle', how = 'outer')

In [123]:
output.loc[(output.estimation_status=="regression"),'r'] = output.r * 2

In [124]:
output['lower'] = np.clip(output.value_mean - (output.r * output.value_mean)/2, 0, 100)
output['upper'] = np.clip(output.value_mean + (output.r * output.value_mean)/2, 0, 100)

In [125]:
output

Unnamed: 0,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,estimation_status,source_citation,source_link,data_choice_notes,scale_over_mean,r,lower,upper
0,Angola,maize flour,percent of population eating fortified vehicle,folic acid,22.485675,,,,regression,,,,,,,
1,Angola,maize flour,percent of population eating fortified vehicle,iron,22.485675,,,,regression,,,,,,,
2,Angola,maize flour,percent of population eating fortified vehicle,vitamin a,22.485675,,,,regression,,,,,,,
3,Angola,maize flour,percent of population eating fortified vehicle,zinc,22.485675,,,,regression,,,,,,,
4,Angola,maize flour,percent of population eating industrially prod...,na,5.550861,,,,regression,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Sudan,wheat flour,percent of population eating fortified vehicle,iron,1.760469,,,,regression,,,,,,,
96,Sudan,wheat flour,percent of population eating fortified vehicle,vitamin a,1.760469,,,,regression,,,,,,,
97,Sudan,wheat flour,percent of population eating fortified vehicle,zinc,1.760469,,,,regression,,,,,,,
98,Sudan,wheat flour,percent of population eating industrially prod...,na,35.365713,,,,regression,,,,,,,


In [126]:
output.loc[(output.value_mean < output.value_025_percentile) | (output.value_025_percentile.isna()),'CI_source'] = "modeling"
output.loc[(output.value_mean > output.value_975_percentile) | (output.value_975_percentile.isna()),'CI_source'] = "modeling"

output.loc[output.CI_source.isna(),'CI_source'] = 'extraction'

output.loc[(output.value_mean < output.value_025_percentile) | (output.value_025_percentile.isna()),'value_025_percentile'] = output.loc[(output.value_mean < output.value_025_percentile) | (output.value_025_percentile.isna())].lower
output.loc[(output.value_mean > output.value_975_percentile) | (output.value_975_percentile.isna()),'value_975_percentile'] = output.loc[(output.value_mean > output.value_975_percentile) | (output.value_975_percentile.isna())].upper

output = output.drop(columns=['r','lower','upper','scale_over_mean']).set_index(rcols)

output.loc[output.estimation_status.isna(),'estimation_status'] = 'na'

  raw_cell, store_history, silent, shell_futures)


In [127]:
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,value_mean,value_025_percentile,value_975_percentile,sub_population,estimation_status,source_citation,source_link,data_choice_notes,CI_source
location_name,vehicle,value_description,nutrient,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Angola,maize flour,percent of population eating fortified vehicle,folic acid,22.485675,,,,regression,,,,modeling
Angola,maize flour,percent of population eating fortified vehicle,iron,22.485675,,,,regression,,,,modeling
Angola,maize flour,percent of population eating fortified vehicle,vitamin a,22.485675,,,,regression,,,,modeling
Angola,maize flour,percent of population eating fortified vehicle,zinc,22.485675,,,,regression,,,,modeling
Angola,maize flour,percent of population eating industrially produced vehicle,na,5.550861,,,,regression,,,,modeling
...,...,...,...,...,...,...,...,...,...,...,...,...
Sudan,wheat flour,percent of population eating fortified vehicle,iron,1.760469,,,,regression,,,,modeling
Sudan,wheat flour,percent of population eating fortified vehicle,vitamin a,1.760469,,,,regression,,,,modeling
Sudan,wheat flour,percent of population eating fortified vehicle,zinc,1.760469,,,,regression,,,,modeling
Sudan,wheat flour,percent of population eating industrially produced vehicle,na,35.365713,,,,regression,,,,modeling


In [128]:
output = output.reset_index()

In [129]:
output.loc[(output.location_name=="Vietnam"),'location_name'] = "Viet Nam"

In [130]:
sort_helper = {
    'percent of population eating fortified vehicle': 'C',
 'percent of population eating industrially produced vehicle': 'B',
 'percent of population eating vehicle': 'A'}

In [131]:
output['sort_helper'] = output.value_description.map(sort_helper)

In [132]:
output = output.sort_values(['location_name','vehicle','sort_helper','value_description','nutrient']).drop(columns='sort_helper')

In [133]:
save_path_tmp = '/ihme/homes/beatrixh/repos/scratch/tier5_coverage_data_03_29_2021.csv'
output.to_csv(save_path_tmp, index = False)

In [None]:
break

In [134]:
save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/tier5_coverage_data_03_29_2021.csv'
output.to_csv(save_path, index = False)