In [1]:
from db_queries import get_population, get_ids
from db_queries import get_location_metadata as get_locs

In [2]:
import pandas as pd, numpy as np

# LSFF: choose population coverage data by hand for tier 4 countries

## vehicles: Wheat flour, maize flour, oil

## countries: Cameroon, Cote d'Ivoire, DRC, Indonesia, Mozambique

In [3]:
nutrients = ['iron','zinc','folic acid','vitamin a']

In [4]:
data_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/extraction_sheet_lsff_03_24_2021.2.csv'
assm_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/extraction_sheet_lsff_assumed_coverage_03_24_2021.csv'

df = pd.read_csv(data_path)

In [5]:
df.loc[df.nutrient=="folic acid, folate, b9",'nutrient']= 'folic acid'

In [6]:
assum = pd.read_csv(assm_path)

In [7]:
#these don't apply this time
assum.location_name.unique()

array(['Ethiopia', 'Myanmar', 'India'], dtype=object)

In [8]:
assum.loc[assum.nutrient=="folic acid, folate, b9",'nutrient']= 'folic acid'

In [9]:
df['estimation_status'] = 'na'

In [10]:
df['data_choice_notes'] = ""

In [11]:
mult_estimates_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_data_estimated_03_24_2021.csv'

mult_estimates = pd.read_csv(mult_estimates_path)

In [12]:
#reformat
mult_estimates.loc[(mult_estimates.B_estimate!=1.0),'B'] = np.nan
mult_estimates.loc[(mult_estimates.C_estimate!=1.0),'C'] = np.nan

mult_estimates = pd.melt(mult_estimates,
                         id_vars = ['location_name','vehicle','nutrient','standard'],
                         value_vars = ['B','C'], var_name = 'value_description', value_name = 'value_mean').dropna()

mult_estimates.loc[(mult_estimates.value_description=="B"),'nutrient'] = 'NA'

mult_estimates = mult_estimates.drop_duplicates()

mult_estimates.value_description = mult_estimates.value_description.map({
    'B':'percent of population eating industrially produced vehicle',
    'C':'percent of population eating fortified vehicle'
})

In [13]:
mult_estimates['estimation_status'] = 'multiplicative'

In [14]:
## load legal combos
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_vehicle_country_pairs.pickle', 'rb') as handle:
    vehicle_country_map = pickle.load(handle)

In [15]:
location_names = ['Cameroon',"Côte d'Ivoire",'Democratic Republic of the Congo','Indonesia','Mozambique']
vehicles = ['maize flour','wheat flour','oil']
nutrients = ['folic acid','iron','zinc','vitamin a']

In [16]:
# these are the vehicles per country we need
target_high_level = pd.DataFrame([(loc,v) for loc in location_names for v in country_vehicle_map[loc]],
            columns=['location_name','vehicle']).sort_values(['location_name','vehicle'])

target_high_level = target_high_level[target_high_level.vehicle.isin(vehicles)].set_index(['location_name','vehicle'])

target_high_level

location_name,vehicle
Cameroon,maize flour
Cameroon,oil
Cameroon,wheat flour
Côte d'Ivoire,maize flour
Côte d'Ivoire,oil
Côte d'Ivoire,wheat flour
Democratic Republic of the Congo,maize flour
Democratic Republic of the Congo,oil
Democratic Republic of the Congo,wheat flour
Indonesia,oil


In [17]:
target_a = pd.DataFrame([(loc,vehicle,nutrient,'percent of population eating fortified vehicle') for loc in location_names
                       for vehicle in country_vehicle_map[loc]
                      for nutrient in vehicle_nutrient_map[vehicle]],
            columns=['location_name','vehicle','nutrient','value_description']).sort_values(['location_name','vehicle','nutrient'])
target_a = target_a[(target_a.nutrient.isin(nutrients))]

target_b = pd.DataFrame([(loc,vehicle,'na',val) for loc in location_names
                       for vehicle in country_vehicle_map[loc]
                      for val in ['percent of population eating industrially produced vehicle',
       'percent of population eating vehicle']],
            columns=['location_name','vehicle','nutrient','value_description'])

sortvars = ['location_name','vehicle','value_description','nutrient']
target = target_a.append(target_b)
target = target[(target.vehicle.isin(vehicles)) & (target.nutrient.isin(nutrients))].sort_values(sortvars).set_index(sortvars)

In [18]:
target

location_name,vehicle,value_description,nutrient
Cameroon,maize flour,percent of population eating fortified vehicle,folic acid
Cameroon,maize flour,percent of population eating fortified vehicle,iron
Cameroon,maize flour,percent of population eating fortified vehicle,vitamin a
Cameroon,maize flour,percent of population eating fortified vehicle,zinc
Cameroon,oil,percent of population eating fortified vehicle,vitamin a
Cameroon,wheat flour,percent of population eating fortified vehicle,folic acid
Cameroon,wheat flour,percent of population eating fortified vehicle,iron
Cameroon,wheat flour,percent of population eating fortified vehicle,vitamin a
Cameroon,wheat flour,percent of population eating fortified vehicle,zinc
Côte d'Ivoire,maize flour,percent of population eating fortified vehicle,folic acid


In [19]:
target = target.reset_index()
rcols = target.columns.tolist()

In [20]:
check_cols = ['location_id','location_name','urbanicity','subnational_name','vehicle','value_description','nutrient','value_mean','value_025_percentile',
       'value_975_percentile','sub_population','source_year','notes','source_citation','source_link','inclusion_justification','included','data_choice_notes']

def filter_data(country, vehicle, val):    
    output = df.loc[(df.location_name==country)
           & (df.vehicle==vehicle)
           & (df.value_description==val)
           & (df.value_mean.notna()),check_cols]
    
    return output

In [21]:
def check_one_country(country):
    vehicles = ['oil', 'wheat flour', 'salt', 'maize flour', 'rice', 'bouillon']
    values_gold = ['percent of population eating fortified vehicle',
               'percent of population eating industrially produced vehicle',
               'percent of population eating vehicle']
    return pd.concat([filter_data(country, vehicle, val) for vehicle in vehicles for val in values_gold])

In [22]:
usecols = ['location_id','location_name','subnational_name','vehicle','value_description','nutrient','value_mean', 'value_025_percentile',
       'value_975_percentile']
subset_data = {}

In [23]:
for i in location_names:
    subset_data[i] = pd.DataFrame()

In [24]:
location_names

['Cameroon',
 "Côte d'Ivoire",
 'Democratic Republic of the Congo',
 'Indonesia',
 'Mozambique']

## Cameroon

In [25]:
cameroon = check_one_country("Cameroon")

#nathaniel marked these for inclusion
cameroon.data_choice_notes = cameroon.inclusion_justification

cameroon[(cameroon.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
oil,percent of population eating vehicle,
wheat flour,percent of population eating vehicle,


In [26]:
mult_estimates[(mult_estimates.location_name=="Cameroon") & (mult_estimates.vehicle.isin(vehicles))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
52,Cameroon,wheat flour,,Mandatory,percent of population eating industrially prod...,75.570909,multiplicative
55,Cameroon,wheat flour,,Unknown,percent of population eating industrially prod...,75.570909,multiplicative
401,Cameroon,oil,vitamin a,Mandatory,percent of population eating fortified vehicle,30.4969,multiplicative
402,Cameroon,oil,vitamin d,Unknown,percent of population eating fortified vehicle,0.0,multiplicative
403,Cameroon,wheat flour,folic acid,Mandatory,percent of population eating fortified vehicle,78.215891,multiplicative
404,Cameroon,wheat flour,iron,Mandatory,percent of population eating fortified vehicle,59.724634,multiplicative
406,Cameroon,wheat flour,vitamin b1,Unknown,percent of population eating fortified vehicle,0.0,multiplicative
407,Cameroon,wheat flour,vitamin b12,Mandatory,percent of population eating fortified vehicle,78.215891,multiplicative
408,Cameroon,wheat flour,zinc,Mandatory,percent of population eating fortified vehicle,78.215891,multiplicative


In [27]:
subset_data['Cameroon'] = subset_data['Cameroon'].append(
    cameroon[(cameroon.vehicle=="oil") & (cameroon.included)]
)

cameroon[(cameroon.vehicle=="oil") & (cameroon.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
610,,Cameroon,mixed/both,na,oil,percent of population eating vehicle,na,53.1,na,na,under-5,2009,Under-5 defined as age 6-59 months. Proportion...,"Engle-Stone R, Ndjebayi AO, Nankap M, Brown KH...",https://watermark.silverchair.com/555.pdf?toke...,Keeping 53.1% for mixed/both urbanicity from E...,True,Keeping 53.1% for mixed/both urbanicity from E...


In [28]:
subset_data['Cameroon'] = subset_data['Cameroon'].append(
    mult_estimates[(mult_estimates.location_name=="Cameroon") 
               & (mult_estimates.vehicle=="oil") 
               & (mult_estimates.nutrient.isin(nutrients))]
)

mult_estimates[(mult_estimates.location_name=="Cameroon") 
               & (mult_estimates.vehicle=="oil") 
               & (mult_estimates.nutrient.isin(nutrients))]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
401,Cameroon,oil,vitamin a,Mandatory,percent of population eating fortified vehicle,30.4969,multiplicative


In [29]:
## need pct eating ind prod oil from regression

In [30]:
subset_data['Cameroon'] = subset_data['Cameroon'].append(
    cameroon[(cameroon.vehicle=="wheat flour") & (cameroon.included)]
)

cameroon[(cameroon.vehicle=="wheat flour") & (cameroon.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
575,,Cameroon,mixed/both,na,wheat flour,percent of population eating vehicle,na,92,90,94,women of reproductive age,2011,WRA age defined as age 16-45. Numbers from Fo...,"Hess SY, Brown KH, Sablah M, Engle-Stone R, Aa...",https://journals.sagepub.com/doi/pdf/10.1177/1...,Keeping value from Hess et al. (92%) for mixed...,True,Keeping value from Hess et al. (92%) for mixed...
577,,Cameroon,mixed/both,na,wheat flour,percent of population eating vehicle,na,94,92,96,under-5,2011,Under-5 defined as age 12-59 months. Numbers f...,"Hess SY, Brown KH, Sablah M, Engle-Stone R, Aa...",https://journals.sagepub.com/doi/pdf/10.1177/1...,Keeping value from Hess et al. (94%) from 2011...,True,Keeping value from Hess et al. (94%) from 2011...


In [31]:
subset_data['Cameroon'] = subset_data['Cameroon'].append(
    mult_estimates[(mult_estimates.location_name=="Cameroon") 
               & (mult_estimates.vehicle=="wheat flour") 
               & (mult_estimates.nutrient.isin(nutrients))]
)


mult_estimates[(mult_estimates.location_name=="Cameroon") 
               & (mult_estimates.vehicle=="wheat flour") 
               & (mult_estimates.nutrient.isin(nutrients))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
403,Cameroon,wheat flour,folic acid,Mandatory,percent of population eating fortified vehicle,78.215891,multiplicative
404,Cameroon,wheat flour,iron,Mandatory,percent of population eating fortified vehicle,59.724634,multiplicative
408,Cameroon,wheat flour,zinc,Mandatory,percent of population eating fortified vehicle,78.215891,multiplicative


In [32]:
## need pct eating ind prod wheat flour from regression

In [33]:
mult_estimates[(mult_estimates.location_name=="Cameroon") 
               & (mult_estimates.vehicle=="maize flour")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


In [34]:
## need pct eating ind prod wheat flour from regression

In [35]:
## need all maize flour vals from regression

## Cote d'Ivoire

In [36]:
civoire = check_one_country("Côte d'Ivoire")

#nathaniel marked these for inclusion
civoire.data_choice_notes = civoire.inclusion_justification

civoire[(civoire.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
oil,percent of population eating fortified vehicle,
oil,percent of population eating industrially produced vehicle,
oil,percent of population eating vehicle,
wheat flour,percent of population eating industrially produced vehicle,
wheat flour,percent of population eating vehicle,


In [37]:
subset_data["Côte d'Ivoire"] = subset_data["Côte d'Ivoire"].append(
    civoire[(civoire.vehicle=="oil") & (civoire.included)]
)

civoire[(civoire.vehicle=="oil") & (civoire.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
119,,Côte d'Ivoire,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,75.0,na,na,total population,2013,TABLE 2. Estimated coverage of vitamin A–forti...,"Sablah M, Klopp J, Steinberg D, Touaoro Z, Lai...",https://pubmed.ncbi.nlm.nih.gov/23444712/,Keeping Sablah et al. estimate of 75% from 201...,True,Keeping Sablah et al. estimate of 75% from 201...
825,,Côte d'Ivoire,urban,Abidjan,oil,percent of population eating industrially prod...,na,98.0,97,99,households with children <2 years old,2014,"These are 95% CIs, rather than 97.5! Note the ...","Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,Only one source,True,Only one source
824,,Côte d'Ivoire,urban,Abidjan,oil,percent of population eating vehicle,na,98.5,97.5,99.3,households with children <2 years old,2014,"These are 95% CIs, rather than 97.5! Note the ...","Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,Only one source,True,Only one source


In [38]:
subset_data["Côte d'Ivoire"] = subset_data["Côte d'Ivoire"].append(
    civoire[(civoire.vehicle=="wheat flour") & (civoire.included)]
)

civoire[(civoire.vehicle=="wheat flour") & (civoire.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
823,,Côte d'Ivoire,urban,Abidjan,wheat flour,percent of population eating industrially prod...,na,10.2,7.5,13.1,households with children <2 years old,2014,"These are 95% CIs, rather than 97.5! Note the ...","Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,Only one source,True,Only one source
822,,Côte d'Ivoire,urban,Abidjan,wheat flour,percent of population eating vehicle,na,54.7,50.1,59.6,households with children <2 years old,2014,"These are 95% CIs, rather than 97.5! Note the ...","Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,Only one source,True,Only one source


In [39]:
subset_data["Côte d'Ivoire"] = subset_data["Côte d'Ivoire"].append(
    mult_estimates[(mult_estimates.location_name=="Côte d'Ivoire") 
               & (mult_estimates.vehicle=="wheat flour") 
               & (mult_estimates.nutrient.isin(nutrients))]
)

mult_estimates[(mult_estimates.location_name=="Côte d'Ivoire") 
               & (mult_estimates.vehicle=="wheat flour") 
               & (mult_estimates.nutrient.isin(nutrients))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
429,Côte d'Ivoire,wheat flour,folic acid,Mandatory,percent of population eating fortified vehicle,39.10698,multiplicative
430,Côte d'Ivoire,wheat flour,iron,Mandatory,percent of population eating fortified vehicle,24.49124,multiplicative
434,Côte d'Ivoire,wheat flour,zinc,Unknown,percent of population eating fortified vehicle,0.0,multiplicative


In [40]:
mult_estimates[(mult_estimates.location_name=="Côte d'Ivoire") 
               & (mult_estimates.vehicle=="maize flour")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


In [41]:
# need all maize values from regression

## Democratic Republic of the Congo

In [42]:
drc = check_one_country("Democratic Republic of the Congo")

#nathaniel marked these for inclusion
drc.data_choice_notes = drc.inclusion_justification

drc[(drc.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
maize flour,percent of population eating vehicle,
oil,percent of population eating vehicle,
wheat flour,percent of population eating vehicle,


In [43]:
subset_data["Democratic Republic of the Congo"] = subset_data["Democratic Republic of the Congo"].append(
    drc[(drc.vehicle=="maize flour") & (drc.included)]
)

drc[(drc.vehicle=="maize flour") & (drc.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
208,,Democratic Republic of the Congo,rural,Turumbu,maize flour,percent of population eating vehicle,na,20.7,na,na,women,2009,"Described as ""village"" setting as opposed to ""...","Termote, C., Meyi, M. B., Djailo, B. D. A., Hu...",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates,True,One of five subnational estimates
209,,Democratic Republic of the Congo,rural,Kisangani,maize flour,percent of population eating vehicle,na,27.2,,,women,2009,"Described as ""village"" setting as opposed to ""...","Termote, C., Meyi, M. B., Djailo, B. D. A., Hu...",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates,True,One of five subnational estimates
210,,Democratic Republic of the Congo,urban,Kisangani,maize flour,percent of population eating vehicle,na,42.7,,,women,2009,"Asked about maize, not specifically maize flou...","Termote, C., Meyi, M. B., Djailo, B. D. A., Hu...",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates,True,One of five subnational estimates
211,,Democratic Republic of the Congo,mixed/both,Kongo Central,maize flour,percent of population eating vehicle,na,30.0,,,women of reproductive age,2014,women of reproductive age here = 15-49. Dietar...,"Moumin, N. A., Angel, M. D., Karakochuk, C. D....",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates,True,One of five subnational estimates
212,,Democratic Republic of the Congo,mixed/both,South Kivu,maize flour,percent of population eating vehicle,na,64.0,na,na,women of reproductive age,2014,women of reproductive age here = 15-49. Dietar...,"Moumin, N. A., Angel, M. D., Karakochuk, C. D....",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates,True,One of five subnational estimates
213,,Democratic Republic of the Congo,mixed/both,Kongo Central,maize flour,percent of population eating vehicle,na,31.0,,,under-5,2014,under-5 = 0.5-5 years. Dietary intake was asse...,"Moumin, N. A., Angel, M. D., Karakochuk, C. D....",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of three subnational estimates,True,One of three subnational estimates
214,,Democratic Republic of the Congo,mixed/both,South Kivu,maize flour,percent of population eating vehicle,na,53.0,na,na,under-5,2014,under-5 = 0.5-5 years. Dietary intake was asse...,"Moumin, N. A., Angel, M. D., Karakochuk, C. D....",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of three subnational estimates,True,One of three subnational estimates
215,,Democratic Republic of the Congo,rural,Bwamanda,maize flour,percent of population eating vehicle,na,93.44,90.19,95.7,under-5,1991,"The age group here is ""preschool"" the survey m...","Kismul, Hallgeir et al. “Diet and kwashiorkor:...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,One of three subnational estimates,True,One of three subnational estimates


In [44]:
subset_data["Democratic Republic of the Congo"] = subset_data["Democratic Republic of the Congo"].append(
    drc[(drc.vehicle=="oil") & (drc.included)]
)


drc.loc[(drc.vehicle=="oil") & (drc.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
205,,Democratic Republic of the Congo,rural,Turumbu,oil,percent of population eating vehicle,na,97.3,na,na,women,2009,"Described as ""village"" setting as opposed to ""...","Termote, C., Meyi, M. B., Djailo, B. D. A., Hu...",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates for either u...,True,One of five subnational estimates for either u...
206,,Democratic Republic of the Congo,rural,Kisangani,oil,percent of population eating vehicle,na,95.6,na,na,women,2009,"Described as ""village"" setting as opposed to ""...","Termote, C., Meyi, M. B., Djailo, B. D. A., Hu...",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates for either u...,True,One of five subnational estimates for either u...
207,,Democratic Republic of the Congo,urban,Kisangani,oil,percent of population eating vehicle,na,98.1,na,na,women,2009,"Asked about ""oil and fat"" = {vegetal oil + pal...","Termote, C., Meyi, M. B., Djailo, B. D. A., Hu...",https://www-ncbi-nlm-nih-gov.offcampus.lib.was...,One of five subnational estimates for either u...,True,One of five subnational estimates for either u...
217,,Democratic Republic of the Congo,rural,Bwamanda,oil,percent of population eating vehicle,na,88.41,81.1,92.12,under-5,1991,"The age group here is ""preschool"" the survey m...","Kismul, Hallgeir et al. “Diet and kwashiorkor:...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,One of five subnational estimates for either u...,True,One of five subnational estimates for either u...
218,,Democratic Republic of the Congo,rural,DRC Forest Concessions,oil,percent of population eating vehicle,na,100.0,na,na,total population,2012,This is specifally palm oil. The survey design...,"Donn, P., Nchuaji, T. E., Ngondi, J., Tieguhon...",https://www.researchgate.net/publication/28472...,One of five subnational estimates for either u...,True,One of five subnational estimates for either u...


In [45]:
subset_data["Democratic Republic of the Congo"] = subset_data["Democratic Republic of the Congo"].append(
    drc[(drc.vehicle=="wheat flour") & (drc.included)]
)

drc.loc[(drc.vehicle=="wheat flour") & (drc.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
216,,Democratic Republic of the Congo,rural,Bwamanda,wheat flour,percent of population eating vehicle,na,0.58,0,1.4,under-5,1991,"The age group here is ""preschool"" the survey m...","Kismul, Hallgeir et al. “Diet and kwashiorkor:...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Only one source. Note: This data is from a 198...,True,Only one source. Note: This data is from a 198...


In [46]:
subset_data["Democratic Republic of the Congo"] = subset_data["Democratic Republic of the Congo"].append(
    mult_estimates[(mult_estimates.location_name=="Democratic Republic of the Congo") 
               & (mult_estimates.nutrient.isin(nutrients + ['NA']))].drop(columns='standard').drop_duplicates()
)

mult_estimates[(mult_estimates.location_name=="Democratic Republic of the Congo") 
               & (mult_estimates.nutrient.isin(nutrients + ['NA']))]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
88,Democratic Republic of the Congo,maize flour,,Unknown,percent of population eating industrially prod...,2.26275,multiplicative
96,Democratic Republic of the Congo,wheat flour,,Voluntary,percent of population eating industrially prod...,0.58,multiplicative
99,Democratic Republic of the Congo,wheat flour,,Unknown,percent of population eating industrially prod...,0.58,multiplicative
439,Democratic Republic of the Congo,maize flour,folic acid,Unknown,percent of population eating fortified vehicle,0.0,multiplicative
440,Democratic Republic of the Congo,maize flour,iron,Unknown,percent of population eating fortified vehicle,0.0,multiplicative
444,Democratic Republic of the Congo,maize flour,zinc,Unknown,percent of population eating fortified vehicle,0.0,multiplicative
445,Democratic Republic of the Congo,oil,vitamin a,Unknown,percent of population eating fortified vehicle,0.0,multiplicative
447,Democratic Republic of the Congo,wheat flour,folic acid,Voluntary,percent of population eating fortified vehicle,0.07975,multiplicative
448,Democratic Republic of the Congo,wheat flour,iron,Voluntary,percent of population eating fortified vehicle,0.07975,multiplicative
452,Democratic Republic of the Congo,wheat flour,zinc,Unknown,percent of population eating fortified vehicle,0.07975,multiplicative


In [47]:
# need ind prod oil
# vitamin a fort wheat
# vitamin a fort maize

## Indonesia

In [48]:
indonesia = check_one_country("Indonesia")

#nathaniel marked these for inclusion
indonesia.data_choice_notes = indonesia.inclusion_justification

indonesia[(indonesia.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
wheat flour,percent of population eating fortified vehicle,


In [49]:
subset_data['Indonesia'] = subset_data['Indonesia'].append(
    indonesia[(indonesia.vehicle=="wheat flour") & (indonesia.nutrient.isin(nutrients)) & (indonesia.included)]
)

indonesia[(indonesia.vehicle=="wheat flour") & (indonesia.nutrient.isin(nutrients)) & (indonesia.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
451,,Indonesia,mixed/both,na,wheat flour,percent of population eating fortified vehicle,folic acid,73.1,na,na,total population,2020,I have calculated the percentage from the numb...,,https://www.thejakartapost.com/academia/2020/0...,Only one source,True,Only one source
452,,Indonesia,mixed/both,na,wheat flour,percent of population eating fortified vehicle,iron,73.1,na,na,total population,2020,I have calculated the percentage from the numb...,,https://www.thejakartapost.com/academia/2020/0...,Only one source,True,Only one source
453,,Indonesia,mixed/both,na,wheat flour,percent of population eating fortified vehicle,zinc,73.1,na,na,total population,2020,I have calculated the percentage from the numb...,,https://www.thejakartapost.com/academia/2020/0...,Only one source,True,Only one source


In [50]:
mult_estimates[(mult_estimates.location_name=="Indonesia")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status


In [51]:
# need all oil vars
# need pct eating wheat and pct eating ind prod wheat

## Mozambique

In [52]:
moz = check_one_country("Mozambique")

#nathaniel marked these for inclusion
moz.data_choice_notes = moz.inclusion_justification

moz[(moz.vehicle.isin(vehicles))].groupby(['vehicle','value_description']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id
vehicle,value_description,Unnamed: 2_level_1
maize flour,percent of population eating vehicle,
oil,percent of population eating vehicle,
wheat flour,percent of population eating vehicle,


In [53]:
subset_data['Mozambique'] = subset_data['Mozambique'].append(
    moz[(moz.vehicle=="maize flour") & (moz.included)]
) 

moz[(moz.vehicle=="maize flour") & (moz.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
582,,Mozambique,mixed/both,na,maize flour,percent of population eating vehicle,na,79.9,na,na,total population,2013,Population coverage of food. Ministry of Healt...,GFDX,https://fortificationdata.org/country-fortific...,Only one source.,True,Only one source.


In [54]:
subset_data['Mozambique'] = subset_data['Mozambique'].append(
    moz[(moz.vehicle=="oil") & (moz.included)]
) 

moz[(moz.vehicle=="oil") & (moz.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
793,,Mozambique,mixed/both,na,oil,percent of population eating vehicle,na,95.2,na,na,total population,2013,Population coverage of food. Ministry of Healt...,GFDX,https://fortificationdata.org/country-fortific...,Keeping GFDx value (95.2%) for total populatio...,True,Keeping GFDx value (95.2%) for total populatio...


In [59]:
subset_data['Mozambique'] = subset_data['Mozambique'].append(
    moz[(moz.vehicle=="wheat flour") & (moz.included)]
) 

moz[(moz.vehicle=="wheat flour") & (moz.included)]

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,source_year,notes,source_citation,source_link,inclusion_justification,included,data_choice_notes
797,,Mozambique,mixed/both,na,wheat flour,percent of population eating vehicle,na,16.3,na,na,total population,2013,Population coverage of food. Ministry of Healt...,GFDX,https://fortificationdata.org/country-fortific...,"Keeping GFDx estimate (16.3%), as it is more r...",True,"Keeping GFDx estimate (16.3%), as it is more r..."


In [62]:
mult_estimates[(mult_estimates.location_name=="Mozambique")]

Unnamed: 0,location_name,vehicle,nutrient,standard,value_description,value_mean,estimation_status
206,Mozambique,maize flour,,Mandatory,percent of population eating industrially prod...,23.97,multiplicative
214,Mozambique,wheat flour,,Mandatory,percent of population eating industrially prod...,66.766667,multiplicative
557,Mozambique,maize flour,folic acid,Mandatory,percent of population eating fortified vehicle,36.3545,multiplicative
558,Mozambique,maize flour,iron,Mandatory,percent of population eating fortified vehicle,36.3545,multiplicative
559,Mozambique,maize flour,vitamin a,Mandatory,percent of population eating fortified vehicle,36.3545,multiplicative
560,Mozambique,maize flour,vitamin b1,Mandatory,percent of population eating fortified vehicle,36.3545,multiplicative
561,Mozambique,maize flour,vitamin b12,Mandatory,percent of population eating fortified vehicle,36.3545,multiplicative
562,Mozambique,maize flour,zinc,Mandatory,percent of population eating fortified vehicle,36.3545,multiplicative
563,Mozambique,oil,vitamin a,Mandatory,percent of population eating fortified vehicle,37.5774,multiplicative
565,Mozambique,wheat flour,folic acid,Mandatory,percent of population eating fortified vehicle,40.06,multiplicative


## pop-weight subnationals

In [None]:
df[df.location_name.isin(location_names)].subnational_name.unique()

In [None]:
# 'Kitui', 'Vihiga' are relevant; 'Ougadougou', 'Gnagna' are from burkina faso, where we dont have to do any subnat weighting

In [None]:
loc_metadata = get_locs(location_set_id=35, gbd_round_id=6, decomp_step="step4")

#combine subnational estimates by population-weighting

subnats = loc_metadata[loc_metadata.location_name.isin(['Kitui', 'Vihiga', 'Ougadougou', 'Gnagna'])][['location_id','location_name','parent_id']]

In [None]:
subnats = subnats.rename(columns = {
    'location_id':'subnational_id',
    'location_name':'subnational_name'
})

subnat_pop = get_population(age_group_id=22, 
                     location_id=list(subnats.subnational_id),
                     year_id=2017,
                     sex_id=3,
                     gbd_round_id=6, 
                     decomp_step='step5')

subnats = subnats.merge(subnat_pop, left_on = 'subnational_id', right_on = 'location_id')[['subnational_id','subnational_name','population','parent_id']]

subnats['pop_denom'] = subnats.groupby('parent_id').transform('sum').population

subnats['subnat_pop_weight'] = subnats.population / subnats.pop_denom

In [None]:
subnats

In [None]:
tmp = subset_data['Kenya']

In [None]:
scols = rcols + ['source_link','source_citation','estimation_status','sub_population']

In [None]:
tmp.loc[tmp.source_citation.isna(),'source_citation'] = 'na'
tmp.loc[tmp.source_link.isna(),'source_link'] = 'na'
tmp.loc[tmp.estimation_status.isna(),'estimation_status'] = 'na'

tmp.loc[tmp.value_025_percentile=='na','value_025_percentile'] = np.nan
tmp.loc[tmp.value_975_percentile=='na','value_975_percentile'] = np.nan
tmp.loc[tmp.estimation_status.isna(),'estimation_status'] = 'na'

tmp['is_dupl'] = tmp.duplicated(subset = rcols, keep = False)
tmp = tmp.merge(subnats[['subnational_name','subnat_pop_weight']], how = 'left')

In [None]:
tmp.subnat_pop_weight = tmp.subnat_pop_weight.astype(float)
tmp.value_mean = tmp.value_mean.astype(float)
tmp.value_025_percentile = tmp.value_025_percentile.astype(float)
tmp.value_975_percentile = tmp.value_975_percentile.astype(float)

In [None]:
tmp.loc[tmp.is_dupl,'value_mean'] = tmp.loc[tmp.is_dupl].value_mean * tmp.loc[tmp.is_dupl].subnat_pop_weight
tmp.loc[tmp.is_dupl,'value_025_percentile'] = tmp.value_025_percentile * tmp.subnat_pop_weight
tmp.loc[tmp.is_dupl,'value_975_percentile'] = tmp.value_975_percentile * tmp.subnat_pop_weight

In [None]:
tmp.loc[~tmp.is_dupl,'subnational_name'] = 'na'

In [None]:
scols

In [None]:
tmp_a = tmp[~tmp.is_dupl]

In [None]:
tmp_b = tmp[tmp.is_dupl]

In [None]:
tmp_b.data_choice_notes.unique()

In [None]:
rcols

In [None]:
tmp_b.value_mean = tmp_b.groupby(scols).transform('sum').value_mean
tmp_b.value_025_percentile = tmp_b.groupby(scols).transform('sum').value_025_percentile
tmp_b.value_975_percentile = tmp_b.groupby(scols).transform('sum').value_975_percentile

In [None]:
tmp_b = tmp_b[['location_name','vehicle','value_description','nutrient','estimation_status','source_link','source_citation','source_year','sub_population','value_mean','value_025_percentile','value_975_percentile','data_choice_notes',]].drop_duplicates()

In [None]:
tmp_b.value_mean = tmp_b.value_mean.astype(float)
tmp_b.value_025_percentile = tmp_b.value_025_percentile.astype(float)
tmp_b.value_975_percentile = tmp_b.value_975_percentile.astype(float)

In [None]:
tmp_b = tmp_b.groupby(['location_name','vehicle','value_description','nutrient','source_link','source_citation','source_year',]).mean().reset_index()

In [None]:
tmp_b['data_choice_notes'] = "Excluded central bureau of stats total pop number from 2005 in lieu of Ferguson rural u5 number from 2015. Note 72% of kenyan population is rural according to the world bank: https://data.worldbank.org/indicator/SP.RUR.TOTL.ZS?locations=KE'. Population weighted rural subnationals."

In [None]:
tmp = tmp_b.append(tmp_a)

In [None]:
tmp

In [None]:
subset_data['Kenya'] = tmp

## Check for missingness

In [None]:
all_data = pd.concat(list(subset_data.values()))

In [None]:
all_data.loc[(all_data.location_name=="Kenya") & (all_data.vehicle=="oil"),["value_description",'value_mean','source_citation',"data_choice_notes"]]

In [None]:
all_data.data_choice_notes.unique()

In [None]:
all_data.loc[~(all_data.nutrient.isin(['vitamin a','iron','zinc','folic acid'])),'nutrient'] = 'na'

In [None]:
all_data[rcols + ['value_mean','value_025_percentile','value_975_percentile','sub_population']]

In [None]:
check = target.merge(all_data[rcols + ['value_mean']], on = rcols, how = 'left')

In [None]:
check[check.value_mean.isna()]

In [None]:
assert(len(check[check.value_mean.isna()])==0), "there are target loc/vehcile/val/nutrient combos youre missing"

In [None]:
rcols

In [None]:
output = all_data[rcols + ['value_mean','value_025_percentile', 'value_975_percentile','sub_population','estimation_status','source_citation','source_link','data_choice_notes']].sort_values(rcols).set_index(rcols)

In [None]:
##impute all missing CIs

# clean value_mean
output.loc[output.value_mean=='na','value_mean'] = np.nan
output.value_mean = output.value_mean.astype(float)

# clean 2.5th %ile
output.loc[output.value_025_percentile=='na','value_025_percentile'] = np.nan
output.value_025_percentile = output.value_025_percentile.astype(float)

# clean 97.5th %ile
output.loc[output.value_975_percentile=='na','value_975_percentile'] = np.nan
output.value_975_percentile = output.value_975_percentile.astype(float)

In [None]:
output.loc[(output.value_mean > output.value_975_percentile),'value_975_percentile'] = np.nan

In [None]:
output.loc[(output.value_mean < output.value_025_percentile)]

In [None]:
output.loc[(output.value_mean == output.value_025_percentile)]

In [None]:
output['scale_over_mean'] = (output.value_975_percentile - output.value_025_percentile) / output.value_mean

In [None]:
output = output.reset_index()

In [None]:
r = output.loc[(output.scale_over_mean!=np.inf),['vehicle','scale_over_mean']]
# .groupby('vehicle').mean().dropna().rename(columns={'scale_over_mean':'r'}).reset_index()

In [None]:
r_mean = r.scale_over_mean.mean()

In [None]:
r.loc[r.vehicle=="maize flour",'scale_over_mean'] = r_mean

In [None]:
r = r.groupby('vehicle').mean().dropna().rename(columns={'scale_over_mean':'r'}).reset_index()

In [None]:
# add uncertainty
output = output.merge(r, on = 'vehicle', how = 'outer')
output['lower'] = np.clip(output.value_mean - (output.r * output.value_mean)/2, 0, 100)
output['upper'] = np.clip(output.value_mean + (output.r * output.value_mean)/2, 0, 100)

In [None]:
output

In [None]:
output.loc[(output.value_mean < output.value_025_percentile) | (output.value_025_percentile.isna()),'CI_source'] = "modeling"
output.loc[(output.value_mean > output.value_975_percentile) | (output.value_975_percentile.isna()),'CI_source'] = "modeling"

output.loc[output.CI_source.isna(),'CI_source'] = 'extraction'

output.loc[(output.value_mean < output.value_025_percentile) | (output.value_025_percentile.isna()),'value_025_percentile'] = output.loc[(output.value_mean < output.value_025_percentile) | (output.value_025_percentile.isna())].lower
output.loc[(output.value_mean > output.value_975_percentile) | (output.value_975_percentile.isna()),'value_975_percentile'] = output.loc[(output.value_mean > output.value_975_percentile) | (output.value_975_percentile.isna())].upper

output = output.drop(columns=['r','lower','upper','scale_over_mean']).set_index(rcols)

output.loc[output.estimation_status.isna(),'estimation_status'] = 'na'

In [None]:
output

In [None]:
output = output.reset_index()

In [None]:
output.loc[(output.location_name=="Vietnam"),'location_name'] = "Viet Nam"

In [None]:
sort_helper = {
    'percent of population eating fortified vehicle': 'C',
 'percent of population eating industrially produced vehicle': 'B',
 'percent of population eating vehicle': 'A'}

In [None]:
output['sort_helper'] = output.value_description.map(sort_helper)

In [None]:
output = output.sort_values(['location_name','vehicle','sort_helper','value_description','nutrient']).drop(columns='sort_helper')

In [None]:
save_path_tmp = '/ihme/homes/beatrixh/repos/scratch/tier3_coverage_data_03_23_2021.csv'
output.to_csv(save_path_tmp, index = False)

In [None]:
break

In [None]:
save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/population_coverage_data_tier3_locs_3_23_2021.csv'
output.to_csv(save_path, index = False)

In [None]:
test[(tset.)]

In [None]:
test = output.reset_index().copy()

In [None]:
test[(test.location_name=="Myanmar")]

In [None]:
test[(test.vehicle=="oil")].groupby(['location_name','vehicle','value_description','nutrient']).mean()

In [None]:
test[(test.vehicle=="wheat flour")].groupby(['location_name','vehicle','value_description']).mean()

In [None]:
test[(test.vehicle=="maize flour")].groupby(['location_name','vehicle','value_description']).mean()

In [None]:
test.columns

In [None]:
pd.pivot_table(values='value_mean', index = [''])

In [None]:
## TODO 
# check whats missing
# pop weight as necessary
# rerun the regressions --- make sure to fix burkina faso
# incorporate and format 