In [1]:
from db_queries import get_population, get_ids
from db_queries import get_location_metadata as get_locs

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

# Prep g/day for Tier 2 locs

## pakistan, bangladesh, tanzania, uganda, south africa

In [3]:
## load targets
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)

In [4]:
gday_path =  '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/gday_extraction_sheet_03_16_2021.csv'
gday = pd.read_csv(gday_path)

assert(len(gday[gday.location_name.isna()])==0), "Some rows missing location name"

In [5]:
output = pd.DataFrame()

In [6]:
location_names = ['Pakistan','Bangladesh','United Republic of Tanzania','Uganda','South Africa']
vehicles = ['wheat flour','maize flour','oil']

In [82]:
# these are the vehicles per country we need
target = pd.DataFrame([(loc,v) for loc in location_names for v in country_vehicle_map[loc]],
            columns=['location_name','vehicle']).sort_values(['location_name','vehicle']).set_index(['location_name','vehicle'])

target

location_name,vehicle
Bangladesh,oil
Bangladesh,wheat flour
Pakistan,oil
Pakistan,wheat flour
South Africa,maize flour
South Africa,oil
South Africa,wheat flour
Uganda,maize flour
Uganda,oil
Uganda,wheat flour


In [8]:
gday.location_id = gday.location_id.fillna(-1).astype(int)

In [9]:
gday.location_name.unique()

array(['Afghanistan', 'Angola', 'Bangladesh', 'Burkina Faso',
       "Côte d'Ivoire", 'Cameroon', 'Chad', 'China ',
       'Democratic Republic of the Congo', 'Egypt', 'Ethiopia', 'Ghana',
       'India', 'Indonesia', 'Kenya', 'Madagascar', 'Mozambique',
       'Myanmar', 'Nepal', 'Niger', 'Nigeria', 'Pakistan', 'Philippines',
       'South Africa', 'Sudan', 'Uganda', 'United Republic of Tanzania',
       'Viet Nam', 'Yemen', 'Zambia'], dtype=object)

In [10]:
# estimate CIs, crude

# clean value_mean
gday.loc[gday.value_mean=='na','value_mean'] = np.nan
gday.value_mean = gday.value_mean.astype(float)

# clean 2.5th %ile
gday.loc[gday.value_025_percentile=='na','value_025_percentile'] = np.nan
gday.value_025_percentile = gday.value_025_percentile.astype(float)

# clean 97.5th %ile
gday.loc[gday.value_975_percentile=='na','value_975_percentile'] = np.nan
gday.value_975_percentile = gday.value_975_percentile.astype(float)

# calc scale_over_mean
gday['scale_over_mean'] = (gday.value_975_percentile - gday.value_025_percentile) / gday.value_mean

#foreach vehicle, average scale_over_mean
r = gday[['vehicle','scale_over_mean']].groupby('vehicle').mean().dropna().rename(columns={'scale_over_mean':'r'})

In [11]:
# these are the vehicles for which we have a scale_over_mean estimate
r

Unnamed: 0_level_0,r
vehicle,Unnamed: 1_level_1
bouillon,0.902591
wheat flour,0.66602


In [12]:
# for vehicles without a scale_over_mean value, we'll assign the average of the others (row-wise)
r = r.reset_index().append(pd.DataFrame([(i,gday.scale_over_mean.mean()) for i in ['maize flour', 'wheat(not specifically flour)','salt', 'rice']],
            columns = ['vehicle','r']))
r

Unnamed: 0,vehicle,r
0,bouillon,0.902591
1,wheat flour,0.66602
0,maize flour,0.806229
1,wheat(not specifically flour),0.806229
2,salt,0.806229
3,rice,0.806229


In [13]:
# add uncertainty
gday = gday.merge(r, on = 'vehicle', how = 'outer')
gday['lower'] = gday.value_mean - (gday.r * gday.value_mean)/2
gday['upper'] = gday.value_mean + (gday.r * gday.value_mean)/2

In [14]:
gday[(gday.location_name.isin(location_names)) & (gday.vehicle.isin(vehicles))].groupby(['location_name','vehicle']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id,subnational_location_id,value_mean,value_025_percentile,value_975_percentile,scale_over_mean,r,lower,upper
location_name,vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bangladesh,wheat flour,161,,54.2624,63.8,92.0,0.362003,0.66602,36.192483,72.332317
Pakistan,wheat flour,165,,289.800741,98.633333,99.933333,0.012644,0.66602,193.294223,386.307258
South Africa,maize flour,-1,,258.53,,,,0.806229,154.312826,362.747174
South Africa,wheat flour,-1,,139.68,,,,0.66602,93.165176,186.194824
Uganda,maize flour,190,,84.833333,,,,0.806229,50.635792,119.030874
Uganda,wheat flour,190,,18.635,,,,0.66602,12.42936,24.84064
United Republic of Tanzania,maize flour,189,,167.335,,,,0.806229,99.879847,234.790153
United Republic of Tanzania,wheat flour,189,,24.98,,,,0.66602,16.661413,33.298587


In [15]:
## dicts for var cleaning

value_d_to_metric = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'median',
 'Mean per capita consumption (g/day)': 'mean',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'mean',
 'Mean micronutrient intake per capita (mg/day)': 'mean',
 'Daily per capita consumption (g)': 'CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK'
}

value_d_to_entity = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'vehicle',
 'Mean per capita consumption (g/day)': 'CHECK',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'vehicle',
 'Mean micronutrient intake per capita (mg/day)': 'nutrient',
 'Daily per capita consumption (g)': 'CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK'
}

value_d_to_mass_unit = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'g',
 'Mean per capita consumption (g/day)': 'g',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'g',
 'Mean micronutrient intake per capita (mg/day)': 'mg',
 'Daily per capita consumption (g)': 'g',
 'Daily consumption (mg/d)': 'mg',
 'Consumption per person per day (g)': 'g',
 'Estimated daily contribution from fortified foods (mg/d)': 'mg',
 'kg/capita/year': 'kg'
}

value_d_to_time_unit = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'day',
 'Mean per capita consumption (g/day)': 'day',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'day',
 'Mean micronutrient intake per capita (mg/day)': 'day',
 'Daily per capita consumption (g)': 'day',
 'Daily consumption (mg/d)': 'day',
 'Consumption per person per day (g)': 'day',
 'Estimated daily contribution from fortified foods (mg/d)': 'day',
 'kg/capita/year': 'year'
}

value_d_to_population = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'consumers',
 'Mean per capita consumption (g/day)': 'capita CHECK',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'consumers',
 'Mean micronutrient intake per capita (mg/day)': 'capita CHECK',
 'Daily per capita consumption (g)': 'capita CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK'
}

In [16]:
def format_value_d(df):
    df['metric'] = df.value_description.map(value_d_to_metric)
    df['entity'] = df.value_description.map(value_d_to_entity)
    df['mass_unit'] = df.value_description.map(value_d_to_mass_unit)
    df['time_unit'] = df.value_description.map(value_d_to_time_unit)
    df['pop_denom'] = df.value_description.map(value_d_to_population)
    
    return df

## Pakistan

In [17]:
pakistan = gday[gday.location_name=="Pakistan"]
pakistan = format_value_d(pakistan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [18]:
viewcols = ['location_name','subnational_name', 'metric', 'entity',
            'mass_unit', 'time_unit', 'pop_denom','vehicle',
            'value_mean','value_025_percentile','value_975_percentile',
            'sub_population','urbanicity',
            'source_citation', 'source_link', 'source_year', 'source_type']

In [19]:
for i in pakistan[(pakistan.sub_population=="women of reproductive age")].source_link:
    print(i)

https://www.gainhealth.org/sites/default/files/publications/documents/pakistan-fact-survey-2017-final-report-aug2018-corrected.pdf
https://www.gainhealth.org/sites/default/files/publications/documents/pakistan-fact-survey-2017-final-report-aug2018-corrected.pdf
https://www.gainhealth.org/sites/default/files/publications/documents/pakistan-fact-survey-2017-final-report-aug2018-corrected.pdf


In [20]:
pakistan[['location_name','metric','entity','mass_unit','time_unit']]

Unnamed: 0,location_name,metric,entity,mass_unit,time_unit
80,Pakistan,CHECK,CHECK,g,day
81,Pakistan,CHECK,CHECK,g,day
82,Pakistan,CHECK,CHECK,g,day
83,Pakistan,mean,CHECK,g,day
84,Pakistan,mean,CHECK,g,day
164,Pakistan,mean,CHECK,g,day
165,Pakistan,mean,CHECK,g,day
166,Pakistan,mean,CHECK,g,day
167,Pakistan,mean,CHECK,g,day
168,Pakistan,mean,CHECK,g,day


In [21]:
pakistan[pakistan.sub_population=="women of reproductive age"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,CI validated,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom
80,165,Pakistan,Balochistan,,unknown,wheat flour,na,Consumption per person per day (g),105.2,104.1,...,yes,0.020913,0.66602,70.167358,140.232642,CHECK,CHECK,g,day,CHECK
81,165,Pakistan,Punjab,,unknown,wheat flour,na,Consumption per person per day (g),66.7,66.4,...,yes,0.007496,0.66602,44.488239,88.911761,CHECK,CHECK,g,day,CHECK
82,165,Pakistan,Sindh,,unknown,wheat flour,na,Consumption per person per day (g),126.0,125.4,...,yes,0.009524,0.66602,84.040752,167.959248,CHECK,CHECK,g,day,CHECK


In [22]:
# for pakistan/wheat, choose Chose 2017 FACT survey data from WRA over GFDx total pop data\
output = output.append(pakistan[pakistan.sub_population=="women of reproductive age"])

In [23]:
location_names

['Pakistan',
 'Bangladesh',
 'United Republic of Tanzania',
 'Uganda',
 'South Africa']

## Bangladesh

In [24]:
bangladesh = gday[gday.location_name=="Bangladesh"]
bangladesh = format_value_d(bangladesh)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [25]:
bangladesh.sort_values(['vehicle','nutrient','source_citation','source_year'])[['sub_population','urbanicity','value_mean','source_citation','source_year']]

Unnamed: 0,sub_population,urbanicity,value_mean,source_citation,source_year
186,total population,unknown,52.0,GFDx,1995
187,total population,unknown,54.0,GFDx,1996
188,total population,unknown,66.0,GFDx,1997
189,total population,unknown,61.0,GFDx,1998
190,total population,unknown,81.0,GFDx,1999
191,total population,unknown,66.0,GFDx,2000
192,total population,unknown,52.0,GFDx,2001
193,total population,unknown,61.0,GFDx,2002
194,total population,unknown,58.0,GFDx,2003
195,total population,unknown,60.0,GFDx,2004


In [26]:
bangladesh.sort_values(['vehicle','nutrient','source_citation','source_year'])[viewcols]

Unnamed: 0,location_name,subnational_name,metric,entity,mass_unit,time_unit,pop_denom,vehicle,value_mean,value_025_percentile,value_975_percentile,sub_population,urbanicity,source_citation,source_link,source_year,source_type
186,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,52.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,1995,
187,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,54.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,1996,
188,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,66.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,1997,
189,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,61.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,1998,
190,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,81.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,1999,
191,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,66.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,2000,
192,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,52.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,2001,
193,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,61.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,2002,
194,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,58.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,2003,
195,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,60.0,,,total population,unknown,GFDx,https://fortificationdata.org/full-gfdx-datasets/,2004,


In [27]:
for i in bangladesh[bangladesh.sub_population=="women of reproductive age"].source_link:
    print(i)

https://www.mdpi.com/2072-6643/8/9/541/htm


In [28]:
bangladesh[(bangladesh.sub_population=="women of reproductive age") & (bangladesh.vehicle=="wheat flour")][viewcols]

Unnamed: 0,location_name,subnational_name,metric,entity,mass_unit,time_unit,pop_denom,vehicle,value_mean,value_025_percentile,value_975_percentile,sub_population,urbanicity,source_citation,source_link,source_year,source_type
9,Bangladesh,na,mean,CHECK,g,day,capita CHECK,wheat flour,77.9,63.8,92.0,women of reproductive age,mixed/both,"Leyvraz M, Laillou A, Rahman S, et al. An Asse...",https://www.mdpi.com/2072-6643/8/9/541/htm,2011,Survey - cross-sectional


In [29]:
#Chose data from The National Micronutrients Status Survey December 2011 (78 g/day),
# which sampled WRA, over GFDx total pop data from 2017 (49.5 g.day). The GFDx 
# timeseries estimated from 52 gday in 1995, going up to 50-60 gday through 2006, and 
# back down into around 45gday from 2007-2017. Due to lack in fluctuation, taking survey data.

output = output.append(bangladesh[(bangladesh.sub_population=="women of reproductive age") & (bangladesh.vehicle=="wheat flour")])

In [30]:
checkcols = ['vehicle','nutrient','sub_population','urbanicity','value_mean','source_citation','source_year']

## United Republic of Tanzania

In [31]:
tanz = gday[gday.location_name=="United Republic of Tanzania"]
tanz = format_value_d(tanz)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [32]:
tanz.sort_values(['vehicle'])[checkcols]

Unnamed: 0,vehicle,nutrient,sub_population,urbanicity,value_mean,source_citation,source_year
249,maize flour,na,total population,unknown,172.47,GFDx,2017
250,maize flour,na,total population,,162.2,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007
91,wheat flour,na,total population,unknown,17.0,,1997-2000
92,wheat flour,na,total population,unknown,32.96,GFDx,2019
310,wheat(not specifically flour),na,total population,,35.4,"Mason, N. M., Jayne, T. S., & Shiferaw, B. A. ...",2000-2009


In [33]:
for i in tanz.sort_values(['vehicle']).source_link:
    print(i)

https://fortificationdata.org/country-fortification-dashboard/?alpha3_code=TZA&lang=en
https://doi.org/10.3945/an.110.000182
https://www.nutritionintl.org/content/user_files/2017/06/Fort_handbook1NDB-3242008-2608.pdf
https://fortificationdata.org/country-fortification-dashboard/?alpha3_code=TZA&lang=en
https://ageconsearch.umn.edu/record/146936/files/idwp127.pdf


In [34]:
output = output.append(tanz[(tanz.vehicle=="maize flour") & (tanz.source_citation=="GFDx")])

In [35]:
tanz[(tanz.vehicle=="wheat flour") & (tanz.source_year=='2019')]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,CI validated,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom
92,189,United Republic of Tanzania,na,,unknown,wheat flour,na,Consumption per person per day (g),32.96,,...,,,0.66602,21.983994,43.936006,CHECK,CHECK,g,day,CHECK


In [36]:
output = output.append(tanz[(tanz.vehicle=="wheat flour") & (tanz.source_year=='2019')])

## Uganda

In [37]:
uganda = gday[gday.location_name=="Uganda"]
uganda = format_value_d(uganda)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [38]:
uganda.sort_values(['vehicle'])[checkcols]

Unnamed: 0,vehicle,nutrient,sub_population,urbanicity,value_mean,source_citation,source_year
246,maize flour,na,total population,unknown,61.0,,1997-2000
247,maize flour,na,total population,unknown,126.3,GFDx,2017
248,maize flour,na,total population,,67.2,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007
89,wheat flour,na,total population,unknown,7.0,,1997-2000
90,wheat flour,na,total population,unknown,30.27,GFDx,2017
309,wheat(not specifically flour),na,total population,,23.78,"Mason, N. M., Jayne, T. S., & Shiferaw, B. A. ...",2000-2009


In [39]:
for i in uganda.sort_values(['vehicle']).source_link:
    print(i)

https://www.nutritionintl.org/content/user_files/2017/06/Fort_handbook1NDB-3242008-2608.pdf
https://fortificationdata.org/country-fortification-dashboard/?alpha3_code=UGA&lang=en
https://doi.org/10.3945/an.110.000182
https://www.nutritionintl.org/content/user_files/2017/06/Fort_handbook1NDB-3242008-2608.pdf
https://fortificationdata.org/country-fortification-dashboard/?alpha3_code=UGA&lang=en
https://ageconsearch.umn.edu/record/146936/files/idwp127.pdf


In [40]:
# Chose 2017 GFDx estimate (126 g/day) over number from 2007 paper (67 g/day) 
# that estimated from FAO balance sheets, and a 1997-2000 report that estimated from FAO balance sheets

output = output.append(uganda[(uganda.vehicle=="maize flour") & (uganda.source_citation=="GFDx")])

In [41]:
# Chose 2017 GFDx estimate (30 g/day) over number from 2000-2009 paper (24 g/day) 
# that estimated wheat (not specifically wheat flour), and a 1997-2000 report that estimated (7 g/day) from FAO balance sheets

output = output.append(uganda[(uganda.vehicle=="wheat flour") & (uganda.source_citation=="GFDx")])

## South Africa

In [42]:
safrica = gday[gday.location_name=="South Africa"]
safrica = format_value_d(safrica)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [43]:
safrica.sort_values(['vehicle'])[checkcols]

Unnamed: 0,vehicle,nutrient,sub_population,urbanicity,value_mean,source_citation,source_year
241,maize flour,na,total population,unknown,204.0,,1997-2000
242,maize flour,na,total population,unknown,283.29,GFDx,2017
243,maize flour,na,total population,,288.3,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007
85,wheat flour,na,total population,unknown,123.0,,1997-2000
86,wheat flour,na,total population,unknown,156.36,GFDx,2017
307,wheat(not specifically flour),na,total population,,163.42,"Mason, N. M., Jayne, T. S., & Shiferaw, B. A. ...",2000-2009


In [44]:
for i in safrica.sort_values(['vehicle']).source_link:
    print(i)

https://www.nutritionintl.org/content/user_files/2017/06/Fort_handbook1NDB-3242008-2608.pdf
https://fortificationdata.org/country-fortification-dashboard/?alpha3_code=ZAF&lang=en
https://doi.org/10.3945/an.110.000182
https://www.nutritionintl.org/content/user_files/2017/06/Fort_handbook1NDB-3242008-2608.pdf
https://fortificationdata.org/country-fortification-dashboard/?alpha3_code=ZAF&lang=en
https://ageconsearch.umn.edu/record/146936/files/idwp127.pdf


In [45]:
# Chose 2017 GFDx estimte (283 g/day) over a 2007 paper that estimated (288 g/day)
# from FAO balance sheets and a 1997-2000 paper that estimated (204 g/day) from FAO balance sheets

output = output.append(safrica[(safrica.vehicle=="maize flour") & (safrica.source_citation=="GFDx")])

In [46]:
# Chose 2017 GFDx estimate (156 g/day) over number from 2000-2009 paper (163 g/day) 
# that estimated wheat (not specifically wheat flour), and a 1997-2000 report that estimated (123 g/day) from FAO balance sheets

output = output.append(safrica[(safrica.vehicle=="wheat flour") & (safrica.source_citation=="GFDx")])

# Format output

In [47]:
loc_metadata = get_locs(location_set_id=35, gbd_round_id=6, decomp_step="step4")

In [48]:
output = output.drop(columns = 'location_id')
output = loc_metadata[['location_id','location_name']].merge(output, on = 'location_name', how = 'right')

In [57]:
metric_map = {
    'mean':'mean',
    'CHECK':'mean'
}

pop_denom_map = {
    'capita':'capita',
    'CHECK':'capita',
    'capita CHECK':'capita'
}

output.metric = output.metric.map(metric_map)
output.pop_denom = output.pop_denom.map(pop_denom_map)

In [51]:
loc_metadata.loc[(loc_metadata.parent_id==165),['location_id','location_name']]

Unnamed: 0,location_id,location_name
733,53615,Azad Jammu & Kashmir
734,53616,Balochistan
735,53617,Gilgit-Baltistan
736,53618,Islamabad Capital Territory
737,53619,Khyber Pakhtunkhwa
738,53620,Punjab
739,53621,Sindh


In [52]:
pakistan_subnat_ids = {
    'Balochistan':53616,
    'Punjab':53620,
    'Sindh':53621
}

pop = get_population(age_group_id=22, 
                     location_id=list(pakistan_subnat_ids.values()),
                     year_id=2017,
                     sex_id=3,
                     gbd_round_id=6, 
                     decomp_step='step5')

In [None]:
# population weight subnationals to get national
output['subnational_id'] = output.subnational_name.map(pakistan_subnat_ids)

output = output.merge(pop[['location_id','population']].rename(columns={'location_id':'subnational_id'}), on = 'subnational_id', how = 'left')

group_cols = ['location_id','location_name','sub_population','vehicle','metric','mass_unit','time_unit','pop_denom']

output['subnat_pop_denom'] = output.groupby(group_cols).transform(sum).population

output['subnat_pop_weight'] = [(i/j) if j!=0.0 else 1 for (i,j) in zip(output.population,output.subnat_pop_denom)]

In [65]:
output = output[group_cols + ['value_mean','lower','upper']].groupby(group_cols).sum()

In [66]:
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,value_mean,lower,upper
location_id,location_name,sub_population,vehicle,metric,mass_unit,time_unit,pop_denom,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
161,Bangladesh,women of reproductive age,wheat flour,mean,g,day,capita,77.9,51.958528,103.841472
165,Pakistan,women of reproductive age,wheat flour,mean,g,day,capita,297.9,198.696349,397.103651
189,United Republic of Tanzania,total population,maize flour,mean,g,day,capita,172.47,102.944854,241.995146
189,United Republic of Tanzania,total population,wheat flour,mean,g,day,capita,32.96,21.983994,43.936006
190,Uganda,total population,maize flour,mean,g,day,capita,126.3,75.386647,177.213353
190,Uganda,total population,wheat flour,mean,g,day,capita,30.27,20.18979,40.35021
196,South Africa,total population,maize flour,mean,g,day,capita,283.29,169.091713,397.488287
196,South Africa,total population,wheat flour,mean,g,day,capita,156.36,104.290571,208.429429


In [71]:
assert([output.value_mean <= output.upper]), "mean not lower than upper"
assert([output.value_mean >= output.lower]), "mean not higher than lower"

## check for missing values


In [73]:
## load legal combos
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_vehicle_country_pairs.pickle', 'rb') as handle:
    vehicle_country_map = pickle.load(handle)

In [74]:
target_vehicles = [i for i in vehicle_nutrient_map.keys() if 'iron' in vehicle_nutrient_map[i]]
set([i for j in location_names for i in country_vehicle_map[j]])

{'maize flour', 'oil', 'wheat flour'}

In [85]:
# make sure nothing missing. note we're not interested in oil/iron
output.reset_index().merge(target.reset_index(), on = ['location_name','vehicle'], how = 'outer').set_index(group_cols)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,value_mean,lower,upper
location_id,location_name,sub_population,vehicle,metric,mass_unit,time_unit,pop_denom,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
161.0,Bangladesh,women of reproductive age,wheat flour,mean,g,day,capita,77.9,51.958528,103.841472
165.0,Pakistan,women of reproductive age,wheat flour,mean,g,day,capita,297.9,198.696349,397.103651
189.0,United Republic of Tanzania,total population,maize flour,mean,g,day,capita,172.47,102.944854,241.995146
189.0,United Republic of Tanzania,total population,wheat flour,mean,g,day,capita,32.96,21.983994,43.936006
190.0,Uganda,total population,maize flour,mean,g,day,capita,126.3,75.386647,177.213353
190.0,Uganda,total population,wheat flour,mean,g,day,capita,30.27,20.18979,40.35021
196.0,South Africa,total population,maize flour,mean,g,day,capita,283.29,169.091713,397.488287
196.0,South Africa,total population,wheat flour,mean,g,day,capita,156.36,104.290571,208.429429
,Bangladesh,,oil,,,,,,,
,Pakistan,,oil,,,,,,,


In [None]:
save_path = '/ihme/homes/beatrixh/vivarium_data_analysis/pre_processing/lsff_project/data_prep/outputs/gday_nigeria_ethiopia_india_02_24_2021.csv'
output.to_csv(save_path)