In [1]:
from db_queries import get_population, get_ids
from db_queries import get_location_metadata as get_locs

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

# Prep g/day for Tier 5 locs

## 	Angola, China, Ghana, Niger, Egypt, Sudan, Madagascar

In [3]:
## load targets
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)

In [4]:
ls /ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/ | grep gday_extraction_

gday_extraction_sheet_02_19_2021.csv
gday_extraction_sheet_02_22_2021.csv
gday_extraction_sheet_03_16_2021.csv
gday_extraction_sheet_03_22_2021.csv
gday_extraction_sheet_03_24_2021.csv
gday_extraction_sheet_03_29_2021.csv


In [5]:
gday_path =  '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/gday_extraction_sheet_03_29_2021.csv'
gday = pd.read_csv(gday_path)

assert(len(gday[gday.location_name.isna()])==0), "Some rows missing location name"

In [6]:
gday.location_name = gday.location_name.str.strip(' ')
gday.subnational_name = gday.subnational_name.str.strip(' ')
gday.vehicle = gday.vehicle.str.strip(' ')
gday.nutrient = gday.nutrient.str.strip(' ')
gday.urbanicity = gday.urbanicity.str.strip(' ')
gday.nutrient = gday.nutrient.str.strip(' ')

In [7]:
output = pd.DataFrame()

In [8]:
country_vehicle_map.keys()

dict_keys(['India', 'Nigeria', 'Ethiopia', 'Democratic Republic of the Congo', 'Indonesia', 'Bangladesh', 'Pakistan', 'Kenya', 'United Republic of Tanzania', 'South Africa', 'Sudan', 'Uganda', 'Myanmar', 'Ghana', 'Egypt', 'Vietnam', 'Nepal', 'Mozambique', 'Cameroon', 'Angola', "Côte d'Ivoire", 'Madagascar', 'Burkina Faso', 'Niger', 'China', nan])

In [9]:
# location_names = ['Pakistan','Bangladesh','United Republic of Tanzania','Uganda','South Africa']
# location_names = ['Kenya', 'Burkina Faso', 'Myanmar', 'Vietnam', 'Nepal']

location_names = ['Angola', 'China', 'Ghana', 'Niger', 'Egypt', 'Sudan', 'Madagascar']
vehicles = ['wheat flour','maize flour','oil']

In [10]:
# these are the vehicles per country we need
target = pd.DataFrame([(loc,v) for loc in location_names for v in country_vehicle_map[loc]],
            columns=['location_name','vehicle']).sort_values(['location_name','vehicle']).set_index(['location_name','vehicle'])

target

location_name,vehicle
Angola,maize flour
Angola,oil
Angola,wheat flour
China,oil
China,wheat flour
Egypt,maize flour
Egypt,oil
Egypt,wheat flour
Ghana,bouillon
Ghana,maize flour


In [11]:
gday.location_id = gday.location_id.fillna(-1).astype(int)

In [12]:
gday.location_name.unique()

array(['Afghanistan', 'Angola', 'Bangladesh', 'Burkina Faso',
       "Côte d'Ivoire", 'Cameroon', 'Chad', 'China',
       'Democratic Republic of the Congo', 'Egypt', 'Ethiopia', 'Ghana',
       'India', 'Indonesia', 'Kenya', 'Madagascar', 'Mozambique',
       'Myanmar', 'Nepal', 'Niger', 'Nigeria', 'Pakistan', 'Philippines',
       'South Africa', 'Sudan', 'Uganda', 'United Republic of Tanzania',
       'Viet Nam', 'Yemen', 'Zambia'], dtype=object)

In [13]:
# estimate CIs, crude

# clean value_mean
gday.loc[gday.value_mean=='na','value_mean'] = np.nan
gday.value_mean = gday.value_mean.astype(float)

# clean 2.5th %ile
gday.loc[gday.value_025_percentile=='na','value_025_percentile'] = np.nan
gday.value_025_percentile = gday.value_025_percentile.astype(float)

# clean 97.5th %ile
gday.loc[gday.value_975_percentile=='na','value_975_percentile'] = np.nan
gday.value_975_percentile = gday.value_975_percentile.astype(float)

# calc scale_over_mean
gday['scale_over_mean'] = (gday.value_975_percentile - gday.value_025_percentile) / gday.value_mean

#foreach vehicle, average scale_over_mean
r = gday[['vehicle','scale_over_mean']].groupby('vehicle').mean().dropna().rename(columns={'scale_over_mean':'r'})

In [14]:
# these are the vehicles for which we have a scale_over_mean estimate
r

Unnamed: 0_level_0,r
vehicle,Unnamed: 1_level_1
bouillon,0.902591
wheat flour,0.699893


In [15]:
# for vehicles without a scale_over_mean value, we'll assign the average of the others (row-wise)
r = r.reset_index().append(pd.DataFrame([(i,gday.scale_over_mean.mean()) for i in ['maize flour', 'wheat(not specifically flour)','salt', 'rice']],
            columns = ['vehicle','r']))
r

Unnamed: 0,vehicle,r
0,bouillon,0.902591
1,wheat flour,0.699893
0,maize flour,0.819483
1,wheat(not specifically flour),0.819483
2,salt,0.819483
3,rice,0.819483


In [16]:
# add uncertainty
gday = gday.merge(r, on = 'vehicle', how = 'outer')
gday['lower'] = gday.value_mean - (gday.r * gday.value_mean)/2
gday['upper'] = gday.value_mean + (gday.r * gday.value_mean)/2

In [17]:
gday[(gday.location_name.isin(location_names)) & (gday.vehicle.isin(vehicles))].groupby(['location_name','vehicle']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,location_id,subnational_location_id,value_mean,value_025_percentile,value_975_percentile,scale_over_mean,r,lower,upper
location_name,vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Angola,maize flour,168,,107.4864,,,,0.819483,63.444738,151.528062
Angola,wheat flour,168,,81.020833,,,,0.699893,52.667893,109.373774
China,maize flour,-1,,38.0,,,,0.819483,22.429815,53.570185
China,wheat flour,-1,,181.056429,,,,0.699893,117.696402,244.416455
Egypt,maize flour,-1,,163.435833,,,,0.819483,96.469355,230.402312
Egypt,wheat flour,-1,,389.919583,,,,0.699893,253.46867,526.370496
Ghana,maize flour,207,,65.1448,,,,0.819483,38.452258,91.837342
Ghana,wheat flour,207,,39.4832,,,,0.699893,25.6662,53.3002
Madagascar,maize flour,181,,40.117826,,,,0.819483,23.679879,56.555773
Madagascar,wheat flour,181,,20.330417,,,,0.699893,13.215863,27.444971


In [18]:
gday.value_description.unique()

array([nan, 'Mean daily consumption (mg)',
       'Mean per capita consumption (g/day)',
       'Median amount of vehicle consumed on previous day among consumers (g/day)',
       'Mean amount of vehicle consumed on previous day among consumers (g/day)',
       'Estimated daily contribution from fortified foods (mg/d)',
       'kg/capita/year',
       'Mean per capita consumption among consumers (g/day)',
       'Consumption per person per day (g)',
       'Median daily contribution from fortified foods among consumers (mg/day)'],
      dtype=object)

In [19]:
## dicts for var cleaning

value_d_to_metric = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'median',
 'Mean per capita consumption (g/day)': 'mean',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'mean',
 'Mean micronutrient intake per capita (mg/day)': 'mean',
 'Daily per capita consumption (g)': 'CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK',
 'Mean per capita consumption among consumers (g/day)':'mean'
}

value_d_to_entity = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'vehicle',
 'Mean per capita consumption (g/day)': 'CHECK',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'vehicle',
 'Mean micronutrient intake per capita (mg/day)': 'nutrient',
 'Daily per capita consumption (g)': 'CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK',
 'Mean per capita consumption among consumers (g/day)':'CHECK'
}

value_d_to_mass_unit = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'g',
 'Mean per capita consumption (g/day)': 'g',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'g',
 'Mean micronutrient intake per capita (mg/day)': 'mg',
 'Daily per capita consumption (g)': 'g',
 'Daily consumption (mg/d)': 'mg',
 'Consumption per person per day (g)': 'g',
 'Estimated daily contribution from fortified foods (mg/d)': 'mg',
 'kg/capita/year': 'kg',
 'Mean per capita consumption among consumers (g/day)':'g'
}

value_d_to_time_unit = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'day',
 'Mean per capita consumption (g/day)': 'day',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'day',
 'Mean micronutrient intake per capita (mg/day)': 'day',
 'Daily per capita consumption (g)': 'day',
 'Daily consumption (mg/d)': 'day',
 'Consumption per person per day (g)': 'day',
 'Estimated daily contribution from fortified foods (mg/d)': 'day',
 'kg/capita/year': 'year',
 'Mean per capita consumption among consumers (g/day)':'day'
}

value_d_to_population = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'consumers',
 'Mean per capita consumption (g/day)': 'capita CHECK',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'consumers',
 'Mean micronutrient intake per capita (mg/day)': 'capita CHECK',
 'Daily per capita consumption (g)': 'capita CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'capita CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'capita CHECK',
 'Mean per capita consumption among consumers (g/day)':'consumers'
}

In [20]:
def format_value_d(df):
    df['metric'] = df.value_description.map(value_d_to_metric)
    df['entity'] = df.value_description.map(value_d_to_entity)
    df['mass_unit'] = df.value_description.map(value_d_to_mass_unit)
    df['time_unit'] = df.value_description.map(value_d_to_time_unit)
    df['pop_denom'] = df.value_description.map(value_d_to_population)
    
    return df

In [21]:
location_names

['Angola', 'China', 'Ghana', 'Niger', 'Egypt', 'Sudan', 'Madagascar']

In [22]:
viewcols = ['location_name','subnational_name', 'metric', 'entity',
            'mass_unit', 'time_unit', 'pop_denom','vehicle',
            'value_mean','value_025_percentile','value_975_percentile',
            'sub_population','urbanicity',
            'source_citation', 'source_link', 'source_year', 'source_type']

In [23]:
qcols = ['value_mean','source_citation','source_year','sub_population','source_type']

## Angola

In [24]:
angola = format_value_d(gday[gday.location_name=="Angola"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [25]:
angola.vehicle.unique()

array(['wheat flour', 'maize flour', 'wheat(not specifically flour)'],
      dtype=object)

In [26]:
angola_wheat = angola[(angola.vehicle=="wheat flour")]
angola_maize = angola[(angola.vehicle=="maize flour")]

In [27]:
angola_wheat[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
7,49.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
8,89.32,GFDx,2017,total population,estimated from FAO balance sheets
472,58.0,GFDx,1995,total population,estimated from FAO balance sheets
473,61.0,GFDx,1996,total population,estimated from FAO balance sheets
474,63.0,GFDx,1997,total population,estimated from FAO balance sheets
475,62.0,GFDx,1998,total population,estimated from FAO balance sheets
476,63.0,GFDx,1999,total population,estimated from FAO balance sheets
477,71.0,GFDx,2000,total population,estimated from FAO balance sheets
478,66.0,GFDx,2001,total population,estimated from FAO balance sheets
479,67.0,GFDx,2002,total population,estimated from FAO balance sheets


In [28]:
angola_wheat.loc[(angola_wheat.source_year=="2017"),'data_choice_notes'] = "Discarded 1997-2000 fortification handbook (49 g/day) in lieu of 2017 gfdx number (89 g/day). Noting gfdx had 63-71 g/day 1997-2000. All estimates from FAO"

output = output.append(
    angola_wheat.loc[(angola_wheat.source_year=="2017")]
)

angola_wheat.loc[(angola_wheat.source_year=="2017")]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
8,168,Angola,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),89.32,,...,,0.699893,58.062797,120.577203,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 fortification handbook (49...


In [29]:
angola_maize[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
609,72.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
610,154.99,GFDx,2017,total population,estimated from FAO balance sheets
611,103.1,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007,total population,estimated from FAO balance sheets
873,75.0,GFDx,1995,total population,estimated from FAO balance sheets
874,90.0,GFDx,1996,total population,estimated from FAO balance sheets
875,96.0,GFDx,1997,total population,estimated from FAO balance sheets
876,95.0,GFDx,1998,total population,estimated from FAO balance sheets
877,109.0,GFDx,1999,total population,estimated from FAO balance sheets
878,100.0,GFDx,2000,total population,estimated from FAO balance sheets
879,104.0,GFDx,2001,total population,estimated from FAO balance sheets


In [30]:
angola_maize.loc[angola_maize.source_year=="2017","data_choice_notes"] = "Discarded 1997-2000 fortification handbook number (72 g/day) and 2007 Nuss number (103 g/day) in lieu of 2017 GFDx number (154 g/day). All numbers from FAO balance sheets."

output = output.append(
    angola_maize.loc[angola_maize.source_year=="2017"]
)

angola_maize.loc[angola_maize.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
610,168,Angola,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),154.99,,...,,0.819483,91.484132,218.495868,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 fortification handbook num...


## China

In [31]:
china = format_value_d(gday[gday.location_name=="China"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [32]:
china.vehicle.unique()

array(['wheat flour', 'maize flour'], dtype=object)

In [33]:
china_wheat = china[(china.vehicle=="wheat flour")]

In [34]:
china_wheat[['value_mean','source_citation','source_year','sub_population','source_type','pop_denom']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type,pop_denom
42,162.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets,capita CHECK
43,140.3,"GHDx, cite don't share DUA",2002,total population,Survey - cross-sectional,consumers
44,131.9,"GHDx, cite don't share DUA",2002,total population,Survey - cross-sectional,consumers
45,143.5,"GHDx, cite don't share DUA",2002,total population,Survey - cross-sectional,consumers
46,171.92,GFDx,2017,total population,estimated from FAO balance sheets,capita CHECK
47,185.0,Fortifying Flour Where People Eat Rice. Hunger...,2007,total population,estimated from FAO balance sheets,capita CHECK
494,216.0,GFDx,1995,total population,estimated from FAO balance sheets,capita CHECK
495,215.0,GFDx,1996,total population,estimated from FAO balance sheets,capita CHECK
496,213.0,GFDx,1997,total population,estimated from FAO balance sheets,capita CHECK
497,212.0,GFDx,1998,total population,estimated from FAO balance sheets,capita CHECK


In [35]:
china_wheat.loc[(china_wheat.source_year=="2017"),"data_choice_notes"] = "Discarded earlier FAO estimates for 2017 GFDx FAO estimate"

output = output.append(
    china_wheat.loc[(china_wheat.source_year=="2017")]
)

china_wheat.loc[(china_wheat.source_year=="2017")]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
46,-1,China,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),171.92,,...,,0.699893,111.757233,232.082767,mean,CHECK,g,day,capita CHECK,Discarded earlier FAO estimates for 2017 GFDx ...


## Ghana

In [36]:
ghana = format_value_d(gday[gday.location_name=="Ghana"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [37]:
ghana.vehicle.unique()

array(['wheat flour', 'maize flour', 'wheat(not specifically flour)',
       'bouillon', 'rice'], dtype=object)

In [38]:
ghana_wheat = ghana[(ghana.vehicle=="wheat flour")]
ghana_maize = ghana[(ghana.vehicle=="maize flour")]

In [39]:
ghana_wheat[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
54,23.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
55,86.36,GFDx,2017,total population,estimated from FAO balance sheets
56,0.8,"Nyumuah RO, Hoang TC, Amoaful EF, Agble R, Mey...",2010,total population,Survey - cross-sectional
516,21.0,GFDx,1995,total population,estimated from FAO balance sheets
517,16.0,GFDx,1996,total population,estimated from FAO balance sheets
518,31.0,GFDx,1997,total population,estimated from FAO balance sheets
519,32.0,GFDx,1998,total population,estimated from FAO balance sheets
520,37.0,GFDx,1999,total population,estimated from FAO balance sheets
521,40.0,GFDx,2000,total population,estimated from FAO balance sheets
522,35.0,GFDx,2001,total population,estimated from FAO balance sheets


In [40]:
ghana_wheat.loc[ghana_wheat.source_year=="2017","data_choice_notes"] = "Discarded 2017 1997-2000 fortification handbook number estimated from fao (23 g/day) and 2010 Nyumuah survey number (0.8 g/day; this might be a misextraction for nutrient) in lieu of 2017 GFDx number (86 g/day)"

output = output.append(
    ghana_wheat.loc[ghana_wheat.source_year=="2017"]
)

ghana_wheat.loc[ghana_wheat.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
55,207,Ghana,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),86.36,,...,,0.699893,56.138638,116.581362,mean,CHECK,g,day,capita CHECK,Discarded 2017 1997-2000 fortification handboo...


In [41]:
ghana_maize[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
627,81.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
628,69.23,GFDx,2017,total population,estimated from FAO balance sheets
629,99.4,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007,total population,estimated from FAO balance sheets
895,61.0,GFDx,1995,total population,estimated from FAO balance sheets
896,58.0,GFDx,1996,total population,estimated from FAO balance sheets
897,57.0,GFDx,1997,total population,estimated from FAO balance sheets
898,54.0,GFDx,1998,total population,estimated from FAO balance sheets
899,55.0,GFDx,1999,total population,estimated from FAO balance sheets
900,55.0,GFDx,2000,total population,estimated from FAO balance sheets
901,49.0,GFDx,2001,total population,estimated from FAO balance sheets


In [42]:
ghana_maize.loc[(ghana_maize.source_year=="2017"),"data_choice_notes"] = "Discarded 1997-2000 number (81 g/day) and 2007 Nuss number (99 g/day) in lieu of 2017 GFDx number (69 g/day). All FAO balance sheet estimates."

output = output.append(
    ghana_maize.loc[(ghana_maize.source_year=="2017")]
)

ghana_maize.loc[(ghana_maize.source_year=="2017")]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
628,207,Ghana,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),69.23,,...,,0.819483,40.863581,97.596419,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 number (81 g/day) and 2007...


## Niger

In [43]:
niger = format_value_d(gday[gday.location_name=="Niger"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [44]:
niger.vehicle.unique()

array(['wheat flour', 'maize flour', 'wheat(not specifically flour)',
       'bouillon'], dtype=object)

In [45]:
niger_wheat = niger[(niger.vehicle=="wheat flour")]
niger_maize = niger[(niger.vehicle=="maize flour")]

In [46]:
niger_wheat[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
76,10.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
77,11.45,GFDx,2017,total population,estimated from FAO balance sheets
538,7.0,GFDx,1995,total population,estimated from FAO balance sheets
539,15.0,GFDx,1996,total population,estimated from FAO balance sheets
540,9.0,GFDx,1997,total population,estimated from FAO balance sheets
541,14.0,GFDx,1998,total population,estimated from FAO balance sheets
542,14.0,GFDx,1999,total population,estimated from FAO balance sheets
543,15.0,GFDx,2000,total population,estimated from FAO balance sheets
544,17.0,GFDx,2001,total population,estimated from FAO balance sheets
545,19.0,GFDx,2002,total population,estimated from FAO balance sheets


In [47]:
niger_wheat.loc[niger_wheat.source_year=="2017","data_choice_notes"] = "Discarded 2017 1997-2000 fortification handbook number estimated from fao (10 g/day)in lieu of 2017 GFDx number (11 g/day)"

output = output.append(
    niger_wheat.loc[niger_wheat.source_year=="2017"]
)

niger_wheat.loc[niger_wheat.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
77,213,Niger,na,,unknown,wheat flour,na,Mean per capita consumption (g/day),11.45,,...,,0.699893,7.443115,15.456885,mean,CHECK,g,day,capita CHECK,Discarded 2017 1997-2000 fortification handboo...


In [48]:
niger_maize[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
637,10.63,GFDx,2017,total population,estimated from FAO balance sheets
638,8.3,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007,total population,estimated from FAO balance sheets
917,6.0,GFDx,1995,total population,estimated from FAO balance sheets
918,5.0,GFDx,1996,total population,estimated from FAO balance sheets
919,13.0,GFDx,1997,total population,estimated from FAO balance sheets
920,25.0,GFDx,1998,total population,estimated from FAO balance sheets
921,8.0,GFDx,1999,total population,estimated from FAO balance sheets
922,18.0,GFDx,2000,total population,estimated from FAO balance sheets
923,9.0,GFDx,2001,total population,estimated from FAO balance sheets
924,15.0,GFDx,2002,total population,estimated from FAO balance sheets


In [49]:
niger_maize.loc[niger_maize.source_year=="2017","data_choice_notes"] = "Discarded 2007 Nuss number (8 g/day) in lieu of 2017 GFDx number (11 g/day). Both are FAO estimates"

output = output.append(
    niger_maize.loc[niger_maize.source_year=="2017"]
)

niger_maize.loc[niger_maize.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
637,213,Niger,na,,unknown,maize flour,na,Mean per capita consumption (g/day),10.63,,...,,0.819483,6.274446,14.985554,mean,CHECK,g,day,capita CHECK,Discarded 2007 Nuss number (8 g/day) in lieu o...


## Egypt

In [50]:
egypt = format_value_d(gday[gday.location_name=="Egypt"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [51]:
egypt.vehicle.unique()

array(['wheat flour', 'maize flour'], dtype=object)

In [52]:
egypt_wheat = egypt[(egypt.vehicle=="wheat flour")]
egypt_maize = egypt[(egypt.vehicle=="maize flour")]

In [53]:
egypt_wheat[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
50,281.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
51,399.23,GFDx,2017,total population,estimated from FAO balance sheets
560,412.0,GFDx,1995,total population,estimated from FAO balance sheets
561,414.0,GFDx,1996,total population,estimated from FAO balance sheets
562,414.0,GFDx,1997,total population,estimated from FAO balance sheets
563,401.0,GFDx,1998,total population,estimated from FAO balance sheets
564,377.0,GFDx,1999,total population,estimated from FAO balance sheets
565,373.0,GFDx,2000,total population,estimated from FAO balance sheets
566,395.0,GFDx,2001,total population,estimated from FAO balance sheets
567,377.0,GFDx,2002,total population,estimated from FAO balance sheets


In [54]:
egypt_wheat.loc[egypt_wheat.source_year=="2017","data_choice_notes"] = "Discarded 1997-2000 fortification handbook number (281 g/day) in lieu of 2017 GFDx estimate (399 g/day). Both are FAO estimates."

output = output.append(
    egypt_wheat.loc[egypt_wheat.source_year=="2017"]
)

egypt_wheat.loc[egypt_wheat.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
51,-1,Egypt,na,,unknown,wheat flour,na,Mean per capita consumption (g/day),399.23,,...,,0.699893,259.520941,538.939059,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 fortification handbook num...


In [55]:
egypt_maize[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
623,114.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
624,167.21,GFDx,2017,total population,estimated from FAO balance sheets
939,154.0,GFDx,1995,total population,estimated from FAO balance sheets
940,151.0,GFDx,1996,total population,estimated from FAO balance sheets
941,159.0,GFDx,1997,total population,estimated from FAO balance sheets
942,165.0,GFDx,1998,total population,estimated from FAO balance sheets
943,171.0,GFDx,1999,total population,estimated from FAO balance sheets
944,167.0,GFDx,2000,total population,estimated from FAO balance sheets
945,162.0,GFDx,2001,total population,estimated from FAO balance sheets
946,167.0,GFDx,2002,total population,estimated from FAO balance sheets


In [56]:
egypt_maize.loc[egypt_maize.source_year=="2017","data_choice_notes"] = "Discarded 1997-2000 fortification handbook number (114 g/day) in lieu of 2017 GFDx estimate (167 g/day). Both are FAO estimates."

output = output.append(
    egypt_maize.loc[egypt_maize.source_year=="2017"]
)

egypt_maize.loc[egypt_maize.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
624,-1,Egypt,na,,unknown,maize flour,na,Mean per capita consumption (g/day),167.21,,...,,0.819483,98.697088,235.722912,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 fortification handbook num...


## Sudan

In [57]:
sudan = format_value_d(gday[gday.location_name=="Sudan"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [58]:
sudan.vehicle.unique()

array(['wheat flour', 'maize flour', 'wheat(not specifically flour)'],
      dtype=object)

In [59]:
sudan_wheat = sudan[(sudan.vehicle=="wheat flour")]
sudan_maize = sudan[(sudan.vehicle=="maize flour")]

In [60]:
sudan_wheat[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
87,69.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
88,123.45,GFDx,2017,total population,estimated from FAO balance sheets
582,68.0,GFDx,2012,total population,estimated from FAO balance sheets
583,62.0,GFDx,2013,total population,estimated from FAO balance sheets
584,106.68,GFDx,2014,total population,estimated from FAO balance sheets
585,111.01,GFDx,2015,total population,estimated from FAO balance sheets
586,119.37,GFDx,2016,total population,estimated from FAO balance sheets


In [61]:
sudan_wheat.loc[sudan_wheat.source_year=="2017","data_choice_notes"] = "Discarded 1997-2000 fortification handbook number (69 g/day) in lieu of 2017 GFDx estimate (123 g/day). Both are FAO estimates."

output = output.append(
    sudan_wheat.loc[sudan_wheat.source_year=="2017"]
)

sudan_wheat.loc[sudan_wheat.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
88,522,Sudan,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),123.45,,...,,0.699893,80.24913,166.65087,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 fortification handbook num...


In [62]:
sudan_maize[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
645,5.4,GFDx,2017,total population,estimated from FAO balance sheets
646,6.7,"Nuss, E. T., & Tanumihardjo, S. A. (2011). Qua...",2007,total population,estimated from FAO balance sheets
961,1.0,GFDx,2012,total population,estimated from FAO balance sheets
962,1.0,GFDx,2013,total population,estimated from FAO balance sheets
963,6.58,GFDx,2014,total population,estimated from FAO balance sheets
964,5.56,GFDx,2015,total population,estimated from FAO balance sheets
965,5.73,GFDx,2016,total population,estimated from FAO balance sheets


In [63]:
sudan_maize.loc[sudan_maize.source_year=="2017","data_choice_notes"] = "Discarded 2007 Nuss number (6.7 g/day) in lieu of 2017 GFDx estimate (5.4 g/day). Both are FAO estimates."

output = output.append(
    sudan_maize.loc[sudan_maize.source_year=="2017"]
)

sudan_maize.loc[sudan_maize.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
645,522,Sudan,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),5.4,,...,,0.819483,3.187395,7.612605,mean,CHECK,g,day,capita CHECK,Discarded 2007 Nuss number (6.7 g/day) in lieu...


## Madagascar

In [64]:
madagascar = format_value_d(gday[gday.location_name=="Madagascar"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [65]:
madagascar.vehicle.unique()

array(['wheat flour', 'maize flour', 'wheat(not specifically flour)'],
      dtype=object)

In [66]:
madagascar_wheat = madagascar[(madagascar.vehicle=="wheat flour")]
madagascar_maize = madagascar[(madagascar.vehicle=="maize flour")]

In [67]:
madagascar_wheat[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
63,12.0,Fortification Handbook: Vitamin and Mineral Fo...,1997-2000,total population,estimated from FAO balance sheets
64,35.37,GFDx,2017,total population,estimated from FAO balance sheets
587,19.0,GFDx,1995,total population,estimated from FAO balance sheets
588,14.0,GFDx,1996,total population,estimated from FAO balance sheets
589,13.0,GFDx,1997,total population,estimated from FAO balance sheets
590,16.0,GFDx,1998,total population,estimated from FAO balance sheets
591,17.0,GFDx,1999,total population,estimated from FAO balance sheets
592,18.0,GFDx,2000,total population,estimated from FAO balance sheets
593,19.0,GFDx,2001,total population,estimated from FAO balance sheets
594,15.0,GFDx,2002,total population,estimated from FAO balance sheets


In [68]:
madagascar_wheat.loc[madagascar_wheat.source_year=="2017","data_choice_notes"] = "Discarded 1997-2000 fortification handbook number (12 g/day) in lieu of 2017 GFDx estimate (35 g/day). Both are FAO estimates."

output = output.append(
    madagascar_wheat.loc[madagascar_wheat.source_year=="2017"]
)

madagascar_wheat.loc[madagascar_wheat.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
64,181,Madagascar,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),35.37,,...,,0.699893,22.9924,47.7476,mean,CHECK,g,day,capita CHECK,Discarded 1997-2000 fortification handbook num...


In [69]:
madagascar_maize[['value_mean','source_citation','source_year','sub_population','source_type']]

Unnamed: 0,value_mean,source_citation,source_year,sub_population,source_type
632,33.86,GFDx,2017,total population,estimated from FAO balance sheets
966,30.0,GFDx,1995,total population,estimated from FAO balance sheets
967,31.0,GFDx,1996,total population,estimated from FAO balance sheets
968,28.0,GFDx,1997,total population,estimated from FAO balance sheets
969,24.0,GFDx,1998,total population,estimated from FAO balance sheets
970,27.0,GFDx,1999,total population,estimated from FAO balance sheets
971,26.0,GFDx,2000,total population,estimated from FAO balance sheets
972,27.0,GFDx,2001,total population,estimated from FAO balance sheets
973,25.0,GFDx,2002,total population,estimated from FAO balance sheets
974,46.0,GFDx,2003,total population,estimated from FAO balance sheets


In [70]:
madagascar_maize.loc[madagascar_maize.source_year=="2017","data_choice_notes"] = "Only one source."

output = output.append(
    madagascar_maize.loc[madagascar_maize.source_year=="2017"]
)

madagascar_maize.loc[madagascar_maize.source_year=="2017"]

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,value_025_percentile,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
632,181,Madagascar,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),33.86,,...,,0.819483,19.986146,47.733854,mean,CHECK,g,day,capita CHECK,Only one source.


# Format output

In [71]:
group_cols = ['location_id','location_name','sub_population','vehicle','metric','mass_unit','time_unit','pop_denom','source_citation','source_link','data_choice_notes']

In [72]:
loc_metadata = get_locs(location_set_id=35, gbd_round_id=6, decomp_step="step4")

In [73]:
loc_metadata = loc_metadata.loc[(loc_metadata.location_name.isin(location_names)) & (loc_metadata.level==3),['location_id','location_name']]

In [74]:
output = output.drop(columns = 'location_id')
output = loc_metadata.merge(output, on = 'location_name', how = 'right')

In [75]:
output[['location_id','location_name','vehicle','value_mean','value_description','definition validated', 'CI validated']]

Unnamed: 0,location_id,location_name,vehicle,value_mean,value_description,definition validated,CI validated
0,141,Egypt,wheat flour,399.23,Mean per capita consumption (g/day),Yes,
1,141,Egypt,maize flour,167.21,Mean per capita consumption (g/day),Yes,
2,522,Sudan,wheat flour,123.45,Mean per capita consumption (g/day),Yes,
3,522,Sudan,maize flour,5.4,Mean per capita consumption (g/day),Yes,
4,6,China,wheat flour,171.92,Mean per capita consumption (g/day),Yes,
5,168,Angola,wheat flour,89.32,Mean per capita consumption (g/day),Yes,
6,168,Angola,maize flour,154.99,Mean per capita consumption (g/day),Yes,
7,181,Madagascar,wheat flour,35.37,Mean per capita consumption (g/day),Yes,
8,181,Madagascar,maize flour,33.86,Mean per capita consumption (g/day),Yes,
9,207,Ghana,wheat flour,86.36,Mean per capita consumption (g/day),Yes,


In [76]:
metric_map = {
    'mean':'mean',
    'CHECK':'mean'
}

pop_denom_map = {
    'capita':'capita',
    'CHECK':'capita',
    'capita CHECK':'capita',
    'consumers':'consumers'
}

output.metric = output.metric.map(metric_map)
output.pop_denom = output.pop_denom.map(pop_denom_map)

In [77]:
output[['value_mean'] + group_cols]

Unnamed: 0,value_mean,location_id,location_name,sub_population,vehicle,metric,mass_unit,time_unit,pop_denom,source_citation,source_link,data_choice_notes
0,399.23,141,Egypt,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
1,167.21,141,Egypt,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
2,123.45,522,Sudan,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
3,5.4,522,Sudan,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2007 Nuss number (6.7 g/day) in lieu...
4,171.92,6,China,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded earlier FAO estimates for 2017 GFDx ...
5,89.32,168,Angola,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook (49...
6,154.99,168,Angola,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
7,35.37,181,Madagascar,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
8,33.86,181,Madagascar,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Only one source.
9,86.36,207,Ghana,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2017 1997-2000 fortification handboo...


In [78]:
output.loc[(output.mass_unit=="kg"),'value_mean'] = output.value_mean * 1_000

output.loc[(output.mass_unit=="kg"),'lower'] = output.lower * 1_000
output.loc[(output.mass_unit=="kg"),'upper'] = output.upper * 1_000

output.loc[(output.mass_unit=="kg"),'value_025_percentile'] = output.value_025_percentile * 1_000
output.loc[(output.mass_unit=="kg"),'value_975_percentile'] = output.value_975_percentile * 1_000

output.loc[(output.mass_unit=="kg"),'mass_unit'] = 'g'

In [79]:
output.loc[(output.time_unit=="year"),'value_mean'] = output.value_mean / 365

output.loc[(output.time_unit=="year"),'lower'] = output.lower / 365
output.loc[(output.time_unit=="year"),'upper'] = output.upper / 365

output.loc[(output.time_unit=="year"),'value_025_percentile'] = output.value_025_percentile / 365
output.loc[(output.time_unit=="year"),'value_975_percentile'] = output.value_975_percentile / 365

output.loc[(output.time_unit=="year"),'time_unit'] = 'day'

In [80]:
output[['value_mean'] + group_cols]

Unnamed: 0,value_mean,location_id,location_name,sub_population,vehicle,metric,mass_unit,time_unit,pop_denom,source_citation,source_link,data_choice_notes
0,399.23,141,Egypt,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
1,167.21,141,Egypt,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
2,123.45,522,Sudan,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
3,5.4,522,Sudan,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2007 Nuss number (6.7 g/day) in lieu...
4,171.92,6,China,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded earlier FAO estimates for 2017 GFDx ...
5,89.32,168,Angola,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook (49...
6,154.99,168,Angola,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
7,35.37,181,Madagascar,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
8,33.86,181,Madagascar,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Only one source.
9,86.36,207,Ghana,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2017 1997-2000 fortification handboo...


In [81]:
output[['value_mean','lower','upper'] + group_cols]

Unnamed: 0,value_mean,lower,upper,location_id,location_name,sub_population,vehicle,metric,mass_unit,time_unit,pop_denom,source_citation,source_link,data_choice_notes
0,399.23,259.520941,538.939059,141,Egypt,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
1,167.21,98.697088,235.722912,141,Egypt,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
2,123.45,80.24913,166.65087,522,Sudan,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
3,5.4,3.187395,7.612605,522,Sudan,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2007 Nuss number (6.7 g/day) in lieu...
4,171.92,111.757233,232.082767,6,China,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded earlier FAO estimates for 2017 GFDx ...
5,89.32,58.062797,120.577203,168,Angola,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook (49...
6,154.99,91.484132,218.495868,168,Angola,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
7,35.37,22.9924,47.7476,181,Madagascar,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
8,33.86,19.986146,47.733854,181,Madagascar,total population,maize flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Only one source.
9,86.36,56.138638,116.581362,207,Ghana,total population,wheat flour,mean,g,day,capita,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2017 1997-2000 fortification handboo...


In [82]:
output = output.rename(columns = {
    'source_citation':'mean_value_source_citation',
    'source_link':'mean_value_source_link'
})

In [83]:
assert(len(output[output.value_mean >= output.upper])==0), "check upper"
assert(len(output[output.value_mean <= output.lower])==0), "check lower"

# check for missing values

In [84]:
output.loc[(output.location_name=="Vietnam"),'location_name'] = "Viet Nam"

In [85]:
## load legal combos
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_vehicle_country_pairs.pickle', 'rb') as handle:
    vehicle_country_map = pickle.load(handle)

In [86]:
target_vehicles = [i for i in vehicle_nutrient_map.keys() if 'iron' in vehicle_nutrient_map[i]]
set([i for j in location_names for i in country_vehicle_map[j]])

{'bouillon', 'maize flour', 'oil', 'rice', 'wheat flour'}

In [87]:
# make sure nothing missing. note we're not interested in oil/iron
output.reset_index().merge(target.reset_index(), on = ['location_name','vehicle'], how = 'outer')

Unnamed: 0,index,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,value_description,value_mean,...,scale_over_mean,r,lower,upper,metric,entity,mass_unit,time_unit,pop_denom,data_choice_notes
0,0.0,141.0,Egypt,na,,unknown,wheat flour,na,Mean per capita consumption (g/day),399.23,...,,0.699893,259.520941,538.939059,mean,CHECK,g,day,capita,Discarded 1997-2000 fortification handbook num...
1,1.0,141.0,Egypt,na,,unknown,maize flour,na,Mean per capita consumption (g/day),167.21,...,,0.819483,98.697088,235.722912,mean,CHECK,g,day,capita,Discarded 1997-2000 fortification handbook num...
2,2.0,522.0,Sudan,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),123.45,...,,0.699893,80.24913,166.65087,mean,CHECK,g,day,capita,Discarded 1997-2000 fortification handbook num...
3,3.0,522.0,Sudan,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),5.4,...,,0.819483,3.187395,7.612605,mean,CHECK,g,day,capita,Discarded 2007 Nuss number (6.7 g/day) in lieu...
4,4.0,6.0,China,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),171.92,...,,0.699893,111.757233,232.082767,mean,CHECK,g,day,capita,Discarded earlier FAO estimates for 2017 GFDx ...
5,5.0,168.0,Angola,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),89.32,...,,0.699893,58.062797,120.577203,mean,CHECK,g,day,capita,Discarded 1997-2000 fortification handbook (49...
6,6.0,168.0,Angola,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),154.99,...,,0.819483,91.484132,218.495868,mean,CHECK,g,day,capita,Discarded 1997-2000 fortification handbook num...
7,7.0,181.0,Madagascar,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),35.37,...,,0.699893,22.9924,47.7476,mean,CHECK,g,day,capita,Discarded 1997-2000 fortification handbook num...
8,8.0,181.0,Madagascar,na,,mixed/both,maize flour,na,Mean per capita consumption (g/day),33.86,...,,0.819483,19.986146,47.733854,mean,CHECK,g,day,capita,Only one source.
9,9.0,207.0,Ghana,na,,mixed/both,wheat flour,na,Mean per capita consumption (g/day),86.36,...,,0.699893,56.138638,116.581362,mean,CHECK,g,day,capita,Discarded 2017 1997-2000 fortification handboo...


In [88]:
#reorder columns
output = output[['location_id', 'location_name', 'sub_population', 'vehicle', 'metric',
       'mass_unit', 'time_unit', 'pop_denom','value_mean', 'lower',
       'upper', 'mean_value_source_citation',
       'mean_value_source_link', 'data_choice_notes']]

In [89]:
output

Unnamed: 0,location_id,location_name,sub_population,vehicle,metric,mass_unit,time_unit,pop_denom,value_mean,lower,upper,mean_value_source_citation,mean_value_source_link,data_choice_notes
0,141,Egypt,total population,wheat flour,mean,g,day,capita,399.23,259.520941,538.939059,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
1,141,Egypt,total population,maize flour,mean,g,day,capita,167.21,98.697088,235.722912,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
2,522,Sudan,total population,wheat flour,mean,g,day,capita,123.45,80.24913,166.65087,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
3,522,Sudan,total population,maize flour,mean,g,day,capita,5.4,3.187395,7.612605,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2007 Nuss number (6.7 g/day) in lieu...
4,6,China,total population,wheat flour,mean,g,day,capita,171.92,111.757233,232.082767,GFDx,https://fortificationdata.org/country-fortific...,Discarded earlier FAO estimates for 2017 GFDx ...
5,168,Angola,total population,wheat flour,mean,g,day,capita,89.32,58.062797,120.577203,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook (49...
6,168,Angola,total population,maize flour,mean,g,day,capita,154.99,91.484132,218.495868,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
7,181,Madagascar,total population,wheat flour,mean,g,day,capita,35.37,22.9924,47.7476,GFDx,https://fortificationdata.org/country-fortific...,Discarded 1997-2000 fortification handbook num...
8,181,Madagascar,total population,maize flour,mean,g,day,capita,33.86,19.986146,47.733854,GFDx,https://fortificationdata.org/country-fortific...,Only one source.
9,207,Ghana,total population,wheat flour,mean,g,day,capita,86.36,56.138638,116.581362,GFDx,https://fortificationdata.org/country-fortific...,Discarded 2017 1997-2000 fortification handboo...


In [90]:
output = output.sort_values(['location_name','vehicle'])

In [91]:
save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/gday_tier5_04_02_2021.csv'
output.to_csv(save_path, index = False)

In [92]:
save_path = '/ihme/homes/beatrixh/repos/scratch/gday_tier5_04_02_2021.csv'
output.to_csv(save_path, index = False)