In [1]:
import pandas as pd, numpy as np

# Add 'industry oil' to coverage input data

In [2]:
# Load the (previously final) input coverage data sheet.
outputs_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/'
input_data_path = outputs_dir + 'lsff_input_coverage_data.csv'
input_data = pd.read_csv(input_data_path)

# Lookup from location name to location id, built from the existing sheet;
# if a name appears more than once, the last id seen wins.
loc_id_map = dict(zip(input_data.location_name, input_data.location_id))

In [3]:
# Load the (previously final) version of the input data sheet that also carries metadata.
w_meta_path = outputs_dir + 'lsff_input_coverage_data_with_metadata.csv'
w_meta = pd.read_csv(w_meta_path)

In [4]:
# Load the new numbers from Jonathan / industry.
# Spreadsheet rows 1-3 (0-based) are read together as a three-level column header.
inputs_dir = "/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/"
new_data_path = inputs_dir + "20210411_LSFF_input_data_for_IHME.xlsx"
new_data = pd.read_excel(new_data_path, header=[1, 2, 3])

In [5]:
# Index the sheet by its first column, then drill down to the oil numbers we want.
# NOTE: this rebinds new_data, so re-running the cell would index by the *next* column.
first_col = new_data.columns[0]
new_data = new_data.set_index(first_col)

baseline_coverage = new_data['Baseline coverage']
oil_coverage = baseline_coverage['Oil\n(percent of population eating specified oil)']
new_oil = oil_coverage[['Fortifiable oil.1', 'Compliance % of fortifiable']]

In [6]:
# Map the spreadsheet's column labels onto the names used in our input sheet.
# The first key is the three-level header tuple that labels the index column.
rename_oil = {
    ('Location', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2'): 'location_name',
    'Fortifiable oil.1': 'percent of population eating industrially produced vehicle',
    'Compliance % of fortifiable': 'percent of population eating fortified vehicle',
}

# Move the index back into a regular column, then apply the renames.
new_oil = new_oil.reset_index()
new_oil = new_oil.rename(columns=rename_oil)

In [7]:
# Reshape wide -> long: one row per (location, value_description) with the number in value_mean.
coverage_columns = [
    'percent of population eating industrially produced vehicle',
    'percent of population eating fortified vehicle',
]
new_oil = new_oil.melt(
    id_vars='location_name',
    value_vars=coverage_columns,
    var_name='value_description',
    value_name='value_mean',
)

In [8]:
# Fix the mis-encoded spelling of Côte d'Ivoire so it matches the name used in input_data.
new_oil['location_name'] = new_oil.location_name.replace({"CÃ´te d'Ivoire": "Côte d'Ivoire"})

In [9]:
# Make sure the new data covers exactly the same locations as the existing input sheet:
# none missing, none extra. On failure, report which names are unmatched on each side.
new_locations = set(new_oil.location_name)
existing_locations = set(input_data.location_name)
assert new_locations == existing_locations, (
    "location mismatch -- only in new data: {}; only in input data: {}".format(
        sorted(new_locations - existing_locations, key=str),
        sorted(existing_locations - new_locations, key=str),
    )
)

In [10]:
# Fill in the remaining columns of the input-sheet format for the new rows.
new_oil['vehicle'] = 'industry oil'
new_oil['nutrient'] = 'vitamin a'
new_oil['u5_applicable'] = True
new_oil['wra_applicable'] = True
# np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
new_oil['sub_population'] = np.nan
# Look up each location's id from the existing sheet; names not in the map become NaN.
new_oil['location_id'] = new_oil.location_name.map(loc_id_map)

In [11]:
# check formatting: preview the first rows to confirm the columns added above look right
new_oil.head()

Unnamed: 0,location_name,value_description,value_mean,vehicle,nutrient,u5_applicable,wra_applicable,sub_population,location_id
0,Angola,percent of population eating industrially prod...,72.32,industry oil,vitamin a,True,True,,168
1,Bangladesh,percent of population eating industrially prod...,87.516,industry oil,vitamin a,True,True,,161
2,Burkina Faso,percent of population eating industrially prod...,73.76,industry oil,vitamin a,True,True,,201
3,Côte d'Ivoire,percent of population eating industrially prod...,72.0,industry oil,vitamin a,True,True,,205
4,Cameroon,percent of population eating industrially prod...,72.0,industry oil,vitamin a,True,True,,202


In [12]:
# From the (previously final) input sheet keep only extracted data rows, i.e. those with
# estimation_status 'na' (no regression or multiplicative results). Then drop the nutrient
# column and de-duplicate, so rows repeated once per nutrient collapse into one.
# NOTE: this rebinds w_meta; re-running the cell fails because 'nutrient' is already gone.
is_extracted = w_meta.estimation_status == 'na'
w_meta = w_meta[is_extracted]
w_meta = w_meta.drop(columns='nutrient')
w_meta = w_meta.drop_duplicates()

In [13]:
# Use the ratio of interval width to mean, (97.5th - 2.5th percentile) / mean, from the
# extracted data to estimate an interval width for the new industry datapoints.
interval_width = w_meta.value_975_percentile - w_meta.value_025_percentile
w_meta['scale_over_mean'] = interval_width / w_meta.value_mean

# Exclude rows whose ratio is infinite or exactly zero.
is_inf_or_zero = (w_meta.scale_over_mean == np.inf) | (w_meta.scale_over_mean == 0)
r = w_meta.loc[~is_inf_or_zero, ['vehicle', 'scale_over_mean']]

# Overwrite the maize flour ratios with the mean ratio pooled across all remaining rows.
r_mean = r.scale_over_mean.mean()
is_maize_flour = r.vehicle == "maize flour"
r.loc[is_maize_flour, 'scale_over_mean'] = r_mean

# Collapse to one mean ratio per vehicle, stored in a column named 'r'.
r = (
    r.groupby('vehicle')
    .mean()
    .dropna()
    .rename(columns={'scale_over_mean': 'r'})
    .reset_index()
)

In [14]:
# display the per-vehicle mean scale-over-mean ratios
r

Unnamed: 0,vehicle,r
0,maize flour,0.564982
1,oil,0.379944
2,wheat flour,0.688639


In [15]:
# We'll use the oil-specific ratio. Read it from the ratio table `r` rather than pasting a
# rounded copy of the displayed value, so it keeps full precision and stays in sync if the
# underlying extracted data changes.
oil_ratio = r.loc[r.vehicle == 'oil', 'r']
assert len(oil_ratio) == 1, "expected exactly one 'oil' row in r, found {}".format(len(oil_ratio))
new_oil['r'] = oil_ratio.iloc[0]

In [16]:
# preview the first rows to confirm the ratio column 'r' was added
new_oil.head()

Unnamed: 0,location_name,value_description,value_mean,vehicle,nutrient,u5_applicable,wra_applicable,sub_population,location_id,r
0,Angola,percent of population eating industrially prod...,72.32,industry oil,vitamin a,True,True,,168,0.379944
1,Bangladesh,percent of population eating industrially prod...,87.516,industry oil,vitamin a,True,True,,161,0.379944
2,Burkina Faso,percent of population eating industrially prod...,73.76,industry oil,vitamin a,True,True,,201,0.379944
3,Côte d'Ivoire,percent of population eating industrially prod...,72.0,industry oil,vitamin a,True,True,,205,0.379944
4,Cameroon,percent of population eating industrially prod...,72.0,industry oil,vitamin a,True,True,,202,0.379944


In [18]:
# Estimated intervals: centre an interval of total width r * mean on the mean,
# then clamp both bounds to the valid percentage range [0, 100].
half_width = (new_oil.r * new_oil.value_mean) / 2
new_oil['value_025_percentile'] = (new_oil.value_mean - half_width).clip(lower=0, upper=100)
new_oil['value_975_percentile'] = (new_oil.value_mean + half_width).clip(lower=0, upper=100)

In [19]:
# Stack the new industry-oil rows under the existing input data.
# pd.concat replaces DataFrame.append (deprecated, and removed in pandas 2.0). sort=True keeps
# the alphabetically sorted column order that append produced by default here, and silences
# the sort FutureWarning.
# Notes: new_oil's helper column 'r' is carried into output (NaN for the pre-existing rows),
# and each frame keeps its own row labels, so labels repeat in the combined frame.
output = pd.concat([input_data, new_oil], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [20]:
# check out the results: the last rows of the combined frame are the newly added industry-oil rows
output.tail()

Unnamed: 0,location_id,location_name,nutrient,r,sub_population,u5_applicable,value_025_percentile,value_975_percentile,value_description,value_mean,vehicle,wra_applicable
45,196,South Africa,vitamin a,0.379944,,True,2.430084,3.569916,percent of population eating fortified vehicle,3.0,industry oil,True
46,522,Sudan,vitamin a,0.379944,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,industry oil,True
47,190,Uganda,vitamin a,0.379944,,True,44.065523,64.734477,percent of population eating fortified vehicle,54.4,industry oil,True
48,189,United Republic of Tanzania,vitamin a,0.379944,,True,43.417501,63.782499,percent of population eating fortified vehicle,53.6,industry oil,True
49,20,Viet Nam,vitamin a,0.379944,,True,56.70196,83.29804,percent of population eating fortified vehicle,70.0,industry oil,True


In [21]:
# list the columns of the combined frame
output.columns

Index(['location_id', 'location_name', 'nutrient', 'r', 'sub_population',
       'u5_applicable', 'value_025_percentile', 'value_975_percentile',
       'value_description', 'value_mean', 'vehicle', 'wra_applicable'],
      dtype='object')

In [None]:
# check for problem rows: means that fall outside, or exactly on a bound of, their uncertainty interval

In [29]:
# Rows whose mean lies outside its own [2.5th, 97.5th] percentile interval.
mean_above_upper = output.value_mean > output.value_975_percentile
mean_below_lower = output.value_mean < output.value_025_percentile
output.loc[mean_above_upper | mean_below_lower]

Unnamed: 0,location_id,location_name,nutrient,r,sub_population,u5_applicable,value_025_percentile,value_975_percentile,value_description,value_mean,vehicle,wra_applicable


In [34]:
# Rows whose mean equals the upper bound, ignoring the
# 'percent of population eating vehicle' rows.
mean_at_upper = output.value_mean == output.value_975_percentile
not_eating_vehicle = output.value_description != 'percent of population eating vehicle'
output.loc[mean_at_upper & not_eating_vehicle]

Unnamed: 0,location_id,location_name,nutrient,r,sub_population,u5_applicable,value_025_percentile,value_975_percentile,value_description,value_mean,vehicle,wra_applicable
2,179,Ethiopia,folic acid,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
3,179,Ethiopia,iron,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
4,179,Ethiopia,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
5,179,Ethiopia,zinc,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
9,179,Ethiopia,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,oil,True
13,179,Ethiopia,folic acid,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
14,179,Ethiopia,iron,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
15,179,Ethiopia,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
16,179,Ethiopia,zinc,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
56,165,Pakistan,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True


In [33]:
# Rows whose mean equals the lower bound, ignoring the
# 'percent of population eating vehicle' rows.
mean_at_lower = output.value_mean == output.value_025_percentile
not_eating_vehicle = output.value_description != 'percent of population eating vehicle'
output.loc[mean_at_lower & not_eating_vehicle]

Unnamed: 0,location_id,location_name,nutrient,r,sub_population,u5_applicable,value_025_percentile,value_975_percentile,value_description,value_mean,vehicle,wra_applicable
2,179,Ethiopia,folic acid,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
3,179,Ethiopia,iron,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
4,179,Ethiopia,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
5,179,Ethiopia,zinc,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,maize flour,True
9,179,Ethiopia,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,oil,True
13,179,Ethiopia,folic acid,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
14,179,Ethiopia,iron,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
15,179,Ethiopia,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
16,179,Ethiopia,zinc,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True
56,165,Pakistan,vitamin a,,,True,0.0,0.0,percent of population eating fortified vehicle,0.0,wheat flour,True


In [24]:
# Scratch copy of the combined sheet.
# index=False: the row labels are leftovers from the two source frames (they repeat and carry
# no meaning), and the input sheet this extends has no index column, so don't write one.
output.to_csv('/ihme/homes/beatrixh/repos/scratch/lsff_input_coverage_data.csv', index=False)

In [28]:
# Save the combined sheet next to the other data_prep outputs (same directory as outputs_dir).
save_path = outputs_dir + 'lsff_input_coverage_data_w_industry_oil_04_12_2021.csv'
# index=False: the row labels are leftovers from the two source frames (they repeat and carry
# no meaning), and the input sheet this extends has no index column, so don't write one.
output.to_csv(save_path, index=False)