In [1]:
import pandas as pd, numpy as np

In [2]:
# Extraction sheet to load; this is an absolute path under a user home directory.
path = '/ihme/homes/beatrixh/notebooks/viv_rsc/new_lsff/inputs/extraction_sheet_lsff_01_06_2021.csv'
# NOTE: the sheet marks some missing cells with the literal string 'na' (visible in later
# cell outputs). Those cells stay strings in df, so .notna() does not remove them and
# later cells filter them separately with `!= 'na'`.
df = pd.read_csv(path)

# LSFF

# Look through data by hand; choose all country-vehicle pairs for which we have all the data we want, and clean up for use.

IN: extraction sheet + vehicle-nutrient, country-vehicle dicts

ASSUME: for vehicle-locations where we have some but not all nutrients, we assign, to the remaining nutrients, the same value as the extracted nutrients if there is evidence of fortification (in GFDx), else 0

OUT: cleaned dataset for india, nigeria, cote d'ivoire, uganda, tanzania, south africa

## one country at a time, check data quality

In [3]:
check_cols = ['location_id','location_name','urbanicity','subnational_name','vehicle','value_description','nutrient','value_mean','value_025_percentile',
       'value_975_percentile','sub_population','notes','source_citation']

def filter_data(country, vehicle, val):
    """Select rows of the module-level df for one country, vehicle and value_description.

    Only rows whose value_mean and value_025_percentile are both non-null are kept,
    and only the columns listed in check_cols are returned.
    """
    matches_request = (
        (df.location_name == country)
        & (df.vehicle == vehicle)
        & (df.value_description == val)
    )
    has_estimate = df.value_mean.notna() & df.value_025_percentile.notna()

    return df.loc[matches_request & has_estimate, check_cols]

In [4]:
def check_one_country(country):
    """Collect filter_data output for every vehicle / coverage-measure pair of one country.

    Results are stacked vehicle by vehicle, with the three coverage measures
    in a fixed order inside each vehicle.
    """
    vehicles = ['oil', 'wheat flour', 'salt', 'maize flour', 'rice', 'bouillon']
    values_gold = ['percent of population eating fortified vehicle',
               'percent of population eating industrially produced vehicle',
               'percent of population eating vehicle']

    pieces = []
    for vehicle in vehicles:
        for val in values_gold:
            pieces.append(filter_data(country, vehicle, val))
    return pd.concat(pieces)

In [5]:
usecols = ['location_id','location_name','subnational_name','vehicle','value_description','nutrient','value_mean', 'value_025_percentile',
       'value_975_percentile']
subset_data = {}

In [6]:
india = check_one_country('India')
india = india[india.vehicle!='salt'] #missing CIs
subset_data['India'] = india[usecols]

In [7]:
nigeria = check_one_country('Nigeria')
nigeria = nigeria[~nigeria.vehicle.isin(['salt','bouillon'])] #missing proportion of the population eating salt. missing lots for bouillon

subset_data['Nigeria'] = nigeria[usecols]

In [8]:
pakistan = check_one_country('Pakistan')
pakistan.vehicle.unique()
# for oil and wheat flour: missing percent of population eating fortified vehicle

array(['oil', 'wheat flour'], dtype=object)

In [9]:
# check_one_country("Burkina Faso")
# missing almost everything

In [10]:
cotedivoire = check_one_country("CÃ´te d'Ivoire")
cotedivoire = cotedivoire[cotedivoire.vehicle!="wheat flour"]

subset_data["CÃ´te d'Ivoire"] = cotedivoire[usecols]

In [11]:
uganda = check_one_country("Uganda")
uganda = uganda.loc[(uganda.source_citation!='GFDx') & (uganda.value_025_percentile!='na'),usecols] #duplicate entries
subset_data['Uganda'] = uganda

In [12]:
tanzania = check_one_country('United Republic of Tanzania')

tanzania = tanzania[tanzania.source_citation=="Grant J Aaron, Valerie M Friesen, Svenja Jungjohann, Greg S Garrett, Lynnette M Neufeld, Mark Myatt, Coverage of Large-Scale Food Fortification of Edible Oil, Wheat Flour, and Maize Flour Varies Greatly by Vehicle and Country but Is Consistently Lower among the Most Vulnerable: Results from Coverage Surveys in 8 Countries, The Journal of Nutrition, Volume 147, Issue 5, May 2017, Pages 984S–994S, https://doi.org/10.3945/jn.116.245770"]
subset_data['United Republic of Tanzania'] = tanzania

In [13]:
# nepal = check_one_country('Nepal') #missing percent pop eating fortified wheat, + everything else for other vehicles

In [14]:
safrica = check_one_country('South Africa')

subset_data['South Africa'] = safrica

## for usable country-vehicle pairs, subset to a minimal set & clean

## clean nutrient names

In [15]:
all_data = pd.concat(list(subset_data.values()))

In [16]:
all_data.nutrient.unique()

array(['vitamin a',
       'd, ergocalciferol-D2, cholecalciferol-D3, alfacalcidol', 'na',
       'iron', 'folic acid, folate, b9', 'b12', 'zinc', 'b1, thiamine',
       'b2, riboflavin', 'b3, niacin', 'b6, pyridoxine'], dtype=object)

In [17]:
nutrient_map = {'d, ergocalciferol-D2, cholecalciferol-D3, alfacalcidol':'vitamin d',
                'folic acid, folate, b9':'folic acid',
                'b12':'vitamin b12',
                'b2, riboflavin':'vitamin b2',
                'b3, niacin':'vitamin b3',
                'b1, thiamine':'vitamin b1',
                'b6, pyridoxine':'vitamin b6',
                'vitamin a':'vitamin a',
                'iron':'iron',
                'zinc':'zinc'
               }

In [18]:
# Replace the extraction sheet's nutrient labels with the canonical names in nutrient_map.
# The frames stored in subset_data were produced by slicing, so the relabelled column is
# written into a fresh copy that replaces the dict entry (no SettingWithCopyWarning).
# Top-level variables that alias the old frames are left untouched.
canonical_nutrients = set(nutrient_map.values())
for country in list(subset_data.keys()):
    relabelled = subset_data[country].copy()
    raw_nutrient = relabelled['nutrient']
    # Labels that are already canonical are kept as they are, so running this cell a
    # second time is a no-op instead of turning them into NaN. Anything else that is
    # missing from nutrient_map (e.g. 'na') still becomes NaN, as before.
    relabelled['nutrient'] = raw_nutrient.where(raw_nutrient.isin(canonical_nutrients),
                                                raw_nutrient.map(nutrient_map))
    subset_data[country] = relabelled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
subset_data.keys()

dict_keys(['India', 'Nigeria', "CÃ´te d'Ivoire", 'Uganda', 'United Republic of Tanzania', 'South Africa'])

# Format and prep per subnational

In [20]:
final_data = {}

## India

In [21]:
subset_data['India']

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,163.0,India,Rajasthan,oil,percent of population eating fortified vehicle,vitamin a,24.3,21.1,27.9
481,163.0,India,Rajasthan,oil,percent of population eating fortified vehicle,vitamin d,24.3,21.1,27.9
1,163.0,India,Rajasthan,oil,percent of population eating industrially prod...,,89.4,87.0,91.8
2,163.0,India,Rajasthan,oil,percent of population eating vehicle,,100.0,100.0,100.0
3,163.0,India,Rajasthan,wheat flour,percent of population eating fortified vehicle,iron,6.3,4.8,7.9
27,163.0,India,Rajasthan,wheat flour,percent of population eating fortified vehicle,folic acid,6.3,4.8,7.9
28,163.0,India,Rajasthan,wheat flour,percent of population eating fortified vehicle,vitamin b12,6.3,4.8,7.9
4,163.0,India,Rajasthan,wheat flour,percent of population eating industrially prod...,,7.1,5.6,9.1
26,163.0,India,Rajasthan,wheat flour,percent of population eating vehicle,,83.2,79.5,86.5


In [22]:
k = 3
additional_india = pd.DataFrame({'location_id':[163]*k,
                           'location_name':["India"]*k,
                           'vehicle':['wheat flour']*k,
                           'value_description':['percent of population eating fortified vehicle']*k,
                           'nutrient':['zinc','vitamin b1','vitamin a'],
                           'value_mean':[6.3]*k,
                           'value_025_percentile':[4.8]*k,
                           'value_975_percentile':[7.9]*k})
additional_india

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,163,India,wheat flour,percent of population eating fortified vehicle,zinc,6.3,4.8,7.9
1,163,India,wheat flour,percent of population eating fortified vehicle,vitamin b1,6.3,4.8,7.9
2,163,India,wheat flour,percent of population eating fortified vehicle,vitamin a,6.3,4.8,7.9


In [23]:
def add_additional(df1, df2):
    """Stack df2 under df1 and order the rows by location, vehicle, value description and nutrient.

    The original row labels of both frames are kept.
    """
    sort_keys = ['location_name', 'vehicle', 'value_description', 'nutrient']
    stacked = pd.concat([df1, df2])
    return stacked.sort_values(sort_keys)

In [24]:
final_data["India"] = add_additional(subset_data['India'], additional_india)

In [25]:
final_data["India"]

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,163.0,India,Rajasthan,oil,percent of population eating fortified vehicle,vitamin a,24.3,21.1,27.9
481,163.0,India,Rajasthan,oil,percent of population eating fortified vehicle,vitamin d,24.3,21.1,27.9
1,163.0,India,Rajasthan,oil,percent of population eating industrially prod...,,89.4,87.0,91.8
2,163.0,India,Rajasthan,oil,percent of population eating vehicle,,100.0,100.0,100.0
27,163.0,India,Rajasthan,wheat flour,percent of population eating fortified vehicle,folic acid,6.3,4.8,7.9
3,163.0,India,Rajasthan,wheat flour,percent of population eating fortified vehicle,iron,6.3,4.8,7.9
2,163.0,India,,wheat flour,percent of population eating fortified vehicle,vitamin a,6.3,4.8,7.9
1,163.0,India,,wheat flour,percent of population eating fortified vehicle,vitamin b1,6.3,4.8,7.9
28,163.0,India,Rajasthan,wheat flour,percent of population eating fortified vehicle,vitamin b12,6.3,4.8,7.9
0,163.0,India,,wheat flour,percent of population eating fortified vehicle,zinc,6.3,4.8,7.9


## Nigeria

In [26]:
nigeria1 = subset_data['Nigeria']

In [27]:
df.columns

Index(['location_id', 'location_name', 'subnational_name',
       'subnational_location_id', 'urbanicity', 'vehicle', 'nutrient',
       'nutrient_compound', 'nutrient_mass_ppm', 'fortification_standards',
       'compliance', 'compliance_source', 'value_description',
       'fortification_status', 'value_mean', 'value_025_percentile',
       'value_975_percentile', 'sub_population', 'source_citation',
       'source_link', 'source_year', 'source_type', 'source_additional',
       'notes', 'user', 'date_recorded', 'Unnamed: 26'],
      dtype='object')

In [28]:
def double_check_data(country, vehicle, val):
    """Show every row of the module-level df for one country, vehicle and value_description.

    No missing-value filtering is applied here, so rows with empty or 'na' estimates
    are included in the result.
    """
    shown_cols = ['vehicle','value_description','nutrient','value_mean','value_025_percentile','value_975_percentile']
    wanted = (df.location_name==country) & (df.vehicle==vehicle) & (df.value_description==val)
    return df.loc[wanted, shown_cols]

In [29]:
# double check that we have no data on nigeria oil fortified with vitamin d
double_check_data("Nigeria","oil","percent of population eating fortified vehicle")

Unnamed: 0,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
44,oil,percent of population eating fortified vehicle,"d, ergocalciferol-D2, cholecalciferol-D3, alfa...",na,na,na
66,oil,percent of population eating fortified vehicle,vitamin a,7.2,5.5,8.9
87,oil,percent of population eating fortified vehicle,vitamin a,7.6,5.9,9.4
97,oil,percent of population eating fortified vehicle,"d, ergocalciferol-D2, cholecalciferol-D3, alfa...",na,na,na


In [30]:
# zero out nigeria oil fortified with vitamin d (mean and lower bound 0, upper bound 5)
# nigeria1 comes from a sliced frame; work on an independent copy so the assignment
# below does not raise SettingWithCopyWarning.
nigeria1 = nigeria1.copy()
# Restrict the override to oil, as the comment above says; without the vehicle
# condition a vitamin d row for any other vehicle would be overwritten too.
is_oil_vitamin_d = ((nigeria1.vehicle=="oil") &
                    (nigeria1.value_description=="percent of population eating fortified vehicle") &
                    (nigeria1.nutrient=='vitamin d'))
nigeria1.loc[is_oil_vitamin_d, ['value_mean','value_025_percentile','value_975_percentile']] = [0,0,5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [31]:
# double check we have no nigeria/wheat/b12 info
double_check_data("Nigeria","wheat flour","percent of population eating fortified vehicle")

Unnamed: 0,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
67,wheat flour,percent of population eating fortified vehicle,zinc,5.4,3.8,6.9
70,wheat flour,percent of population eating fortified vehicle,iron,5.4,3.8,6.9
71,wheat flour,percent of population eating fortified vehicle,"folic acid, folate, b9",5.4,3.8,6.9
72,wheat flour,percent of population eating fortified vehicle,vitamin a,5.4,3.8,6.9
73,wheat flour,percent of population eating fortified vehicle,"b1, thiamine",5.4,3.8,6.9
98,wheat flour,percent of population eating fortified vehicle,iron,22.7,20.0,25.5
99,wheat flour,percent of population eating fortified vehicle,"folic acid, folate, b9",22.7,20.0,25.5
100,wheat flour,percent of population eating fortified vehicle,vitamin a,22.7,20.0,25.5
101,wheat flour,percent of population eating fortified vehicle,zinc,22.7,20.0,25.5
102,wheat flour,percent of population eating fortified vehicle,"b1, thiamine",22.7,20.0,25.5


In [32]:
#add wheat/b12 info
k = 2
additional_nigeria0 = pd.DataFrame({'location_id':[214]*k,
                                   'location_name':["Nigeria"]*k,
                                   'subnational_name':['lagos','kano'],
                                   'vehicle':['wheat flour']*k,
                           'value_description':['percent of population eating fortified vehicle']*k,
                           'nutrient':['vitamin b12']*k,
                           'value_mean':[5.4,22.7],
                           'value_025_percentile':[3.8,20],
                           'value_975_percentile':[6.9,25.5]})
additional_nigeria0

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,214,Nigeria,lagos,wheat flour,percent of population eating fortified vehicle,vitamin b12,5.4,3.8,6.9
1,214,Nigeria,kano,wheat flour,percent of population eating fortified vehicle,vitamin b12,22.7,20.0,25.5


In [33]:
# double check we have no nigeria/maize/b12 info
double_check_data("Nigeria","maize flour","percent of population eating fortified vehicle")

Unnamed: 0,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
78,maize flour,percent of population eating fortified vehicle,"folic acid, folate, b9",0.2,0.0,0.5
79,maize flour,percent of population eating fortified vehicle,vitamin a,0.2,0.0,0.5
80,maize flour,percent of population eating fortified vehicle,zinc,0.2,0.0,0.5
105,maize flour,percent of population eating fortified vehicle,"folic acid, folate, b9",1.7,0.9,2.6
106,maize flour,percent of population eating fortified vehicle,vitamin a,1.7,0.9,2.6
107,maize flour,percent of population eating fortified vehicle,zinc,1.7,0.9,2.6


In [34]:
#add maize nutrient info
# Builds one row per (subnational, nutrient) pair for the three nutrients that have no
# extracted maize-flour rows, reusing the maize-flour estimates shown in the check above.
i = 2  # number of subnational locations (lagos, kano)
j = 3  # number of nutrients being added
k = i*j  # total rows
additional_nigeria1 = pd.DataFrame({'location_id':[214]*k,
                                   'location_name':["Nigeria"]*k,
                                   'subnational_name':['lagos','kano']*j,  # repeats with period 2
                                   'vehicle':['maize flour']*k,
                           'value_description':['percent of population eating fortified vehicle']*k,
                           # repeats with period 3; because the periods 2 and 3 share no common
                           # factor, the six rows cover every (subnational, nutrient) pair once
                           'nutrient':['iron','vitamin b1','vitamin b12']*i,
                           # value lists repeat with period 2, so they stay aligned with subnational_name
                           'value_mean':[0.2,1.7]*j,
                           'value_025_percentile':[0,0.9]*j,
                           'value_975_percentile':[0.5,2.6]*j})
additional_nigeria1

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,214,Nigeria,lagos,maize flour,percent of population eating fortified vehicle,iron,0.2,0.0,0.5
1,214,Nigeria,kano,maize flour,percent of population eating fortified vehicle,vitamin b1,1.7,0.9,2.6
2,214,Nigeria,lagos,maize flour,percent of population eating fortified vehicle,vitamin b12,0.2,0.0,0.5
3,214,Nigeria,kano,maize flour,percent of population eating fortified vehicle,iron,1.7,0.9,2.6
4,214,Nigeria,lagos,maize flour,percent of population eating fortified vehicle,vitamin b1,0.2,0.0,0.5
5,214,Nigeria,kano,maize flour,percent of population eating fortified vehicle,vitamin b12,1.7,0.9,2.6


In [35]:
# look closer at rice data we have; seems incongruous
df.loc[(df.location_name=="Nigeria") & (df.vehicle=="rice"),['subnational_name','value_description','value_mean','source_link','source_year']]

Unnamed: 0,subnational_name,value_description,value_mean,source_link,source_year
124,na,,,,
457,na,percent of vehicle that is industrially produced,10,https://fortificationdata.org/chart-quantity-a...,2017
494,na,percent of vehicle that is industrially produced,na,https://issuu.com/sight_and_life/docs/sightand...,2013
495,na,percent of population eating vehicle,14.9,https://issuu.com/sight_and_life/docs/sightand...,2001-2003
508,Ebonyi,percent of population eating industrially prod...,100,https://www.gainhealth.org/sites/default/files...,2018
509,Sokoto,percent of population eating industrially prod...,99,https://www.gainhealth.org/sites/default/files...,2018


In [36]:
nigeria2 = add_additional(nigeria1[nigeria1.vehicle!='rice'], pd.concat([additional_nigeria0, additional_nigeria1]))
nigeria2.loc[nigeria2.vehicle!='rice','subnational_weight'] = 'by pop, from year 2015'

In [37]:
final_data["Nigeria"] = nigeria2

## CÃ´te d'Ivoire

In [38]:
cotedivoire1 = subset_data["CÃ´te d'Ivoire"]
cotedivoire1

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
94,205.0,CÃ´te d'Ivoire,na,oil,percent of population eating fortified vehicle,vitamin a,70.0,na,na
95,205.0,CÃ´te d'Ivoire,na,oil,percent of population eating fortified vehicle,vitamin a,75.0,na,na
600,205.0,CÃ´te d'Ivoire,Abidjan,oil,percent of population eating fortified vehicle,vitamin a,98.0,97,99
599,205.0,CÃ´te d'Ivoire,Abidjan,oil,percent of population eating industrially prod...,,98.0,97,99
598,205.0,CÃ´te d'Ivoire,Abidjan,oil,percent of population eating vehicle,,98.5,97.5,99.3


In [39]:
cotedivoire1 = cotedivoire1[cotedivoire1.value_025_percentile!='na']

In [40]:
# make sure no data on vitamin d
double_check_data("CÃ´te d'Ivoire","oil","percent of population eating fortified vehicle")

Unnamed: 0,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
94,oil,percent of population eating fortified vehicle,vitamin a,70,na,na
95,oil,percent of population eating fortified vehicle,vitamin a,75,na,na
600,oil,percent of population eating fortified vehicle,vitamin a,98,97,99


In [41]:
k = 1
additional_cotedivoire = pd.DataFrame({'location_id':[205]*k,
                           'location_name':["CÃ´te d'Ivoire"]*k,
                           'vehicle':['oil']*k,
                           'value_description':['percent of population eating fortified vehicle']*k,
                           'nutrient':['vitamin d'],
                           'value_mean':[0]*k,
                           'value_025_percentile':[0]*k,
                           'value_975_percentile':[5]*k})
additional_cotedivoire

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,205,CÃ´te d'Ivoire,oil,percent of population eating fortified vehicle,vitamin d,0,0,5


In [42]:
cotedivoire2 = add_additional(cotedivoire1, additional_cotedivoire)

In [43]:
final_data["CÃ´te d'Ivoire"] = cotedivoire2

## Uganda

In [44]:
subset_data['Uganda']

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
269,190.0,Uganda,na,oil,percent of population eating fortified vehicle,vitamin a,54.4,48.3,60.4
272,190.0,Uganda,na,oil,percent of population eating industrially prod...,,89.0,84.7,93.2
271,190.0,Uganda,na,oil,percent of population eating vehicle,,89.9,85.9,94.0
300,190.0,Uganda,na,wheat flour,percent of population eating fortified vehicle,folic acid,8.5,5.7,11.4
301,190.0,Uganda,na,wheat flour,percent of population eating fortified vehicle,iron,8.5,5.7,11.4
302,190.0,Uganda,na,wheat flour,percent of population eating fortified vehicle,vitamin a,8.5,5.7,11.4
303,190.0,Uganda,na,wheat flour,percent of population eating fortified vehicle,vitamin b1,8.5,5.7,11.4
304,190.0,Uganda,na,wheat flour,percent of population eating fortified vehicle,vitamin b12,8.5,5.7,11.4
305,190.0,Uganda,na,wheat flour,percent of population eating fortified vehicle,zinc,8.5,5.7,11.4
298,190.0,Uganda,na,wheat flour,percent of population eating industrially prod...,,10.6,7.6,13.6


In [45]:
double_check_data("Uganda","oil","percent of population eating fortified vehicle")

Unnamed: 0,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
242,oil,percent of population eating fortified vehicle,vitamin a,54.4,48.3,60.4
244,oil,percent of population eating fortified vehicle,"d, ergocalciferol-D2, cholecalciferol-D3, alfa...",,,
269,oil,percent of population eating fortified vehicle,vitamin a,54.4,48.3,60.4
270,oil,percent of population eating fortified vehicle,"d, ergocalciferol-D2, cholecalciferol-D3, alfa...",0.0,,


In [46]:
# double checked above paper, didn't find any 0 (under uganda / vitamin D, said "--")

In [47]:
k = 1
additional_uganda = pd.DataFrame({'location_id':[190]*k,
                           'location_name':["Uganda"]*k,
                           'vehicle':['oil']*k,
                           'value_description':['percent of population eating fortified vehicle']*k,
                           'nutrient':['vitamin d'],
                           'value_mean':[0]*k,
                           'value_025_percentile':[0]*k,
                           'value_975_percentile':[5]*k})
additional_uganda

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,190,Uganda,oil,percent of population eating fortified vehicle,vitamin d,0,0,5


In [48]:
uganda1 = add_additional(subset_data['Uganda'], additional_uganda)
uganda1

Unnamed: 0,location_id,location_name,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
275,190.0,Uganda,na,maize flour,percent of population eating fortified vehicle,folic acid,6.5,3.3,9.7
169,190.0,Uganda,na,maize flour,percent of population eating fortified vehicle,iron,6.5,3.3,9.7
296,190.0,Uganda,na,maize flour,percent of population eating fortified vehicle,vitamin a,6.5,3.3,9.7
294,190.0,Uganda,na,maize flour,percent of population eating fortified vehicle,vitamin b1,6.5,3.3,9.7
295,190.0,Uganda,na,maize flour,percent of population eating fortified vehicle,vitamin b12,6.5,3.3,9.7
293,190.0,Uganda,na,maize flour,percent of population eating fortified vehicle,zinc,6.5,3.3,9.7
273,190.0,Uganda,na,maize flour,percent of population eating industrially prod...,,42.4,32.7,52.1
274,190.0,Uganda,na,maize flour,percent of population eating vehicle,,91.8,87.7,96.0
269,190.0,Uganda,na,oil,percent of population eating fortified vehicle,vitamin a,54.4,48.3,60.4
0,190.0,Uganda,,oil,percent of population eating fortified vehicle,vitamin d,0.0,0.0,5.0


In [49]:
final_data["Uganda"] = uganda1

## Tanzania

In [50]:
subset_data['United Republic of Tanzania']

Unnamed: 0,location_id,location_name,urbanicity,subnational_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,sub_population,notes,source_citation
213,189.0,United Republic of Tanzania,mixed/both,na,oil,percent of population eating fortified vehicle,vitamin a,53.6,46.4,60.8,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
214,189.0,United Republic of Tanzania,mixed/both,na,oil,percent of population eating industrially prod...,,92.6,89.0,96.3,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
215,189.0,United Republic of Tanzania,mixed/both,na,oil,percent of population eating vehicle,,96.2,93.2,99.2,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
233,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating fortified vehicle,folic acid,33.1,27.5,38.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
234,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating fortified vehicle,iron,33.1,27.5,38.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
235,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating fortified vehicle,vitamin a,33.1,27.5,38.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
236,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating fortified vehicle,vitamin b1,33.1,27.5,38.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
237,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating fortified vehicle,vitamin b12,33.1,27.5,38.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
238,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating fortified vehicle,zinc,33.1,27.5,38.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."
225,189.0,United Republic of Tanzania,mixed/both,na,wheat flour,percent of population eating industrially prod...,,50.5,43.3,57.7,total population,,"Grant J Aaron, Valerie M Friesen, Svenja Jungj..."


In [51]:
# see if we have any info on vitamin d
double_check_data("United Republic of Tanzania","oil","percent of population eating fortified vehicle")

Unnamed: 0,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
213,oil,percent of population eating fortified vehicle,vitamin a,53.6,46.4,60.8
216,oil,percent of population eating fortified vehicle,"d, ergocalciferol-D2, cholecalciferol-D3, alfa...",0.0,,
249,oil,percent of population eating fortified vehicle,vitamin a,53.6,na,na


In [52]:
# double checked above paper, didn't find any 0 (under uganda / vitamin D, said "--")

In [53]:
k = 1
additional_tanzania = pd.DataFrame({'location_id':[189]*k,
                           'location_name':["United Republic of Tanzania"]*k,
                           'vehicle':['oil']*k,
                           'value_description':['percent of population eating fortified vehicle']*k,
                           'nutrient':['vitamin d'],
                           'value_mean':[0]*k,
                           'value_025_percentile':[0]*k,
                           'value_975_percentile':[5]*k})
additional_tanzania

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,189,United Republic of Tanzania,oil,percent of population eating fortified vehicle,vitamin d,0,0,5


In [54]:
tanzania1 = add_additional(subset_data['United Republic of Tanzania'], additional_tanzania)
final_data['United Republic of Tanzania'] = tanzania1

## South Africa

In [55]:
safrica1 = subset_data['South Africa']

In [56]:
# beatrix extracted for multiple nutrient_mass_ppms
# Remove the repeated wheat-flour rows outright. Assigning a de-duplicated frame back
# through a boolean mask re-aligns on the index, which leaves the duplicate rows in the
# frame as all-NaN rows instead of dropping them.
# A fully duplicated row necessarily shares its vehicle with the row it repeats, so
# flagging duplicates on the whole frame and keeping only the wheat-flour ones matches
# de-duplicating the wheat-flour subset.
is_wheat = safrica1.vehicle=="wheat flour"
safrica1 = safrica1[~(is_wheat & safrica1.duplicated())]

In [57]:
# we don't care about b2, b6, b3 (beatrix you did this to yourself..)
df.loc[(df.location_name=="South Africa") & (df.vehicle=="maize flour") & (df.value_description=="percent of population eating fortified vehicle"),
       ['subnational_name','nutrient_mass_ppm','nutrient','value_mean','notes']].sort_values(by=['nutrient','subnational_name'])

Unnamed: 0,subnational_name,nutrient_mass_ppm,nutrient,value_mean,notes
745,Eastern Cape,2.2,"b1, thiamine",16.3,"these are 95%, not 97.5% CIs! survey pop = hou..."
756,Gauteng,2.2,"b1, thiamine",77.4,"these are 95%, not 97.5% CIs! survey pop = hou..."
749,Eastern Cape,,b12,,"these are 95%, not 97.5% CIs! survey pop = hou..."
760,Gauteng,,b12,,"these are 95%, not 97.5% CIs! survey pop = hou..."
746,Eastern Cape,1.7,"b2, riboflavin",16.3,"these are 95%, not 97.5% CIs! survey pop = hou..."
757,Gauteng,1.7,"b2, riboflavin",77.4,"these are 95%, not 97.5% CIs! survey pop = hou..."
747,Eastern Cape,25.0,"b3, niacin",16.3,"these are 95%, not 97.5% CIs! survey pop = hou..."
758,Gauteng,25.0,"b3, niacin",77.4,"these are 95%, not 97.5% CIs! survey pop = hou..."
748,Eastern Cape,3.1,"b6, pyridoxine",16.3,"these are 95%, not 97.5% CIs! survey pop = hou..."
759,Gauteng,3.1,"b6, pyridoxine",77.4,"these are 95%, not 97.5% CIs! survey pop = hou..."


In [58]:
# shouldn't have extracted these
# Both conditions are built from safrica1 itself, so the mask always has the same index
# as the frame it filters (the previous version read value_mean from `safrica`, which
# only lined up while the two names referred to the same object).
unwanted_nutrient = safrica1.nutrient.isin(['vitamin b6', 'vitamin b2','vitamin b3'])
safrica1 = safrica1[~unwanted_nutrient & safrica1.value_mean.notna()]

In [59]:
# assuming maize/b12, wheat/b12 == 0
k = 2
additional_safrica = pd.DataFrame({'location_id':[196]*k,
                           'location_name':["South Africa"]*k,
                           'vehicle':['maize flour', 'wheat flour'],
                           'value_description':['percent of population eating fortified vehicle']*k,
                           'nutrient':['vitamin b12']*k, 
                           'value_mean':[0]*k,
                           'value_025_percentile':[0]*k,
                           'value_975_percentile':[5]*k})
additional_safrica

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,196,South Africa,maize flour,percent of population eating fortified vehicle,vitamin b12,0,0,5
1,196,South Africa,wheat flour,percent of population eating fortified vehicle,vitamin b12,0,0,5


In [60]:
safrica2 = add_additional(safrica1, additional_safrica)
final_data['South Africa'] = safrica2

In [61]:
pd.concat(list(final_data.values()))[['location_name','subnational_name']].drop_duplicates()

Unnamed: 0,location_name,subnational_name
0,India,Rajasthan
2,India,
78,Nigeria,lagos
105,Nigeria,kano
600,CÃ´te d'Ivoire,Abidjan
0,CÃ´te d'Ivoire,
275,Uganda,na
0,Uganda,
219,United Republic of Tanzania,na
0,United Republic of Tanzania,


# weight by subnational

In [62]:
def weighted_average(country, loc_a, loc_b, weight_a, weight_b):
    """Collapse two subnational estimates into one row per indicator.

    Rows whose ``subnational_name`` is ``loc_a`` or ``loc_b`` are multiplied by
    ``weight_a`` / ``weight_b`` and summed within each
    (vehicle, value_description, nutrient) group. The same weighting is applied
    to the mean and to both percentile columns. Rows with a missing
    ``subnational_name`` are passed through unchanged.

    Parameters
    ----------
    country : pandas.DataFrame
        Must contain location_name, subnational_name, vehicle,
        value_description, nutrient, value_mean, value_025_percentile and
        value_975_percentile. The frame is not modified.
    loc_a, loc_b : str
        The two ``subnational_name`` values to combine.
    weight_a, weight_b : float
        Weights for ``loc_a`` and ``loc_b`` (the callers in this notebook pass
        population shares).

    Returns
    -------
    pandas.DataFrame
        Columns location_name, vehicle, value_description, nutrient and the
        three value columns, sorted by vehicle, value_description, nutrient.
        location_name is set to the first location_name found in ``country``.
        A missing nutrient on a combined row is reported as the string 'na';
        passed-through rows keep whatever nutrient value they had.

    Raises
    ------
    AssertionError
        If subnational_name holds anything other than NaN, loc_a or loc_b, or
        if the combined groups do not all contain the same number of rows.
    """
    key_cols = ['vehicle', 'value_description', 'nutrient']
    value_cols = ['value_mean', 'value_025_percentile', 'value_975_percentile']

    #subset to cols of interest
    data = country[['location_name', 'subnational_name'] + key_cols + value_cols]
    country_name = data.location_name.unique()[0]

    #hold out vars not split into subnats
    no_subnationals = data[data.subnational_name.isna()].drop(columns='subnational_name')
    # explicit copy: the rows are edited below, and writing into a slice of
    # `data` is what triggered SettingWithCopyWarning on every assignment
    subnats = data[data.subnational_name.isin([loc_a, loc_b])].copy()

    assert (len(no_subnationals)+len(subnats) == len(data)), "unexpected vals in subnational_name column"

    #weight: one weight per row, applied positionally so a non-unique index cannot misalign it
    weights = np.where(subnats.subnational_name == loc_a, weight_a, weight_b)
    for col in value_cols:
        subnats[col] = subnats[col].astype(float).values * weights

    #make sure there are the same number (2) per group
    # (rows with a missing nutrient are not covered: groupby skips NaN keys)
    group_sizes = subnats.groupby(key_cols).subnational_name.count()
    assert group_sizes.nunique() == 1, "subnational rows are not evenly matched across groups"

    #make sure there is an actual val to group on for rows that dont use nutrient
    subnats['nutrient'] = subnats.nutrient.fillna('na')

    #average: sum only the weighted value columns, so the text columns are never
    #aggregated (their treatment by groupby().sum() differs between pandas versions)
    combined = subnats.groupby(key_cols)[value_cols].sum().reset_index()

    output = pd.concat([no_subnationals, combined]).sort_values(key_cols)
    output['location_name'] = country_name

    return output

In [63]:
## combine gauteng and eastern cape

In [64]:
# Location ids relevant to the South Africa weighting:
# south africa location_id = 196
# gauteng location_id = 484
# eastern cape location_id = 482

# Query kept for provenance (presumably the source of the constants below — confirm):
# from db_queries import get_population as go
# go(age_group_id = 22, location_id = [484,482], year_id = 2015, sex_id = 3, gbd_round_id = 7, decomp_step = 'iterative')

# hard-coded 2015 population values; used below to build the subnational weights
gauteng_pop_2015 = 1.340526e+07
easterncape_pop_2015 = 6.468681e+06

In [65]:
# Each province is weighted by its share of the two provinces' combined 2015 population.
loc_a, loc_b = "Eastern Cape", "Gauteng"
weight_a = easterncape_pop_2015 / (gauteng_pop_2015 + easterncape_pop_2015)
weight_b = gauteng_pop_2015 / (gauteng_pop_2015 + easterncape_pop_2015)

sa_reweighted = weighted_average(
    final_data['South Africa'],
    loc_a,
    loc_b,
    weight_a,
    weight_b,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://

In [66]:
# weighted_average does not return location_id, so attach the South Africa id (196) here, then display
sa_reweighted['location_id'] = 196
sa_reweighted

Unnamed: 0,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,location_id
0,South Africa,maize flour,percent of population eating fortified vehicle,folic acid,57.5128,50.336,71.7254,196
1,South Africa,maize flour,percent of population eating fortified vehicle,iron,57.5128,50.336,71.7254,196
2,South Africa,maize flour,percent of population eating fortified vehicle,vitamin a,57.5128,50.336,71.7254,196
3,South Africa,maize flour,percent of population eating fortified vehicle,vitamin b1,57.5128,50.336,71.7254,196
0,South Africa,maize flour,percent of population eating fortified vehicle,vitamin b12,0.0,0.0,5.0,196
4,South Africa,maize flour,percent of population eating fortified vehicle,zinc,57.5128,50.336,71.7254,196
5,South Africa,maize flour,percent of population eating industrially prod...,na,96.4741,92.318,98.9208,196
6,South Africa,maize flour,percent of population eating vehicle,na,96.609,92.3855,99.0557,196
7,South Africa,wheat flour,percent of population eating fortified vehicle,folic acid,5.84503,3.25486,9.26539,196
8,South Africa,wheat flour,percent of population eating fortified vehicle,iron,5.84503,3.25486,9.26539,196


In [67]:
# Replace the subnational-level entry with the combined rows.
# NOTE: this overwrites the input of the weighted_average call above; per the display above the
# combined frame has no subnational_name column, so that call cannot be re-run without
# rebuilding final_data['South Africa'] first.
final_data['South Africa'] = sa_reweighted

In [68]:
## combine kano and lagos

In [69]:
# Location ids relevant to the Nigeria weighting:
# nigeria location_id = 214
# kano location_id = 25337
# lagos location_id = 25342

# Query kept for provenance (presumably the source of the constants below — confirm):
# from db_queries import get_population as go
# go(age_group_id = 22, location_id = [25337,25342], year_id = 2015, sex_id = 3, gbd_round_id = 7, decomp_step = 'iterative')

# hard-coded 2015 population values; used below to build the subnational weights
kano_pop_2015 = 1.389539e+07
lagos_pop_2015 = 7.192101e+06

In [70]:
# list the subnational labels present for Nigeria; their exact spelling is what loc_a/loc_b must match
final_data["Nigeria"].subnational_name.unique()

array(['lagos', 'kano'], dtype=object)

In [71]:
# Each subnational unit is weighted by its share of the two units' combined 2015 population.
loc_a, loc_b = "kano", "lagos"
weight_a = kano_pop_2015 / (kano_pop_2015 + lagos_pop_2015)
weight_b = lagos_pop_2015 / (kano_pop_2015 + lagos_pop_2015)

nigeria_reweighted = weighted_average(
    final_data['Nigeria'],
    loc_a,
    loc_b,
    weight_a,
    weight_b,
)

In [72]:
# row counts per (vehicle, value_description, nutrient) after combining; each should be 1
nigeria_reweighted.groupby(['vehicle','value_description','nutrient']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,location_name,value_mean,value_025_percentile,value_975_percentile
vehicle,value_description,nutrient,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
maize flour,percent of population eating fortified vehicle,folic acid,1,1,1,1
maize flour,percent of population eating fortified vehicle,iron,1,1,1,1
maize flour,percent of population eating fortified vehicle,vitamin a,1,1,1,1
maize flour,percent of population eating fortified vehicle,vitamin b1,1,1,1,1
maize flour,percent of population eating fortified vehicle,vitamin b12,1,1,1,1
maize flour,percent of population eating fortified vehicle,zinc,1,1,1,1
maize flour,percent of population eating industrially produced vehicle,na,1,1,1,1
maize flour,percent of population eating vehicle,na,1,1,1,1
oil,percent of population eating fortified vehicle,vitamin a,1,1,1,1
oil,percent of population eating fortified vehicle,vitamin d,1,1,1,1


In [73]:
# weighted_average does not return location_id, so attach the Nigeria id (214) here, then display
nigeria_reweighted['location_id'] = 214
nigeria_reweighted

Unnamed: 0,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,location_id
0,Nigeria,maize flour,percent of population eating fortified vehicle,folic acid,1.18841,0.593046,1.883774,214
1,Nigeria,maize flour,percent of population eating fortified vehicle,iron,1.18841,0.593046,1.883774,214
2,Nigeria,maize flour,percent of population eating fortified vehicle,vitamin a,1.18841,0.593046,1.883774,214
3,Nigeria,maize flour,percent of population eating fortified vehicle,vitamin b1,1.18841,0.593046,1.883774,214
4,Nigeria,maize flour,percent of population eating fortified vehicle,vitamin b12,1.18841,0.593046,1.883774,214
5,Nigeria,maize flour,percent of population eating fortified vehicle,zinc,1.18841,0.593046,1.883774,214
6,Nigeria,maize flour,percent of population eating industrially prod...,na,8.237413,6.544368,9.996353,214
7,Nigeria,maize flour,percent of population eating vehicle,na,54.965202,52.435732,57.560566,214
8,Nigeria,oil,percent of population eating fortified vehicle,vitamin a,7.463576,5.763576,9.22947,214
9,Nigeria,oil,percent of population eating fortified vehicle,vitamin d,0.0,0.0,5.0,214


In [74]:
# Replace the subnational-level entry with the combined rows.
# NOTE: this overwrites the input of the weighted_average call above, so that call is not re-runnable as-is.
final_data['Nigeria'] = nigeria_reweighted

# save and output

In [75]:
# stack every frame stored in final_data into a single table
output = pd.concat(list(final_data.values()))

In [76]:
# Keep only the delivery columns, in this order, and renumber the rows from 0.
# NOTE(review): this line used to carry the remark "need to fix nigeria subnationals";
# the kano/lagos rows appear to be combined earlier in the notebook — confirm it is resolved.
output_cols = [
    'location_id',
    'location_name',
    'vehicle',
    'value_description',
    'nutrient',
    'value_mean',
    'value_025_percentile',
    'value_975_percentile',
]
output = output.loc[:, output_cols].reset_index(drop=True)

In [77]:
# rows that do not refer to a nutrient get the explicit placeholder 'na'
output['nutrient'] = output.nutrient.fillna('na')

In [78]:
# cast the three value columns to float (the type-correction cell further down repeats these casts)
for col in ['value_mean','value_025_percentile','value_975_percentile']:
    output[col] = output[col].astype(float)

In [79]:
# correct types

In [80]:
# inspect the column types before casting them explicitly
output.dtypes

location_id             float64
location_name            object
vehicle                  object
value_description        object
nutrient                 object
value_mean              float64
value_025_percentile    float64
value_975_percentile    float64
dtype: object

In [81]:
# Pin the dtype of every column except nutrient in one pass.
# The int cast of location_id raises if any id is missing.
output = output.astype({
    'location_id': int,
    'location_name': str,
    'vehicle': str,
    'value_description': str,
    'value_mean': float,
    'value_025_percentile': float,
    'value_975_percentile': float,
})

In [82]:
# quick checks

In [83]:
# The mean must not fall outside the [2.5th, 97.5th] percentile bounds on any row.
# Comparisons against NaN evaluate to False, so rows with a missing bound are not flagged here.
below_lower = output.value_mean < output.value_025_percentile
above_upper = output.value_mean > output.value_975_percentile
assert not below_lower.any(), "some mean/lower is incorrrect"
assert not above_upper.any(), "some mean/upper is incorrrect"

In [84]:
# eyeball the first rows of the combined table
output.head()

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile
0,163,India,oil,percent of population eating fortified vehicle,vitamin a,24.3,21.1,27.9
1,163,India,oil,percent of population eating fortified vehicle,vitamin d,24.3,21.1,27.9
2,163,India,oil,percent of population eating industrially prod...,na,89.4,87.0,91.8
3,163,India,oil,percent of population eating vehicle,na,100.0,100.0,100.0
4,163,India,wheat flour,percent of population eating fortified vehicle,folic acid,6.3,4.8,7.9


In [85]:
# check that every country-vehicle pair has rows for all three value descriptions

In [86]:
# number of distinct value_description entries per (country, vehicle); every pair should show 3
output.groupby(['location_name', 'vehicle']).value_description.nunique()

location_name                vehicle    
CÃ´te d'Ivoire               oil            3
India                        oil            3
                             wheat flour    3
Nigeria                      maize flour    3
                             oil            3
                             wheat flour    3
South Africa                 maize flour    3
                             wheat flour    3
Uganda                       maize flour    3
                             oil            3
                             wheat flour    3
United Republic of Tanzania  maize flour    3
                             oil            3
                             wheat flour    3
Name: value_description, dtype: int64

In [87]:
# check values are uniform: list the distinct nutrient labels to spot inconsistent spellings
output.nutrient.unique()

array(['vitamin a', 'vitamin d', 'na', 'folic acid', 'iron', 'vitamin b1',
       'vitamin b12', 'zinc'], dtype=object)

In [88]:
# make sure nutrient=='na' iff value_description != 'percent of population eating fortified vehicle'
# (this lists the descriptions carried by the 'na' rows; the fortified-vehicle one must not appear)
output[output.nutrient=='na'].value_description.unique()

array(['percent of population eating industrially produced vehicle',
       'percent of population eating vehicle'], dtype=object)

In [89]:
# converse check: rows with an actual nutrient should carry only the fortified-vehicle description
output[output.nutrient!='na'].value_description.unique()

array(['percent of population eating fortified vehicle'], dtype=object)

In [90]:
# check that there is only one location_id per location_name
# Largest number of distinct ids attached to any single name; must display 1.
# (Counting the distinct per-name counts instead would also display 1 if every
# name carried the same number of ids, even when that number is greater than 1.)
output.groupby('location_name').location_id.nunique().max()

1

In [91]:
# check that location_id!=na: list the distinct ids and confirm none is missing
output.location_id.unique()

array([163, 214, 205, 190, 189, 196])

In [92]:
## check that every country-vehicle pair and vehicle-nutrient pair we have is one we want
import pickle
data_prep_dir = '/ihme/scratch/users/beatrixh/vivarium_data_analysis/pre_processing/lsff_project/data_prep/'


def _read_pickle(filename):
    """Return the object stored in the pickle file `filename` inside data_prep_dir."""
    # pickle.load executes whatever the file contains; only point this at trusted files
    with open(data_prep_dir + filename, 'rb') as handle:
        return pickle.load(handle)


vehicle_nutrient_pairs = _read_pickle('lsff_vehicle_nutrient_pairs.pickle')
country_vehicle_pairs = _read_pickle('lsff_country_vehicle_pairs.pickle')

In [93]:
# Register the expected vehicles under the mis-encoded spelling that output.location_name
# still carries at this point (the name is repaired further down, just before saving).
country_vehicle_pairs["CÃ´te d'Ivoire"] = ['wheat flour', 'oil', 'maize flour', 'bouillon']

In [94]:
# every vehicle present for a country must be listed for that country in country_vehicle_pairs
for country, country_rows in output.groupby('location_name', sort=False):
    expected_vehicles = country_vehicle_pairs[country]
    for vehicle in country_rows.vehicle.unique():
        assert vehicle in expected_vehicles, f'error: {country}, {vehicle}'

In [95]:
# every real nutrient present for a vehicle must be listed for that vehicle in vehicle_nutrient_pairs
for vehicle, vehicle_rows in output.groupby('vehicle', sort=False):
    for nutrient in vehicle_rows.nutrient.unique():
        if nutrient == 'na':
            # placeholder used on rows that do not refer to a nutrient
            continue
        assert nutrient in vehicle_nutrient_pairs[vehicle], f'error: {vehicle}, {nutrient}'

In [96]:
# repair the mis-encoded country name (exact whole-value match) before writing the file
output['location_name'] = output.location_name.replace("CÃ´te d'Ivoire", "Côte d'Ivoire")

In [97]:
# confirm the repaired spelling is now the one present
output.location_name.unique()

array(['India', 'Nigeria', "Côte d'Ivoire", 'Uganda',
       'United Republic of Tanzania', 'South Africa'], dtype=object)

In [98]:
# scratch copy of the table, written as UTF-8 so the accented country name is stored correctly
check_path = '/ihme/homes/beatrixh/repos/scratch/check_lsff_extraction.csv'
output.to_csv(check_path, index = False, encoding='utf-8')

In [99]:
# final deliverable; no encoding is passed here, so pandas' to_csv default applies
# (documented as 'utf-8', i.e. the same as the scratch copy written above)
save_path = '/ihme/homes/beatrixh/notebooks/viv_rsc/new_lsff/outputs/LSFF_extraction_clean_data_rich_locations_01_11_2021.csv'
output.to_csv(save_path, index = False)

In [100]:
# location names as held in memory, for comparison with the re-read file below
output.location_name.unique()

array(['India', 'Nigeria', "Côte d'Ivoire", 'Uganda',
       'United Republic of Tanzania', 'South Africa'], dtype=object)

In [104]:
# round-trip check: re-read the saved CSV and confirm the location names (including the
# accented one) come back unchanged
a = pd.read_csv(save_path)
a.location_name.unique()

array(['India', 'Nigeria', "Côte d'Ivoire", 'Uganda',
       'United Republic of Tanzania', 'South Africa'], dtype=object)