In [1]:
import pandas as pd, numpy as np

# LSFF: basic data cleaning

IN: extraction sheet + vehicle-nutrient, country-vehicle dicts

OUT: cleaned extraction sheet. values renamed, dropped all illegal location-vehicle-val-nutrient combos

In [2]:
## load targets
import pickle
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)

In [3]:
countries = [i for i in country_vehicle_map.keys() if type(i)==str]

In [4]:
path = data_prep_dir + 'extraction_sheet_lsff_03_11_2021.csv'
df = pd.read_csv(path, encoding = 'utf-8')

In [5]:
df.head()

Unnamed: 0,location_id,location_name,subnational_name,subnational_location_id,urbanicity,vehicle,nutrient,nutrient_compound,nutrient_mass_ppm,fortification_standards,...,sub_population,source_citation,source_link,source_year,source_type,source_additional,notes,user,date_recorded,Validated
0,163.0,India,Rajasthan,,mixed/both,oil,vitamin a,unknown,7.5,voluntary,...,0-24 months,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,2013-2014,,,,nathaniel,10/30/2020,Yes
1,163.0,India,Rajasthan,,mixed/both,oil,na,na,na,na,...,0-24 months,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,2017,,,,nathaniel,10/30/2020,Yes
2,168.0,Angola,na,,mixed/both,maize flour,iron,unknown,na,unknown,...,total population,GFDx,https://fortificationdata.org/country-fortific...,2018,,,"Scott Montgomery, Food Fortification Initiativ...",paulina,3/9/2021,
3,168.0,Angola,na,,mixed/both,maize flour,"folic acid, folate, b9",na,na,unknown,...,total population,GFDx,https://fortificationdata.org/country-fortific...,2018,,,"Scott Montgomery, Food Fortification Initiativ...",paulina,3/9/2021,
4,168.0,Angola,na,,mixed/both,maize flour,zinc,na,na,unknown,...,total population,GFDx,https://fortificationdata.org/country-fortific...,2018,,,"Scott Montgomery, Food Fortification Initiativ...",paulina,3/9/2021,


In [6]:
path_assumed_data = data_prep_dir + 'extraction_sheet_lsff_assumed_coverage_03_11_2021.csv'
assumed_df = pd.read_csv(path_assumed_data, encoding = 'utf-8')

In [7]:
assumed_df['is_estimate'] = 1
df['is_estimate'] = 0

In [8]:
df = df.append(assumed_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [9]:
check_cols = ['location_id','location_name','vehicle','value_description','nutrient','value_mean','value_025_percentile',
       'value_975_percentile','notes','is_estimate']

In [10]:
# format vars
df.location_id = df.location_id.fillna(-1).astype(int)
df.location_name = df.location_name.astype(str)
df.vehicle = df.vehicle.astype(str)
df.value_description = df.value_description.astype(str)

In [11]:
df.value_mean = df.value_mean.mask(df.value_mean.isin(['na',np.nan,-1,'unknown']), np.nan).astype(float)
df.value_025_percentile = df.value_025_percentile.mask(df.value_025_percentile.isin(['na',np.nan,-1,'unknown']), np.nan).astype(float)
df.value_975_percentile = df.value_975_percentile.mask(df.value_975_percentile.isin(['na',np.nan,-1,'unknown']), np.nan).astype(float)

In [12]:
#drop all NaN-only rows
df = df.loc[~((df.value_mean.isna())&(df.value_025_percentile.isna())&(df.value_975_percentile.isna()))]

In [13]:
#locations for which need tofill in loc_ids
df[df.location_id==-1].location_name.unique()

array(['India', 'Angola', 'Bangladesh', 'Cameroon', 'Nigeria', 'Yemen',
       'Pakistan', 'Burkina Faso', "Côte d'Ivoire", 'Indonesia', 'Uganda',
       'United Republic of Tanzania', 'Ethiopia', 'China',
       'Democratic Republic of the Congo', 'Egypt', 'Ghana', 'Niger',
       'Sudan', 'Viet Nam', 'Kenya', 'Madagascar', 'Mozambique', 'Nepal',
       'Myanmar', 'South Africa', 'Philippines'], dtype=object)

In [14]:
#looks like we'll need to estimate mean here
df.loc[df.value_mean.isna(),check_cols]

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,notes,is_estimate
102,-1,Burkina Faso,wheat flour,percent of vehicle that is fortified,iron,,0.0,10.0,Assumed to be % of industrially processed whea...,0
122,-1,Côte d'Ivoire,wheat flour,percent of vehicle that is fortified,iron,,10.0,40.0,Assumed to be % of industrially processed whea...,0
177,-1,Cameroon,wheat flour,percent of vehicle that is fortified,iron,,0.0,10.0,Assumed to be % of industrially processed whea...,0
289,-1,Ghana,rice,percent of vehicle that is industrially produced,na,,11.0,23.0,"% industrially milled, only a range was given",0
333,-1,Ghana,wheat flour,percent of vehicle that is fortified,iron,,25.0,35.0,Assumed to be % of industrially processed whea...,0
447,-1,Indonesia,wheat flour,percent of vehicle that is fortified,iron,,90.0,100.0,Assumed to be % of industrially processed whea...,0
527,-1,Kenya,wheat flour,percent of vehicle that is fortified,iron,,90.0,100.0,Assumed to be % of industrially processed whea...,0
563,-1,Mozambique,wheat flour,percent of vehicle that is fortified,iron,,90.0,100.0,Assumed to be % of industrially processed whea...,0
608,-1,Niger,wheat flour,percent of vehicle that is fortified,iron,,0.0,0.0,Assumed to be % of industrially processed whea...,0
646,-1,Nigeria,rice,percent of vehicle that is industrially produced,na,,12.0,24.0,"% industrially milled, only a range was given",0


In [15]:
df.loc[df.value_mean.isna(),'value_mean'] = df.loc[df.value_mean.isna(),['value_025_percentile','value_975_percentile']].mean(axis = 1)

In [16]:
df.loc[df.value_mean.isna(),check_cols]

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,notes,is_estimate


## location_name

In [17]:
df.location_name.unique()

array(['India', 'Angola', 'Bangladesh', 'Cameroon', 'Nigeria', 'Yemen',
       'Pakistan', 'Burkina Faso', "Côte d'Ivoire", 'Indonesia', 'Uganda',
       'United Republic of Tanzania', 'Ethiopia', 'China',
       'Democratic Republic of the Congo', 'Egypt', 'Ghana', 'Niger',
       'Sudan', 'Viet Nam', 'Kenya', 'Madagascar', 'Mozambique', 'Nepal',
       'Myanmar', 'South Africa', 'Philippines'], dtype=object)

In [18]:
relabel_location_name = {'India': 'India',
 'Bangladesh': 'Bangladesh',
 'Cameroon': 'Cameroon',
 'Nigeria': 'Nigeria',
 'Pakistan': 'Pakistan',
 'Burkina Faso': 'Burkina Faso',
 "CÃ´te d'Ivoire": "Côte d'Ivoire",
 "Côte d'Ivoire":"Côte d'Ivoire",
 'Uganda': 'Uganda',
 'United Republic of Tanzania': 'United Republic of Tanzania',
 'Ethiopia': 'Ethiopia',
 'China': 'China',
 'Indonesia': 'Indonesia',
 'Democratic Republic of the Congo': 'Democratic Republic of the Congo',
 'Kenya': 'Kenya',
 'Egypt': 'Egypt',
 'Niger': 'Niger',
 'Philippines': 'Philippines',
 'Ghana': 'Ghana',
 'Afghanistan': 'Afghanistan',
 'Viet Nam': 'Vietnam',
 'Yemen': 'Yemen',
 'Sudan': 'Sudan',
 'Angola': 'Angola',
 'Iraq':'Iraq',
 'Madagascar': 'Madagascar',
 'Mozambique': 'Mozambique',
 'Nepal': 'Nepal',
 'Myanmar': 'Myanmar',
 'South Africa': 'South Africa'}

In [19]:
df.location_name = df.location_name.map(relabel_location_name)

In [20]:
## exclude countries the gates foundation had us drop (politically unstable or low rates of maternal and child anemia)
excluded_countries = ['Afghanistan','Yemen','Philippines','Iraq']
df = df[~df.location_name.isin(excluded_countries)]

In [21]:
## clean country names
assert df.location_name.nunique()==25, "wrong number of countries"

In [22]:
for i in df.location_name.unique():
    if i not in countries:
        print(i)
    assert(i in countries)

## vehicle

In [23]:
##this one looks fine
df.vehicle.unique()

array(['oil', 'maize flour', 'wheat flour', 'salt', 'rice', 'bouillon',
       'wheat '], dtype=object)

## nutrient

In [24]:
nutrient_map = {'vitamin a': 'vitamin a',
 'na': 'na',
 'iron': 'iron',
 'folic acid, folate, b9': 'folic acid',
 'b12': 'vitamin b12',
 'iodine': 'iodine',
 'b1, thiamine': 'vitamin b1',
 'zinc': 'zinc',
 'd, ergocalciferol-D2, cholecalciferol-D3, alfacalcidol': 'vitamin d',
 'b2, riboflavin': 'vitamin b2',
 'b3, niacin': 'vitamin b3',
 'b6, pyridoxine': 'vitamin b6'}

In [25]:
df.nutrient = df.nutrient.map(nutrient_map)

In [26]:
# fill nans
df['nutrient'] = df['nutrient'].fillna('na')

## value_description

In [27]:
df.value_description.unique()

array(['percent of population eating fortified vehicle',
       'percent of population eating industrially produced vehicle',
       'percent of vehicle that is fortified',
       'percent of vehicle that is industrially produced',
       'percent of population eating vehicle',
       'percent of marketshare of fortified products',
       'percent of population eating adequately fortified vehicle'],
      dtype=object)

## eliminate country-vehicle and vehicle-nutrient pairs we're uninterested in

In [28]:
country_vehicle_pairs = pd.DataFrame([(i,j) for i in countries for j in country_vehicle_map[i]], columns = ['location_name','vehicle'])

In [29]:
vehicles = list(vehicle_nutrient_map.keys())
vehicle_nutrient_pairs = pd.DataFrame([(i,j) for i in vehicles for j in vehicle_nutrient_map[i]], columns = ['vehicle','nutrient'])
vehicle_nutrient_pairs = pd.concat([vehicle_nutrient_pairs, pd.DataFrame([(i,'na') for i in vehicles], columns = ['vehicle','nutrient'])])

In [30]:
# check what we're removing
test = df.merge(vehicle_nutrient_pairs, on = ['vehicle','nutrient'], how = 'right')
[(i,j) for (i,j) in zip(df.vehicle,df.nutrient) if (i,j) not in zip(test.vehicle,test.nutrient)]

[('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('wheat ', 'iron'),
 ('wheat ', 'folic acid'),
 ('wheat ', 'zinc'),
 ('salt', 'iodine'),
 ('wheat ', 'na'),
 ('rice', 'vitamin a'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('wheat ', 'iron'),
 ('wheat ', 'zinc'),
 ('wheat ', 'folic acid'),
 ('wheat ', 'iron'),
 ('wheat ', 'zinc'),
 ('wheat ', 'folic acid'),
 ('wheat ', 'na'),
 ('salt', 'iodine'),
 ('salt', 'iodine'),
 ('wheat ', 'iron'),
 ('wheat ', 'folic acid'),
 ('wheat ', 'zinc'),
 ('wheat ', 'na')]

In [31]:
df = df.merge(country_vehicle_pairs, on = ['location_name','vehicle'], how = 'right') #should this be inner?

In [32]:
df = df.merge(vehicle_nutrient_pairs, on = ['vehicle','nutrient'], how = 'right')

In [33]:
#drop all NaN-only rows
df = df.loc[~((df.value_mean.isna())&(df.value_025_percentile.isna())&(df.value_975_percentile.isna()))]

## clean illegal value-nutrient combos

In [34]:
nutrient_irrelevant_vds = ['percent of population eating industrially produced vehicle',
       'percent of population eating vehicle',
       'percent of vehicle that is industrially produced',
       'percent of marketshare of fortified products']

nutrient_relevant_vds = ['percent of population eating fortified vehicle',
       'percent of vehicle that is fortified',
       'percent of marketshare of fortified products']

In [35]:
df.nutrient.unique()

array(['vitamin a', 'na', 'vitamin d', 'iron', 'folic acid', 'zinc',
       'vitamin b1', 'vitamin b12'], dtype=object)

In [36]:
df.loc[(df.value_description.isin(nutrient_irrelevant_vds))&(df.nutrient!='na'),check_cols]

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,notes,is_estimate
31,-1.0,Côte d'Ivoire,oil,percent of marketshare of fortified products,vitamin a,89.0,,,,0.0


In [37]:
# relabel nutrients for rows for which nutrient doesn't apply
df.loc[(df.value_description.isin(nutrient_irrelevant_vds))&(df.nutrient!='na'),'nutrient'] = 'na'

In [38]:
# these need to be re extracted; for now we'll drop them
df.loc[(df.value_description.isin(nutrient_relevant_vds)) & (df.nutrient=='na'),check_cols + ['user']].sort_values(['user','location_name'])

Unnamed: 0,location_id,location_name,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,notes,is_estimate,user
31,-1.0,Côte d'Ivoire,oil,percent of marketshare of fortified products,na,89.0,,,,0.0,paulina
817,-1.0,India,salt,percent of population eating fortified vehicle,na,87.4,,,Households were defined as one person living a...,0.0,paulina
821,-1.0,India,salt,percent of population eating fortified vehicle,na,87.4,,,Households were defined as one person living a...,0.0,paulina


In [39]:
# drop rows that need nutrient filled in
df = df.loc[~((df.value_description.isin(nutrient_relevant_vds)) & (df.nutrient=='na')),]

In [40]:
# format vars
df.location_id = df.location_id.fillna(-1).astype(int)
df.location_name = df.location_name.astype(str)
df.vehicle = df.vehicle.astype(str)
df.value_description = df.value_description.astype(str)

df.value_mean = df.value_mean.mask(df.value_mean.isin(['na',np.nan,-1,'unknown']), np.nan).astype(float)
df.value_025_percentile = df.value_025_percentile.mask(df.value_025_percentile.isin(['na',np.nan,-1,'unknown']), np.nan).astype(float)
df.value_975_percentile = df.value_975_percentile.mask(df.value_975_percentile.isin(['na',np.nan,-1,'unknown']), np.nan).astype(float)

In [41]:
## TODO: fix df.loc[df.location_id==-1,]

## save 

In [42]:
save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_data_stage0_3_11_2021.csv'
df.to_csv(save_path, index = False)