In [1]:
from db_queries import get_population, get_ids
from db_queries import get_location_metadata as get_locs

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

# Clean all gday data: minimal

In [3]:
## load targets
import pickle
# Directory that holds the pickled lookup tables read below.
data_prep_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/'

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
# Vehicle/nutrient pairs; this object is not referenced again in the visible cells.
with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)
    
# Country/vehicle pairs; indexed by location name when `target` is built.
# Presumably maps each country name to an iterable of vehicles -- confirm.
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)

In [4]:
ls /ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/ | grep gday_extraction_

gday_extraction_sheet_02_19_2021.csv
gday_extraction_sheet_02_22_2021.csv
gday_extraction_sheet_03_16_2021.csv
gday_extraction_sheet_03_22_2021.csv
gday_extraction_sheet_03_24_2021.csv
gday_extraction_sheet_03_29_2021.csv


In [5]:
# Read the extraction sheet and make sure every row carries a location name.
gday_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/gday_extraction_sheet_03_29_2021.csv'
gday = pd.read_csv(gday_path)

assert gday.location_name.notna().all(), "Some rows missing location name"

In [6]:
# Trim leading/trailing spaces from the text columns so later matching on
# location_name and vehicle is not defeated by stray padding.
# (The original stripped `nutrient` twice; once is sufficient.)
for text_col in ['location_name', 'subnational_name', 'vehicle', 'nutrient', 'urbanicity']:
    gday[text_col] = gday[text_col].str.strip(' ')

In [7]:
# Alternative, shorter country lists (disabled):
# location_names = ['Pakistan','Bangladesh','United Republic of Tanzania','Uganda','South Africa']
# location_names = ['Kenya', 'Burkina Faso', 'Myanmar', 'Vietnam', 'Nepal']

# Country names used to build `target` and to filter the location metadata.
location_names = [
    'Angola',
    'Bangladesh',
    'Burkina Faso',
    'Cameroon',
    'China',
    "Côte d'Ivoire",
    'Democratic Republic of the Congo',
    'Egypt',
    'Ethiopia',
    'Ghana',
    'India',
    'Indonesia',
    'Kenya',
    'Madagascar',
    'Mozambique',
    'Myanmar',
    'Nepal',
    'Niger',
    'Nigeria',
    'Pakistan',
    'South Africa',
    'Sudan',
    'Uganda',
    'United Republic of Tanzania',
    'Vietnam',
]

# Food vehicles of interest; not referenced in the other visible cells.
vehicles = ['wheat flour', 'maize flour', 'oil']

In [8]:
# Required (country, vehicle) combinations: one row per vehicle listed for
# each country in country_vehicle_map, sorted and indexed by both keys.
target = (
    pd.DataFrame(
        [(country, food) for country in location_names for food in country_vehicle_map[country]],
        columns=['location_name', 'vehicle'],
    )
    .sort_values(['location_name', 'vehicle'])
    .set_index(['location_name', 'vehicle'])
)

In [9]:
# Crude confidence-interval estimate.

# The string 'na' is treated as missing; the mean and both percentile columns
# are then cast to float.
for value_col in ['value_mean', 'value_025_percentile', 'value_975_percentile']:
    gday[value_col] = gday[value_col].mask(gday[value_col] == 'na').astype(float)

# Width of the reported 95% interval relative to the mean.
gday['scale_over_mean'] = (gday.value_975_percentile - gday.value_025_percentile) / gday.value_mean

# Per-vehicle average of that ratio; vehicles with no usable ratio are dropped.
r = (
    gday.groupby('vehicle')[['scale_over_mean']]
    .mean()
    .dropna()
    .rename(columns={'scale_over_mean': 'r'})
)

In [10]:
# Display the vehicles for which a scale_over_mean estimate (column `r`) exists,
# i.e. those with at least one row reporting a mean and both percentiles.
r

Unnamed: 0_level_0,r
vehicle,Unnamed: 1_level_1
bouillon,0.902591
wheat flour,0.699893


In [11]:
# For vehicles without a scale_over_mean value, assign the average of the
# others, taken row-wise over gday (not the mean of the per-vehicle means).
#
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, concat keeps each frame's own index
# labels by default, so the displayed table is unchanged.
#
# NOTE: this cell rebinds `r` from itself, so it is meant to run exactly once
# after the cell that first builds `r`.
r = pd.concat([
    r.reset_index(),
    pd.DataFrame(
        [(i, gday.scale_over_mean.mean()) for i in ['maize flour', 'wheat(not specifically flour)', 'salt', 'rice']],
        columns=['vehicle', 'r'],
    ),
])
r

Unnamed: 0,vehicle,r
0,bouillon,0.902591
1,wheat flour,0.699893
0,maize flour,0.819483
1,wheat(not specifically flour),0.819483
2,salt,0.819483
3,rice,0.819483


In [12]:
# Attach each vehicle's relative interval width `r`, then build a symmetric
# interval of total width r * value_mean centred on value_mean.
# The outer join also keeps vehicles that appear on only one side of the merge.
gday = (
    gday.merge(r, on='vehicle', how='outer')
    .assign(
        lower=lambda df: df.value_mean - (df.r * df.value_mean) / 2,
        upper=lambda df: df.value_mean + (df.r * df.value_mean) / 2,
    )
)

In [13]:
# List the distinct value_description strings present in the data; these are
# the strings that the value_d_to_* lookup dictionaries are keyed on.
gday.value_description.unique()

array([nan, 'Mean daily consumption (mg)',
       'Mean per capita consumption (g/day)',
       'Median amount of vehicle consumed on previous day among consumers (g/day)',
       'Mean amount of vehicle consumed on previous day among consumers (g/day)',
       'Estimated daily contribution from fortified foods (mg/d)',
       'kg/capita/year',
       'Mean per capita consumption among consumers (g/day)',
       'Consumption per person per day (g)',
       'Median daily contribution from fortified foods among consumers (mg/day)'],
      dtype=object)

In [14]:
## dicts for var cleaning
#
# Each dictionary maps a raw `value_description` string to one derived field.
# 'CHECK' marks a field that the description alone does not settle.
# Series.map yields NaN for any description without an entry, so every
# description that occurs in the data needs a key in all five dictionaries.
#
# 'Mean daily consumption (mg)' and
# 'Median daily contribution from fortified foods among consumers (mg/day)'
# occur in the extraction sheet but previously had no entries; they are added
# below using only what the description text states, with 'CHECK' elsewhere.

# Summary statistic reported by the source.
value_d_to_metric = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'median',
 'Mean per capita consumption (g/day)': 'mean',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'mean',
 'Mean micronutrient intake per capita (mg/day)': 'mean',
 'Daily per capita consumption (g)': 'CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK',
 'Mean per capita consumption among consumers (g/day)':'mean',
 'Mean daily consumption (mg)': 'mean',
 'Median daily contribution from fortified foods among consumers (mg/day)': 'median'
}

# What is being measured: the food vehicle or the nutrient.
value_d_to_entity = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'vehicle',
 'Mean per capita consumption (g/day)': 'CHECK',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'vehicle',
 'Mean micronutrient intake per capita (mg/day)': 'nutrient',
 'Daily per capita consumption (g)': 'CHECK',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'CHECK',
 'Mean per capita consumption among consumers (g/day)':'CHECK',
 'Mean daily consumption (mg)': 'CHECK',
 'Median daily contribution from fortified foods among consumers (mg/day)': 'CHECK'
}

# Mass unit of the reported value.
value_d_to_mass_unit = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'g',
 'Mean per capita consumption (g/day)': 'g',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'g',
 'Mean micronutrient intake per capita (mg/day)': 'mg',
 'Daily per capita consumption (g)': 'g',
 'Daily consumption (mg/d)': 'mg',
 'Consumption per person per day (g)': 'g',
 'Estimated daily contribution from fortified foods (mg/d)': 'mg',
 'kg/capita/year': 'kg',
 'Mean per capita consumption among consumers (g/day)':'g',
 'Mean daily consumption (mg)': 'mg',
 'Median daily contribution from fortified foods among consumers (mg/day)': 'mg'
}

# Time unit of the reported rate.
value_d_to_time_unit = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'day',
 'Mean per capita consumption (g/day)': 'day',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'day',
 'Mean micronutrient intake per capita (mg/day)': 'day',
 'Daily per capita consumption (g)': 'day',
 'Daily consumption (mg/d)': 'day',
 'Consumption per person per day (g)': 'day',
 'Estimated daily contribution from fortified foods (mg/d)': 'day',
 'kg/capita/year': 'year',
 'Mean per capita consumption among consumers (g/day)':'day',
 'Mean daily consumption (mg)': 'day',
 'Median daily contribution from fortified foods among consumers (mg/day)': 'day'
}

# Population denominator: everyone ('capita') or only consumers.
value_d_to_population = {
 'Median amount of vehicle consumed on previous day among consumers (g/day)': 'consumers',
 'Mean per capita consumption (g/day)': 'capita',
 'Mean amount of vehicle consumed on previous day among consumers (g/day)': 'consumers',
 'Mean micronutrient intake per capita (mg/day)': 'capita',
 'Daily per capita consumption (g)': 'capita',
 'Daily consumption (mg/d)': 'CHECK',
 'Consumption per person per day (g)': 'CHECK',
 'Estimated daily contribution from fortified foods (mg/d)': 'CHECK',
 'kg/capita/year': 'capita',
 'Mean per capita consumption among consumers (g/day)':'consumers',
 'Mean daily consumption (mg)': 'CHECK',
 'Median daily contribution from fortified foods among consumers (mg/day)': 'consumers'
}

In [15]:
def format_value_d(df):
    """Annotate ``df`` with fields derived from its ``value_description`` column.

    Each new column is produced by looking the description up in one of the
    module-level ``value_d_to_*`` dictionaries. Descriptions that have no
    entry (including missing values) come out as NaN.

    The frame is modified in place and is also returned.
    """
    derived_columns = (
        ('metric', value_d_to_metric),
        ('entity', value_d_to_entity),
        ('mass_unit', value_d_to_mass_unit),
        ('time_unit', value_d_to_time_unit),
        ('pop_denom', value_d_to_population),
    )
    for column_name, lookup in derived_columns:
        df[column_name] = df['value_description'].map(lookup)

    return df

In [16]:
# Add the metric / entity / mass_unit / time_unit / pop_denom columns derived
# from value_description.
gday = format_value_d(gday)

# Format output

In [17]:
# Identifier columns for grouping; not referenced in the other visible cells.
group_cols = [
    'location_id',
    'location_name',
    'sub_population',
    'vehicle',
    'metric',
    'mass_unit',
    'time_unit',
    'pop_denom',
    'source_citation',
    'source_link',
    'data_choice_notes',
]

In [18]:
# Fetch location metadata through db_queries.get_location_metadata (imported
# as get_locs) for location set 35, GBD round 6, decomp step 4.
loc_metadata = get_locs(location_set_id=35, gbd_round_id=6, decomp_step="step4")

In [19]:
# Keep only the id and name of level-3 locations (presumably the national
# level of the location hierarchy -- confirm) whose name is in location_names.
# 'Viet Nam' is accepted in addition to the 'Vietnam' spelling used in
# location_names. The `level` column is discarded here, so this cell cannot be
# re-run without first re-fetching loc_metadata.
loc_metadata = loc_metadata.loc[
    (loc_metadata.level == 3)
    & loc_metadata.location_name.isin(location_names + ['Viet Nam']),
    ['location_id', 'location_name'],
]

In [20]:
# Discard the sheet's own location_id column; ids are re-attached from the
# location metadata by the merge on location_name.
gday = gday.drop('location_id', axis=1)

In [21]:
# Right join: every gday row is kept; rows whose location_name has no match in
# loc_metadata end up with a missing location_id.
gday = pd.merge(loc_metadata, gday, on='location_name', how='right')

In [22]:
# Missing ids (unmatched location names) become the sentinel -1 so the column
# can be stored as int.
gday['location_id'] = gday['location_id'].fillna(-1).astype(int)

In [23]:
# Drop the rows whose location could not be matched (location_id == -1);
# six rows according to the original note for this extraction sheet.
# .copy() makes the filtered frame independent of its parent so the later
# `.loc[...] = ...` assignments do not raise SettingWithCopyWarning.
gday = gday[(gday.location_id != -1)].copy()

In [24]:
# Convert every value column of kg rows to grams, then relabel the unit so
# re-running the cell does not scale the same rows twice.
is_kg = gday.mass_unit == "kg"

for amount_col in ['value_mean', 'lower', 'upper', 'value_025_percentile', 'value_975_percentile']:
    gday.loc[is_kg, amount_col] = gday[amount_col] * 1_000

gday.loc[is_kg, 'mass_unit'] = 'g'

In [25]:
# Convert every value column of per-year rows to per-day (divide by 365), then
# relabel the unit so re-running the cell does not scale the same rows twice.
is_yearly = gday.time_unit == "year"

for amount_col in ['value_mean', 'lower', 'upper', 'value_025_percentile', 'value_975_percentile']:
    gday.loc[is_yearly, amount_col] = gday[amount_col] / 365

gday.loc[is_yearly, 'time_unit'] = 'day'

In [26]:
# Reported percentiles must strictly bracket the mean. Comparisons with NaN
# are False, so rows with a missing percentile pass.
assert not (gday.value_mean >= gday.value_975_percentile).any(), "check upper"
assert not (gday.value_mean <= gday.value_025_percentile).any(), "check lower"

In [27]:
# The derived lower/upper bounds must strictly bracket the mean as well.
assert not (gday.value_mean >= gday.upper).any(), "check upper"
assert not (gday.value_mean <= gday.lower).any(), "check lower"

In [28]:
# Where the source gave no percentile, fall back to the derived bound.
gday['value_025_percentile'] = gday['value_025_percentile'].fillna(gday['lower'])
gday['value_975_percentile'] = gday['value_975_percentile'].fillna(gday['upper'])

In [29]:
# After the backfill, every row must have both interval bounds.
assert not (gday.value_025_percentile.isna() | gday.value_975_percentile.isna()).any(), "missing CIs"

In [30]:
# The derived bounds have been folded into the percentile columns; drop them.
gday = gday.drop(['lower', 'upper'], axis=1)

In [31]:
# Select and order the columns written to the output file.
output = gday.loc[:, [
    # location / stratum
    'location_id', 'location_name', 'subnational_name', 'subnational_location_id',
    'urbanicity', 'sub_population',
    # what was measured
    'vehicle', 'nutrient', 'value_description',
    'metric', 'entity', 'mass_unit', 'time_unit', 'pop_denom',
    # values
    'value_mean', 'value_025_percentile', 'value_975_percentile',
    # provenance
    'source_citation', 'source_link', 'source_year', 'source_type',
    'notes', 'user', 'date_recorded',
    # validation bookkeeping
    'definition validated', 'CI validated', 'Validation notes',
]]

In [32]:
# Order rows by country, then by vehicle.
output = output.sort_values(by=['location_name', 'vehicle'])

In [33]:
# Write the cleaned table to the project's data_prep/outputs directory,
# without the row index.
save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/gday_input_data_all_2021_04_02.csv'
output.to_csv(save_path, index = False)

In [34]:
# Write a second copy under the same file name to the scratch repo.
# This rebinds save_path, which afterwards points at the scratch copy.
save_path = '/ihme/homes/beatrixh/repos/scratch/gday_input_data_all_2021_04_02.csv'
output.to_csv(save_path, index = False)