In [1]:
import pandas as pd, numpy as np

# LSFF: Generate data summary tables

In [2]:
## read in data
# Load the stage-0 LSFF CSV into `df`; every later cell reads from this frame.
path = '/ihme/homes/beatrixh/repos/scratch/stage0_data_lsff_2021_01_13.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
## load targets
import pickle

# Directory containing the pickled lookup tables used to define the target combinations.
data_prep_dir = '/ihme/scratch/users/beatrixh/vivarium_data_analysis/pre_processing/lsff_project/data_prep/'

# NOTE: pickle.load can execute arbitrary code from the file; only point this at trusted files.

# Lookup that is indexed by vehicle further down to get that vehicle's nutrients.
with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
    vehicle_nutrient_map = pickle.load(handle)

# Lookup that is indexed by country further down to get that country's vehicles.
with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
    country_vehicle_map = pickle.load(handle)

# Keep only the keys whose type is exactly `str`; any other key type is ignored.
countries = [key for key in country_vehicle_map.keys() if type(key) is str]

In [4]:
## for convenience
# Column subset for eyeballing rows of `df`.
check_cols = [
    'location_id',
    'location_name',
    'vehicle',
    'value_description',
    'nutrient',
    'value_mean',
    'value_025_percentile',
    'value_975_percentile',
    'notes',
]

In [None]:
## check that we have nutrients iff expected

In [6]:
# Value descriptions that appear on rows whose nutrient is 'na'.
df.loc[df['nutrient'] == 'na', 'value_description'].unique()

array(['percent of vehicle that is industrially produced',
       'percent of population eating industrially produced vehicle',
       'percent of population eating vehicle'], dtype=object)

In [7]:
# Value descriptions that appear on rows carrying an actual nutrient (anything other than 'na').
df.loc[df['nutrient'] != 'na', 'value_description'].unique()

array(['percent of population eating fortified vehicle',
       'percent of vehicle that is fortified'], dtype=object)

In [8]:
# df.loc[(df.value_description.isin(['percent of vehicle that is industrially produced','percent of population eating vehicle',
#                                   'percent of population eating industrially produced vehicle'])),'nutrient'] = 'na'

In [None]:
## relabel vals

In [9]:
#https://journals.sagepub.com/doi/pdf/10.1177/15648265120334S307
# cote divoire,
# "During the last year of the project, 206,410 MT of
# vegetable oil was marketed in Côte d’Ivoire. The fortified
# vegetable oil was covering 89% of the market."

# Canonical wording for each `value_description` label. Only the market-share
# label is actually reworded; the identity entries list the other labels that
# the cells below filter on.
val_descrip_map = {
    'percent of population eating fortified vehicle': 'percent of population eating fortified vehicle',
    'percent of vehicle that is industrially produced': 'percent of vehicle that is industrially produced',
    'percent of vehicle that is fortified': 'percent of vehicle that is fortified',
    'percent of population eating industrially produced vehicle': 'percent of population eating industrially produced vehicle',
    'percent of marketshare of fortified products': 'percent of market covered by fortified product',
    'percent of population eating vehicle': 'percent of population eating vehicle',
}

# Relabel with a lookup that falls back to the existing label. A plain
# Series.map(dict) converts every label that is not a key into NaN, so
# re-running this cell would blank out rows that were already relabelled to
# 'percent of market covered by fortified product' (that string is not a key).
df['value_description'] = df['value_description'].map(
    lambda label: val_descrip_map.get(label, label)
)

In [10]:
## build frame containing all the country - vehicle - value_description - nutrient combinations we need

In [11]:
# Value descriptions that are recorded separately for each nutrient.
nutrient_relevant_vds = [
    'percent of population eating fortified vehicle',
    'percent of vehicle that is fortified',
    'percent of market covered by fortified product',
]

# One row per expected (country, vehicle, nutrient, value_description) combination:
# each country's vehicles, crossed with each vehicle's nutrients, crossed with
# the nutrient-specific value descriptions.
data_counts_a = pd.DataFrame(
    [
        (country, vehicle, nutrient, description)
        for country in countries
        for vehicle in country_vehicle_map[country]
        for nutrient in vehicle_nutrient_map[vehicle]
        for description in nutrient_relevant_vds
    ],
    columns=['location_name', 'vehicle', 'nutrient', 'value_description'],
)

In [12]:
# Value descriptions that do not depend on a nutrient.
nutrient_irrelevant_vds = [
    'percent of vehicle that is industrially produced',
    'percent of population eating industrially produced vehicle',
    'percent of population eating vehicle',
]

# One row per expected (country, vehicle, value_description) combination, with
# the nutrient column fixed to the placeholder 'na'.
data_counts_b = pd.DataFrame(
    [
        (country, vehicle, 'na', description)
        for country in countries
        for vehicle in country_vehicle_map[country]
        for description in nutrient_irrelevant_vds
    ],
    columns=['location_name', 'vehicle', 'nutrient', 'value_description'],
)

In [13]:
# for each country - vehicle - value_description - nutrient, count how many datapoints we have
# Stack the nutrient-specific and nutrient-agnostic frames of expected combinations.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append, it keeps each frame's original row labels.
data_counts = pd.concat([data_counts_a, data_counts_b]).sort_values(
    by=['location_name', 'vehicle', 'value_description', 'nutrient']
)

In [14]:
# Peek at the first five expected combinations after sorting.
data_counts.head(5)

Unnamed: 0,location_name,vehicle,nutrient,value_description
809,Angola,maize flour,folic acid,percent of market covered by fortified product
812,Angola,maize flour,iron,percent of market covered by fortified product
824,Angola,maize flour,vitamin a,percent of market covered by fortified product
818,Angola,maize flour,vitamin b1,percent of market covered by fortified product
821,Angola,maize flour,vitamin b12,percent of market covered by fortified product


In [16]:
## merge the data we have onto the frame

In [17]:
# Keys identifying one expected data point.
merge_cols = ['location_name', 'vehicle', 'value_description', 'nutrient']

# Left-merge so every expected combination is kept; combinations with no
# matching row in `df` get NaN in value_mean, and combinations with several
# matching rows are repeated once per match.
# NOTE: this reassigns `data_counts` from itself, so run it once per kernel —
# a second run would merge onto a frame that already has a value_mean column.
data_counts = pd.merge(
    data_counts,
    df[merge_cols + ['value_mean']],
    how='left',
    on=merge_cols,
)

In [18]:
# Inspect the merged frame; expected combinations with no matching data point show NaN in value_mean.
data_counts

Unnamed: 0,location_name,vehicle,nutrient,value_description,value_mean
0,Angola,maize flour,folic acid,percent of market covered by fortified product,
1,Angola,maize flour,iron,percent of market covered by fortified product,
2,Angola,maize flour,vitamin a,percent of market covered by fortified product,
3,Angola,maize flour,vitamin b1,percent of market covered by fortified product,
4,Angola,maize flour,vitamin b12,percent of market covered by fortified product,
...,...,...,...,...,...
1604,Vietnam,wheat flour,vitamin b1,percent of vehicle that is fortified,0.0
1605,Vietnam,wheat flour,vitamin b12,percent of vehicle that is fortified,0.0
1606,Vietnam,wheat flour,zinc,percent of vehicle that is fortified,0.0
1607,Vietnam,wheat flour,na,percent of vehicle that is industrially produced,100.0


In [None]:
# group and add columns for:
    # number of datapoints we have per desired val
    # a list of mean_values for all such data points

In [25]:
# Group once by the identifying keys and derive both summaries from the same grouping.
grouped_counts = data_counts.groupby(merge_cols)

# count() tallies the non-null value_mean entries for each combination.
data_counts_n = grouped_counts.count()
# Collect every value_mean of a combination into a list (a combination with no data yields [nan]).
data_counts_detail = grouped_counts.aggregate(lambda values: values.tolist())

# Both frames have a value_mean column, so the merge suffixes them _x (counts)
# and _y (lists) before they are renamed to something readable.
data_counts_all = (
    data_counts_n
    .merge(data_counts_detail, on=merge_cols, how='left')
    .rename(columns={'value_mean_x': 'n_data_points', 'value_mean_y': 'value_means'})
    .reset_index()
)

In [29]:
# List the value descriptions present in the summary frame.
data_counts_all['value_description'].unique()

array(['percent of market covered by fortified product',
       'percent of population eating fortified vehicle',
       'percent of population eating industrially produced vehicle',
       'percent of population eating vehicle',
       'percent of vehicle that is fortified',
       'percent of vehicle that is industrially produced'], dtype=object)

## save data summaries

In [None]:
# Output directory for the summary CSVs written by the cells below.
# NOTE: this already ends in '/', and each to_csv call below prepends another
# '/' to the filename, so the resulting paths contain '//'.
save_dir = '/ihme/scratch/users/beatrixh/vivarium_data_analysis/pre_processing/lsff_project/data_prep/'

## eating fortified vehicle

In [30]:
# Summary rows for the share of the population eating the fortified vehicle, written out as-is.
eating_fortified = data_counts_all.loc[
    data_counts_all['value_description'] == "percent of population eating fortified vehicle"
]
eating_fortified.to_csv(path_or_buf=save_dir + '/data_summary_percent_of_population_eating_fortified_vehicle.csv')

## eating industrially produced vehicle

In [35]:
# Summary rows for the share of the population eating the industrially produced vehicle, written out as-is.
eating_ind_prod = data_counts_all.loc[
    data_counts_all['value_description'] == "percent of population eating industrially produced vehicle"
]
eating_ind_prod.to_csv(path_or_buf=save_dir + '/data_summary_percent_of_population_eating_industrially_prod_vehicle.csv')

## eating vehicle

In [37]:
# Summary rows for the share of the population eating the vehicle at all.
eating_vehicle = data_counts_all.loc[
    data_counts_all['value_description'] == "percent of population eating vehicle"
]

#subset NAs to only those countries-vehicles where we don't have
#(% of population eating fortified) & (% eating industrially produced)


In [69]:
# Reshape to one row per (country, vehicle, nutrient) with one column of
# data-point counts per value description; combinations that do not exist
# for a row are left as NaN.
data_counts_wide = data_counts_all.pivot_table(
    index=['location_name', 'vehicle', 'nutrient'],
    columns='value_description',
    values='n_data_points',
).reset_index()

value_description,location_name,vehicle,nutrient,percent of market covered by fortified product,percent of population eating fortified vehicle,percent of population eating industrially produced vehicle,percent of population eating vehicle,percent of vehicle that is fortified,percent of vehicle that is industrially produced
0,Angola,maize flour,folic acid,0.0,0.0,,,0.0,
1,Angola,maize flour,iron,0.0,0.0,,,0.0,
2,Angola,maize flour,na,,,0.0,0.0,,1.0
3,Angola,maize flour,vitamin a,0.0,0.0,,,0.0,
4,Angola,maize flour,vitamin b1,0.0,0.0,,,0.0,
...,...,...,...,...,...,...,...,...,...
426,Vietnam,wheat flour,na,,,0.0,10.0,,2.0
427,Vietnam,wheat flour,vitamin a,0.0,0.0,,,1.0,
428,Vietnam,wheat flour,vitamin b1,0.0,0.0,,,1.0,
429,Vietnam,wheat flour,vitamin b12,0.0,0.0,,,1.0,


In [70]:
# Row-wise total of the two direct-coverage counts (NaN entries are skipped by sum).
# A total of 0 means that row has no data point for either measure.
data_counts_wide['needs_pct_pop_eating_vehicle'] = data_counts_wide[
    [
        'percent of population eating industrially produced vehicle',
        'percent of population eating fortified vehicle',
    ]
].sum(axis='columns')

In [78]:
# find vehicle-country pairs for which any datapoint is missing (min count of datapoints == 0)
needs_pct_pop_eating_vehicle = (
    data_counts_wide
    .groupby(['location_name', 'vehicle'])['needs_pct_pop_eating_vehicle']
    .min()
    .reset_index()
)

In [82]:
# Keep only the country-vehicle pairs whose minimum count is 0, and drop the count column itself.
needs_pct_pop_eating_vehicle = needs_pct_pop_eating_vehicle[
    needs_pct_pop_eating_vehicle['needs_pct_pop_eating_vehicle'].eq(0)
][['location_name', 'vehicle']]

In [91]:
# eating_vehicle = eating_vehicle.merge(needs_pct_pop_eating_vehicle, on = ['location_name','vehicle'], how = 'right')
# NOTE: the subsetting merge above is disabled, so the CSV contains every
# country-vehicle row of `eating_vehicle`, not only the pairs flagged as missing data.
eating_vehicle.to_csv(path_or_buf=save_dir + '/data_summary_percent_of_population_eating_vehicle.csv')

## pct vehicle that is industrially produced

In [92]:
# Summary rows for the share of the vehicle that is industrially produced.
vehicle_ind_prod = data_counts_all.loc[
    data_counts_all['value_description'] == "percent of vehicle that is industrially produced"
]

In [98]:
# Restrict to the country-vehicle pairs that have no data point for
# "percent of population eating industrially produced vehicle"; the right
# merge keeps exactly the pairs listed on the right-hand side.
vehicle_ind_prod = pd.merge(
    vehicle_ind_prod,
    eating_ind_prod[eating_ind_prod['n_data_points'] == 0][['location_name', 'vehicle']],
    how='right',
    on=['location_name', 'vehicle'],
)

In [99]:
# Write the subsetted industrially-produced summary to disk.
vehicle_ind_prod.to_csv(path_or_buf=save_dir + '/data_summary_percent_of_vehicle_industrially_produced.csv')

## pct vehicle fortified

In [100]:
# Summary rows for the share of the vehicle that is fortified.
vehicle_fortified = data_counts_all.loc[
    data_counts_all['value_description'] == "percent of vehicle that is fortified"
]

In [106]:
# Restrict to the country-vehicle-nutrient combinations that have no data
# point for "percent of population eating fortified vehicle"; the right merge
# keeps exactly the combinations listed on the right-hand side.
vehicle_fortified = pd.merge(
    vehicle_fortified,
    eating_fortified[eating_fortified['n_data_points'] == 0][['location_name', 'vehicle', 'nutrient']],
    how='right',
    on=['location_name', 'vehicle', 'nutrient'],
)

In [108]:
# Write the subsetted fortified-vehicle summary to disk.
vehicle_fortified.to_csv(path_or_buf=save_dir + '/data_summary_percent_of_vehicle_fortified.csv')

In [109]:
# Peek at the first five rows of the saved fortified-vehicle summary.
vehicle_fortified.head(5)

Unnamed: 0,location_name,vehicle,value_description,nutrient,n_data_points,value_means
0,Angola,maize flour,percent of vehicle that is fortified,folic acid,0,[nan]
1,Angola,maize flour,percent of vehicle that is fortified,iron,0,[nan]
2,Angola,maize flour,percent of vehicle that is fortified,vitamin a,0,[nan]
3,Angola,maize flour,percent of vehicle that is fortified,vitamin b1,0,[nan]
4,Angola,maize flour,percent of vehicle that is fortified,vitamin b12,0,[nan]
