In [1]:
import pandas as pd, numpy as np

In [2]:
import pickle

# LSFF: Output data counts under different binning scenarios

In [3]:
## load the LSFF stage-0 data table from disk
path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_data_stage0_3_8_2021.csv'
df = pd.read_csv(filepath_or_buffer = path)

In [4]:
## convenience list of columns for eyeballing rows of df
check_cols = [
    'location_id',
    'location_name',
    'vehicle',
    'value_description',
    'nutrient',
    'value_mean',
    'value_025_percentile',
    'value_975_percentile',
    'notes',
]

In [5]:
## check that we have nutrients iff expected

In [6]:
# value descriptions that occur on rows whose nutrient is 'na'
df.loc[df['nutrient'] == 'na', 'value_description'].unique()

array(['percent of vehicle that is industrially produced',
       'percent of population eating industrially produced vehicle',
       'percent of population eating vehicle'], dtype=object)

In [7]:
# value descriptions that occur on rows carrying an actual nutrient
df.loc[df['nutrient'] != 'na', 'value_description'].unique()

array(['percent of population eating fortified vehicle',
       'percent of vehicle that is fortified'], dtype=object)

In [8]:
# relabel vals

#https://journals.sagepub.com/doi/pdf/10.1177/15648265120334S307
# cote divoire,
# "During the last year of the project, 206,410 MT of
# vegetable oil was marketed in Côte d’Ivoire. The fortified
# vegetable oil was covering 89% of the market."

# Only 'percent of marketshare of fortified products' is actually renamed;
# every other entry maps a label onto itself.
val_descrip_map = {'percent of population eating fortified vehicle': 'percent of population eating fortified vehicle',
 'percent of vehicle that is industrially produced': 'percent of vehicle that is industrially produced',
 'percent of vehicle that is fortified': 'percent of vehicle that is fortified',
 'percent of population eating industrially produced vehicle': 'percent of population eating industrially produced vehicle',
 'percent of marketshare of fortified products': 'percent of market covered by fortified product',
 'percent of population eating vehicle': 'percent of population eating vehicle'}

# Series.map(dict) converts every label that is not a key of the dict into NaN.
# That silently erased unexpected labels, and re-running this cell erased the
# renamed label too (the new label is not a key). Series.replace leaves labels
# without a key untouched, so the cell is safe to run more than once.
df['value_description'] = df['value_description'].replace(val_descrip_map)

In [69]:
def output_frame(cols=None,
                 data_prep_dir='/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/inputs/',
                 vehicle_nutrient_map=None,
                 country_vehicle_map=None):
    """
    INPUT: desired columns to bin lsff data on
    ---
    OUTPUT: all legal value combinations using input columns
    ---
    eg: (iodine/salt) or (china/maize flour) are illegal combinations

    Parameters
    - cols: columns to keep, chosen from 'location_name', 'vehicle', 'nutrient',
      'value_description'. None (default) keeps all four, in that order.
    - data_prep_dir: directory (with trailing separator) holding
      'lsff_vehicle_nutrient_pairs.pickle' and 'lsff_country_vehicle_pairs.pickle'.
      Only read for a map that is not passed in explicitly.
    - vehicle_nutrient_map: optional mapping vehicle -> iterable of legal nutrients.
    - country_vehicle_map: optional mapping location -> iterable of legal vehicles.
      Only its string keys are used as locations.

    Returns
    - DataFrame of the unique combinations of `cols`, ordered by
      location_name, vehicle, value_description, nutrient, with a fresh 0..n-1 index.
    """
    id_cols = ['location_name', 'vehicle', 'nutrient', 'value_description']
    if cols is None:
        cols = id_cols

    # load legal pairs (only the ones the caller did not supply)
    # NOTE(security): pickle.load can execute arbitrary code; only use trusted files.
    if vehicle_nutrient_map is None:
        with open(data_prep_dir + 'lsff_vehicle_nutrient_pairs.pickle', 'rb') as handle:
            vehicle_nutrient_map = pickle.load(handle)

    if country_vehicle_map is None:
        with open(data_prep_dir + 'lsff_country_vehicle_pairs.pickle', 'rb') as handle:
            country_vehicle_map = pickle.load(handle)

    # keep only string keys; the map may also hold non-string keys
    # (presumably location ids -- TODO confirm against the pickle's producer)
    countries = [loc for loc in country_vehicle_map.keys() if isinstance(loc, str)]

    # value_descrips
    nutrient_relevant_vds = ['percent of population eating fortified vehicle',
           'percent of vehicle that is fortified',
           'percent of market covered by fortified product']
    nutrient_irrelevant_vds = ['percent of vehicle that is industrially produced',
       'percent of population eating industrially produced vehicle',
       'percent of population eating vehicle']

    # all legal loc-vehicle-nutrient-val_ds for val_ds that have nutrient
    rows_with_nutrient = [(loc, vehicle, nutrient, vd)
                          for loc in countries
                          for vehicle in country_vehicle_map[loc]
                          for nutrient in vehicle_nutrient_map[vehicle]
                          for vd in nutrient_relevant_vds]
    # all legal loc-vehicle-nutrient-val_ds for val_ds with nutrient='na'
    rows_without_nutrient = [(loc, vehicle, 'na', vd)
                             for loc in countries
                             for vehicle in country_vehicle_map[loc]
                             for vd in nutrient_irrelevant_vds]

    # Build one frame from the combined records; this avoids DataFrame.append,
    # which was removed in pandas 2.0.
    data_counts = pd.DataFrame(rows_with_nutrient + rows_without_nutrient, columns = id_cols)
    data_counts = data_counts.sort_values(by=['location_name','vehicle','value_description','nutrient'])

    return data_counts[cols].drop_duplicates().reset_index(drop=True)

In [20]:
def add_summary_measures_helper(frame, data, n_points = True, mean_mean = True, mean_values = True):
    """
    Summarise the 'value_mean' entries of `data` within each bin (row) of `frame`.
    ---
    INPUT:
    - frame: df whose columns are exactly the id_cols defining the bins
    - data: df with extracted data containing the id_cols and a 'value_mean' column
    - n_points / mean_mean / mean_values: switches for each summary measure
    ---
    OUTPUT: copy of frame with one extra column per requested measure, in this order:
    - n_data_points: number of non-null value_mean entries in the bin
    - mean_mean: mean of value_mean in the bin
    - value_means: list of the bin's value_mean entries
    """
    id_cols = list(frame.columns)

    # left join keeps bins that have no data; their value_mean is NaN
    binned = frame.merge(data[id_cols + ['value_mean']], on = id_cols, how = 'left')
    per_bin = binned.groupby(id_cols)

    # collect (aggregated frame, output column name) pairs for the requested measures
    summaries = []
    if n_points:
        summaries.append((per_bin.count(), 'n_data_points'))
    if mean_mean:
        summaries.append((per_bin.mean(), 'mean_mean'))
    if mean_values:
        summaries.append((per_bin.aggregate(lambda vals : vals.tolist()), 'value_means'))

    # attach each measure to a copy of the bins
    result = frame.copy()
    for summary, label in summaries:
        labelled = summary.rename(columns={'value_mean': label})
        result = result.merge(labelled, on = id_cols, how = 'left')

    return result

In [136]:
def add_summary_measures_handling_estimates(frame, data, n_points = True, mean_mean = True, mean_values = True):
    """
    Summarise `data` within each bin of `frame`, preferring extracted data over estimates.
    ---
    INPUT:
    - frame: df whose columns are the id_cols defining the bins
    - data: df containing the id_cols, 'value_mean' and 'is_estimate'
      (rows with is_estimate == 1 are treated as assumptions, all others as extracted)
    - n_points / mean_mean / mean_values: which summary columns to return
    ---
    OUTPUT: tuple (summary, both)
    - summary: frame columns + requested measures + 'is_estimate' (int). A bin uses
      the assumed values, with is_estimate = 1, only when it has no extracted data points.
    - both: bins that have extracted AND assumed data, with both sets of measures
      side by side ('_assm' suffix for the assumed ones), for manual checking.
    """

    #helpers
    merge_cols = frame.columns.tolist()
    return_cols = merge_cols

    #split out into extracted data vs assumptions
    # (uses the `data` argument; the previous version read the notebook-global `df`)
    # n_points is forced on because the counts drive the decision logic below
    extracted = add_summary_measures_helper(frame, data[data.is_estimate!=1], True, mean_mean, mean_values)
    assm = add_summary_measures_helper(frame, data[data.is_estimate==1], True, mean_mean, mean_values)

    #wide on is_estimated status
    wide = extracted.merge(assm, on = merge_cols, suffixes = ('','_assm'), how = 'outer')
    assert len(wide)==len(extracted), "merge issue"
    assert len(wide)==len(assm), "merge issue"

    has_extracted = wide.n_data_points > 0
    has_assumed = wide.n_data_points_assm > 0

    #rows for which we have both extracted and assumed "data"
    both = wide[has_extracted & has_assumed]
    if len(both)>0:
        print("(!) Make sure you are okay throwing out datapoint estimates for all rows for which we also have extracted data")

    #we will throw out the assumed data, and return `both` to double check
    # (column name fixed: the old 'n_data_points_assum' typo created a stray column)
    wide.loc[has_extracted & has_assumed,'n_data_points_assm'] = 0

    #for all assumed data where we dont have extracted data, keep and mark as such
    use_assumed = (wide.n_data_points == 0) & has_assumed
    wide['is_estimate'] = use_assumed.astype(int)
    wide.loc[use_assumed,'n_data_points'] = wide['n_data_points_assm']
    if n_points:
        return_cols = return_cols + ['n_data_points']
    if mean_mean:
        wide.loc[use_assumed,'mean_mean'] = wide['mean_mean_assm']
        return_cols = return_cols + ['mean_mean']
    if mean_values:
        wide.loc[use_assumed,'value_means'] = wide['value_means_assm']
        return_cols = return_cols + ['value_means']

    return wide[return_cols + ['is_estimate']], both

## count data coverage for ALL country - vehicle - val_des - nutrient combinations

In [150]:
# every legal location / vehicle / nutrient / value_description bin
data_counts = output_frame(
    cols = ['location_name','vehicle','nutrient','value_description'],
)

In [151]:
# summarise every bin; `check` receives the bins that have both extracted and assumed data
data_counts_full, check = add_summary_measures_handling_estimates(
    frame = data_counts,
    data = df,
    n_points = True,
    mean_mean = True,
    mean_values = True,
)

(!) Make sure you are okay throwing out datapoint estimates for all rows for which we also have extracted data


In [152]:
# bins that have both extracted data and assumed estimates;
# going to drop the assumed values (where we assumed zero) for these bins
check

Unnamed: 0,location_name,vehicle,nutrient,value_description,n_data_points,mean_mean,value_means,n_data_points_assm,mean_mean_assm,value_means_assm
458,Ethiopia,salt,folic acid,percent of population eating fortified vehicle,1,0.0,[0.0],1,0.0,[0.0]
600,India,salt,iron,percent of population eating fortified vehicle,2,92.7,"[92.7, 92.7]",1,0.0,[0.0]


In [153]:
# write the full per-bin summary to csv, without the index column
data_counts_full_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/data_counts_full_3_10_2021.csv'
data_counts_full.to_csv(path_or_buf = data_counts_full_path, index = False)

## count data coverage for target country - vehicle - val_des - nutrient combinations

In [158]:
# value descriptions kept in the target subset
target_value_descriptions = [
    'percent of population eating fortified vehicle',
    'percent of population eating industrially produced vehicle',
    'percent of population eating vehicle',
]

In [159]:
# keep only the bins whose value_description is one of the targets
target_data_counts = data_counts_full.loc[
    data_counts_full['value_description'].isin(target_value_descriptions)
]

In [160]:
# display the summary rows restricted to target_value_descriptions
target_data_counts

Unnamed: 0,location_name,vehicle,nutrient,value_description,n_data_points,mean_mean,value_means,is_estimate
6,Angola,maize flour,folic acid,percent of population eating fortified vehicle,0,,[nan],0
7,Angola,maize flour,iron,percent of population eating fortified vehicle,0,,[nan],0
8,Angola,maize flour,vitamin a,percent of population eating fortified vehicle,0,,[nan],0
9,Angola,maize flour,vitamin b1,percent of population eating fortified vehicle,0,,[nan],0
10,Angola,maize flour,vitamin b12,percent of population eating fortified vehicle,0,,[nan],0
...,...,...,...,...,...,...,...,...
1281,Vietnam,wheat flour,vitamin b1,percent of population eating fortified vehicle,0,,[nan],0
1282,Vietnam,wheat flour,vitamin b12,percent of population eating fortified vehicle,0,,[nan],0
1283,Vietnam,wheat flour,zinc,percent of population eating fortified vehicle,0,,[nan],0
1284,Vietnam,wheat flour,na,percent of population eating industrially prod...,0,,[nan],0


In [161]:
# write the target-bin summary to csv, without the index column
target_data_counts_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/data_counts_ideal_bins_3_10_2021.csv'
target_data_counts.to_csv(path_or_buf = target_data_counts_path, index = False)

## count data coverage for ALL country - vehicle - val_des combos, NOT nutrients

In [165]:
# every legal location / vehicle / value_description bin (nutrient column left out)
data_counts_nutrient_na = output_frame(
    cols = ['location_name','vehicle','value_description'],
)

In [163]:
# NOTE: this cell used to call `add_summary_measures`, which is not defined in this
# notebook, so it raised a NameError on a fresh Restart & Run All. The call was removed;
# the next cell computes the summary with `add_summary_measures_handling_estimates`.

In [166]:
# summarise each bin; `check_b` receives the bins that have both extracted and assumed data
data_counts_nutrient_na, check_b = add_summary_measures_handling_estimates(
    frame = data_counts_nutrient_na,
    data = df,
    n_points = True,
    mean_mean = True,
    mean_values = True,
)

(!) Make sure you are okay throwing out datapoint estimates for all rows for which we also have extracted data


In [167]:
# display the per location / vehicle / value_description summary
data_counts_nutrient_na

Unnamed: 0,location_name,vehicle,value_description,n_data_points,mean_mean,value_means,is_estimate
0,Angola,maize flour,percent of market covered by fortified product,0,,[nan],0
1,Angola,maize flour,percent of population eating fortified vehicle,0,,[nan],0
2,Angola,maize flour,percent of population eating industrially prod...,0,,[nan],0
3,Angola,maize flour,percent of population eating vehicle,0,,[nan],0
4,Angola,maize flour,percent of vehicle that is fortified,0,,[nan],0
...,...,...,...,...,...,...,...
475,Vietnam,wheat flour,percent of population eating fortified vehicle,0,,[nan],0
476,Vietnam,wheat flour,percent of population eating industrially prod...,0,,[nan],0
477,Vietnam,wheat flour,percent of population eating vehicle,10,21.930000,"[4.2, 38.8, 46.7, 31.1, 16.4, 17.5, 9.6, 22.7,...",0
478,Vietnam,wheat flour,percent of vehicle that is fortified,7,5.357143,"[0.0, 37.5, 0.0, 0.0, 0.0, 0.0, 0.0]",0


In [168]:
# bins that have both extracted data and assumed estimates (assumed values are not used for these)
check_b

Unnamed: 0,location_name,vehicle,value_description,n_data_points,mean_mean,value_means,n_data_points_assm,mean_mean_assm,value_means_assm
169,Ethiopia,salt,percent of population eating fortified vehicle,2,42.8,"[85.6, 0.0]",1,0.0,[0.0]
223,India,salt,percent of population eating fortified vehicle,2,92.7,"[92.7, 92.7]",1,0.0,[0.0]
229,India,wheat flour,percent of population eating fortified vehicle,3,6.3,"[6.3, 6.3, 6.3]",1,0.0,[0.0]


In [169]:
# write the nutrient-agnostic summary to csv, without the index column
data_counts_nutrient_na_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/data_counts_nutrient_na_3_10_2021.csv'
data_counts_nutrient_na.to_csv(path_or_buf = data_counts_nutrient_na_path, index = False)