In [1]:
import os
import pickle

import pandas as pd, numpy as np

# LSFF: Output data counts under different binning scenarios

In [2]:
## read in data
# Extracted LSFF data (the file name suggests the stage-0 output of data_prep).
# Later cells read its 'nutrient', 'value_description' and 'value_mean' columns,
# plus whichever id columns are used for binning.
path = '/ihme/homes/beatrixh/notebooks/viv_rsc/data_prep/outputs/lsff_data_stage0.csv'
df = pd.read_csv(path)

In [4]:
## for convenience
# columns worth eyeballing when spot-checking rows of the extracted data
check_cols = [
    'location_id',
    'location_name',
    'vehicle',
    'value_description',
    'nutrient',
    'value_mean',
    'value_025_percentile',
    'value_975_percentile',
    'notes',
]

In [5]:
## check that we have nutrients iff expected

In [6]:
# value descriptions that occur on rows whose nutrient is the 'na' placeholder
df.loc[df['nutrient'] == 'na', 'value_description'].unique()

array(['percent of vehicle that is industrially produced',
       'percent of population eating industrially produced vehicle',
       'percent of population eating vehicle'], dtype=object)

In [7]:
# value descriptions that occur on rows carrying an actual nutrient
df.loc[df['nutrient'] != 'na', 'value_description'].unique()

array(['percent of population eating fortified vehicle',
       'percent of vehicle that is fortified'], dtype=object)

In [8]:
# relabel vals

#https://journals.sagepub.com/doi/pdf/10.1177/15648265120334S307
# cote divoire,
# "During the last year of the project, 206,410 MT of
# vegetable oil was marketed in Côte d’Ivoire. The fortified
# vegetable oil was covering 89% of the market."

# old label -> label used for binning; only the marketshare label actually changes
val_descrip_map = {'percent of population eating fortified vehicle': 'percent of population eating fortified vehicle',
 'percent of vehicle that is industrially produced': 'percent of vehicle that is industrially produced',
 'percent of vehicle that is fortified': 'percent of vehicle that is fortified',
 'percent of population eating industrially produced vehicle': 'percent of population eating industrially produced vehicle',
 'percent of marketshare of fortified products': 'percent of market covered by fortified product',
 'percent of population eating vehicle': 'percent of population eating vehicle'}

# replace (not map): Series.map turns every label that is not a key of the dict
# into NaN, so re-running this cell after the relabel would blank out the
# 'percent of market covered by fortified product' rows. replace leaves labels
# it does not know untouched, which makes the cell safe to run repeatedly.
df['value_description'] = df['value_description'].replace(val_descrip_map)

In [55]:
def output_frame(cols= ['location_name','vehicle','nutrient','value_description'],
                 data_prep_dir = '/ihme/scratch/users/beatrixh/vivarium_data_analysis/pre_processing/lsff_project/data_prep/'):
    """
    Build the frame of all legal LSFF bins for the requested columns.
    ---
    INPUT:
    - cols: columns to bin lsff data on; any selection from
      ['location_name','vehicle','nutrient','value_description']
    - data_prep_dir: directory holding 'lsff_vehicle_nutrient_pairs.pickle'
      and 'lsff_country_vehicle_pairs.pickle'
    ---
    OUTPUT: dataframe with one row per distinct legal value combination of
    `cols` and a fresh 0..n-1 index. Rows keep the order obtained by sorting
    the full four-column frame on location_name, vehicle, value_description,
    nutrient.
    ---
    eg: (iodine/salt) or (china/maize flour) are illegal combinations
    """
    id_cols = ['location_name','vehicle','nutrient','value_description']

    # load legal pairs
    # NOTE: pickle.load can execute arbitrary code -- only point data_prep_dir
    # at files you trust.
    with open(os.path.join(data_prep_dir, 'lsff_vehicle_nutrient_pairs.pickle'), 'rb') as handle:
        vehicle_nutrient_map = pickle.load(handle)

    with open(os.path.join(data_prep_dir, 'lsff_country_vehicle_pairs.pickle'), 'rb') as handle:
        country_vehicle_map = pickle.load(handle)

    # only plain-string keys are treated as country names
    # (presumably the map also carries non-name keys -- TODO confirm)
    countries = [i for i in country_vehicle_map.keys() if type(i)==str]

    # value_descrips
    nutrient_relevant_vds = ['percent of population eating fortified vehicle',
           'percent of vehicle that is fortified',
           'percent of market covered by fortified product']
    nutrient_irrelevant_vds = ['percent of vehicle that is industrially produced',
       'percent of population eating industrially produced vehicle',
       'percent of population eating vehicle']

    # all legal loc-vehicle-nutrient-val_ds for val_ds that have nutrient
    data_counts_a = pd.DataFrame([(i,j,k,l) for i in countries
                                  for j in country_vehicle_map[i]
                                  for k in vehicle_nutrient_map[j]
                                  for l in nutrient_relevant_vds],
                                 columns = id_cols)
    # all legal loc-vehicle-nutrient-val_ds for val_ds with nutrient='na'
    data_counts_b = pd.DataFrame([(i,j,'na',k) for i in countries
                                  for j in country_vehicle_map[i]
                                  for k in nutrient_irrelevant_vds],
                                 columns = id_cols)

    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
    data_counts = pd.concat([data_counts_a, data_counts_b]).sort_values(
        by=['location_name','vehicle','value_description','nutrient'])

    return data_counts[cols].drop_duplicates().reset_index(drop=True)

In [56]:
def add_summary_measures(frame, data, n_points = True, mean_mean = True, mean_values = True):
    """
    Attach per-bin summaries of data['value_mean'] to a frame of bins.
    ---
    INPUT:
    - frame: dataframe whose columns are exactly the id columns defining a bin
    - data: extracted data holding those id columns plus a 'value_mean' column
    - n_points / mean_mean / mean_values: switches for the summaries to add
    ---
    OUTPUT: copy of frame with one extra column per enabled switch, in this order:
    - n_data_points: number of non-null value_mean entries in the bin
    - mean_mean: mean of value_mean within the bin
    - value_means: list of the bin's value_mean entries (a bin without data
      shows up as [nan], since the left merge leaves one empty row for it)
    """
    id_cols = list(frame.columns)

    # left merge keeps bins for which no data was extracted
    binned = frame.merge(data[id_cols + ['value_mean']], how = 'left', on = id_cols)
    by_bin = binned.groupby(id_cols)

    # collect the requested summaries, each indexed by the id columns
    summaries = []
    if n_points:
        summaries.append(by_bin.count().rename(columns = {'value_mean': 'n_data_points'}))
    if mean_mean:
        summaries.append(by_bin.mean().rename(columns = {'value_mean': 'mean_mean'}))
    if mean_values:
        collected = by_bin.aggregate(lambda vals: vals.tolist())
        summaries.append(collected.rename(columns = {'value_mean': 'value_means'}))

    # hang every summary onto a copy of the bins
    result = frame.copy()
    for summary in summaries:
        result = result.merge(summary, on = id_cols, how = 'left')

    return result

## count data coverage for ALL country - vehicle - val_des - nutrient combinations

In [52]:
# every legal bin at full resolution (location, vehicle, nutrient, value description)
data_counts = output_frame(
    cols=['location_name', 'vehicle', 'nutrient', 'value_description']
)

In [57]:
# count, mean and raw list of value_mean for each full-resolution bin
data_counts_full = add_summary_measures(
    frame=data_counts,
    data=df,
    n_points=True,
    mean_mean=True,
    mean_values=True,
)

In [58]:
# peek at the first five bins
data_counts_full.head(n=5)

Unnamed: 0,location_name,vehicle,nutrient,value_description,n_data_points,mean_mean,value_means
0,Angola,maize flour,folic acid,percent of market covered by fortified product,0,,[nan]
1,Angola,maize flour,iron,percent of market covered by fortified product,0,,[nan]
2,Angola,maize flour,vitamin a,percent of market covered by fortified product,0,,[nan]
3,Angola,maize flour,vitamin b1,percent of market covered by fortified product,0,,[nan]
4,Angola,maize flour,vitamin b12,percent of market covered by fortified product,0,,[nan]


In [59]:
# Write the full-resolution coverage table to CSV; index = False leaves the row index out of the file.
data_counts_full_path = '/ihme/homes/beatrixh/notebooks/viv_rsc/new_lsff/outputs/data_counts_full.csv'
data_counts_full.to_csv(data_counts_full_path, index = False)

## count data coverage for target country - vehicle - val_des - nutrient combinations

In [61]:
# value descriptions that make up the target ("ideal") bins
target_value_descriptions = [
    'percent of population eating fortified vehicle',
    'percent of population eating industrially produced vehicle',
    'percent of population eating vehicle',
]

In [65]:
# keep only the bins whose value description is one of the targets
target_data_counts = data_counts_full.loc[
    data_counts_full['value_description'].isin(target_value_descriptions)
]

In [75]:
# Write the target-bin subset of the coverage table to CSV; index = False leaves the row index out of the file.
target_data_counts_path = '/ihme/homes/beatrixh/notebooks/viv_rsc/new_lsff/outputs/data_counts_ideal_bins.csv'
target_data_counts.to_csv(target_data_counts_path, index = False)

## count data coverage for ALL country - vehicle - val_des combos, NOT nutrients

In [68]:
# legal bins with the nutrient dimension collapsed
data_counts_nutrient_na = output_frame(
    cols=['location_name', 'vehicle', 'value_description']
)

In [70]:
# Pass only the id columns as the frame. This cell overwrites its own input, so
# on a re-run data_counts_nutrient_na already carries the summary columns, and
# handing it back unchanged would make add_summary_measures look those columns
# up in df and fail. Selecting the id columns gives the same result on every run.
data_counts_nutrient_na = add_summary_measures(
    frame = data_counts_nutrient_na[['location_name','vehicle','value_description']],
    data = df, n_points = True, mean_mean = True, mean_values = True)

In [71]:
# Show the per-bin summary with nutrients collapsed (the rendered table is truncated to its first and last rows).
data_counts_nutrient_na

Unnamed: 0,location_name,vehicle,value_description,n_data_points,mean_mean,value_means
0,Angola,maize flour,percent of market covered by fortified product,0,,[nan]
1,Angola,maize flour,percent of population eating fortified vehicle,0,,[nan]
2,Angola,maize flour,percent of population eating industrially prod...,0,,[nan]
3,Angola,maize flour,percent of population eating vehicle,0,,[nan]
4,Angola,maize flour,percent of vehicle that is fortified,0,,[nan]
...,...,...,...,...,...,...
475,Vietnam,wheat flour,percent of population eating fortified vehicle,0,,[nan]
476,Vietnam,wheat flour,percent of population eating industrially prod...,0,,[nan]
477,Vietnam,wheat flour,percent of population eating vehicle,10,21.93,"[4.2, 38.8, 46.7, 31.1, 16.4, 17.5, 9.6, 22.7,..."
478,Vietnam,wheat flour,percent of vehicle that is fortified,6,0.00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [73]:
# Write the nutrient-collapsed coverage table to CSV; index = False leaves the row index out of the file.
data_counts_nutrient_na_path = '/ihme/homes/beatrixh/notebooks/viv_rsc/new_lsff/outputs/data_counts_nutrient_na.csv'
data_counts_nutrient_na.to_csv(data_counts_nutrient_na_path, index = False)