In [1]:
import pandas as pd, numpy as np

# Compile regression estimates / examine for logical consistency

In [2]:
w_metadata_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/04_27_lsff_input_coverage_data_with_metadata.csv'
df = pd.read_csv(w_metadata_path)

In [3]:
df.estimation_status.unique()

array(['regression', 'na', 'multiplicative', 'lack of evidence'],
      dtype=object)

In [4]:
df[(df.value_mean.isna())]

Unnamed: 0,location_id,location_name,sub_population,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,u5_applicable,wra_applicable,estimation_status,source_citation,source_link,CI_source,data_choice_notes


In [5]:
df[df.value_025_percentile.isna()].location_name.unique()

array(['Angola', 'China', 'Egypt', 'Ghana', 'Madagascar', 'Niger',
       'Sudan'], dtype=object)

In [6]:
A = df[(df.value_description=="percent of population eating vehicle")].rename(columns={'value_mean':"A",'estimation_status':'A_estimation_status'})
B = df[(df.value_description=="percent of population eating industrially produced vehicle")].rename(columns={'value_mean':"B",'estimation_status':'B_estimation_status'})
C = df[(df.value_description=="percent of population eating fortified vehicle")].rename(columns={'value_mean':"C",'estimation_status':'C_estimation_status'})

A = A.drop(columns=['value_description','nutrient'])
B = B.drop(columns=['value_description','nutrient'])
C = C.drop(columns=['value_description'])

ref = A.merge(B, on = ['location_id','location_name','vehicle'], how = 'left').merge(C, on = ['location_id','location_name','vehicle'], how = 'outer')[['location_name','vehicle','nutrient','A','A_estimation_status','B','B_estimation_status','C','C_estimation_status']]


In [7]:
ref

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status
0,Ethiopia,maize flour,folic acid,59.259825,regression,27.169813,regression,0.000000,regression
1,Ethiopia,maize flour,iron,59.259825,regression,27.169813,regression,0.000000,regression
2,Ethiopia,maize flour,vitamin a,59.259825,regression,27.169813,regression,0.000000,regression
3,Ethiopia,maize flour,zinc,59.259825,regression,27.169813,regression,0.000000,regression
4,Ethiopia,oil,vitamin a,29.350000,regression,70.000000,na,0.000000,regression
...,...,...,...,...,...,...,...,...,...
211,Sudan,oil,vitamin a,50.646725,regression,80.000000,na,80.000000,na
212,Sudan,wheat flour,folic acid,88.758036,regression,35.365713,regression,1.760469,regression
213,Sudan,wheat flour,iron,88.758036,regression,35.365713,regression,1.760469,regression
214,Sudan,wheat flour,vitamin a,88.758036,regression,35.365713,regression,1.760469,regression


In [8]:
# problem rows (less fortifiable than is fortified)
ref[(ref.B < ref.C)]

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status
73,Burkina Faso,oil,vitamin a,92.23,na,51.975594,multiplicative,75.0,na
87,Kenya,wheat flour,folic acid,36.58,na,30.95,na,34.703333,multiplicative
88,Kenya,wheat flour,iron,36.58,na,30.95,na,33.83575,multiplicative
89,Kenya,wheat flour,vitamin a,36.58,na,30.95,na,34.703333,multiplicative
90,Kenya,wheat flour,zinc,36.58,na,30.95,na,34.703333,multiplicative
124,Côte d'Ivoire,maize flour,folic acid,84.907643,regression,5.387876,regression,22.793768,regression
125,Côte d'Ivoire,maize flour,iron,84.907643,regression,5.387876,regression,22.793768,regression
126,Côte d'Ivoire,maize flour,vitamin a,84.907643,regression,5.387876,regression,22.793768,regression
127,Côte d'Ivoire,maize flour,zinc,84.907643,regression,5.387876,regression,22.793768,regression
129,Côte d'Ivoire,wheat flour,folic acid,54.7,na,10.2,na,10.599703,regression


In [9]:
ref[(ref.B.isna()) | (ref.C.isna())]

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status


In [10]:
ref.location_name.nunique()

25

In [11]:
oil_est_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_oil_regression_estimates_3_23_2021.csv'
wheat_est_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_wheat_regression_estimates_3_22_2021.csv'
maize_est_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_maize_regression_estimates_3_22_2021.csv'

ip_wheat_path =  '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortifiable_wheat_regression_estimates_3_31_2021.csv'
ip_oil_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortifiable_oil_regression_estimates_3_31_2021.csv'
ip_maize_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortifiable_maize_regression_estimates_3_31_2021.csv'

fort_wheat_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortified_wheat_regression_estimates_3_31_2021.csv'
fort_maize_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortified_maize_regression_estimates_3_31_2021.csv'
fort_oil_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/pct_eating_fortified_oil_regression_estimates_3_31_2021.csv'

In [12]:
paths_A = [oil_est_path,wheat_est_path,maize_est_path]
paths_B = [ip_wheat_path,ip_oil_path,ip_maize_path]
paths_C = [fort_wheat_path, fort_maize_path, fort_oil_path]

def format_data(paths, var, n):
    draws = [f'draw_{i}' for i in range(n)]
    df = pd.concat([pd.read_csv(path) for path in paths])
    df[var] = df[draws].mean(axis=1)
    df = df.drop(columns=draws)
    
    return df.groupby(['location_name','vehicle']).mean().reset_index()

In [13]:
scale_over_mean_map = {
    "maize flour":0.578996, 
    "oil":0.069702,
    "wheat flour":0.507994
}

In [14]:
df['scale_over_mean'] = 2 * df.vehicle.map(scale_over_mean_map)

In [15]:
eats = format_data(paths_A, 'reg_A', n = 1_000)
ip = format_data(paths_B, 'reg_B', n = 500)
fort = format_data(paths_C, 'reg_C',n = 500)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [16]:
eats.head()

Unnamed: 0,location_name,vehicle,reg_A
0,Angola,maize flour,51.100278
1,Angola,oil,90.384708
2,Angola,wheat flour,27.17197
3,Bangladesh,oil,79.2
4,Bangladesh,wheat flour,62.233333


In [17]:
df = df.merge(eats, on = ['location_name','vehicle'], how = 'outer')

In [18]:
df['updated'] = False

In [19]:
df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating vehicle") & 
   (df.reg_A.notna()),'updated'] = True

df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating vehicle") & 
   (df.reg_A.notna()),'estimation_status'] = 'regression'

df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating vehicle") & 
   (df.reg_A.notna()),'value_mean'] = df.reg_A

In [20]:
ip.head()

Unnamed: 0,location_name,vehicle,reg_B
0,Angola,maize flour,9.158853
1,Angola,oil,39.102709
2,Angola,wheat flour,26.261094
3,Bangladesh,oil,87.95
4,Bangladesh,wheat flour,38.241333


In [21]:
df = df.merge(ip, on = ['location_name','vehicle'], how = 'outer')

In [22]:
df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating industrially produced vehicle") & 
   (df.reg_B.notna()),'updated'] = True

df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating industrially produced vehicle") & 
   (df.reg_B.notna()),'estimation_status'] = 'regression'

df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating industrially produced vehicle") & 
   (df.reg_B.notna()),'value_mean'] = df.reg_B

In [23]:
fort.head()

Unnamed: 0,location_name,vehicle,reg_C
0,Angola,maize flour,9.191133
1,Angola,oil,8.805705
2,Angola,wheat flour,16.888381
3,Bangladesh,oil,12.811708
4,Bangladesh,wheat flour,0.0


In [24]:
df = df.merge(fort, on = ['location_name','vehicle'], how = 'outer')

In [25]:
df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating fortified vehicle") & 
   (df.reg_C.notna()),'updated'] = True

df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating fortified vehicle") & 
   (df.reg_B.notna()),'estimation_status'] = 'regression'

df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
   (df.value_description=="percent of population eating fortified vehicle") & 
   (df.reg_B.notna()),'value_mean'] = df.reg_C

In [26]:
df[(df.updated)].estimation_status.unique()

array(['regression'], dtype=object)

In [27]:
df['lower'] = np.clip(df.value_mean - (df.scale_over_mean * df.value_mean)/2, 0, 100)
df['upper'] = np.clip(df.value_mean + (df.scale_over_mean * df.value_mean)/2, 0, 100)

In [28]:
df.loc[(df.updated),'value_025_percentile'] = df.lower
df.loc[(df.updated),'value_975_percentile'] = df.upper

In [29]:
df.loc[(df.value_025_percentile.isna()),'value_025_percentile'] = df.lower
df.loc[(df.value_975_percentile.isna()),'value_975_percentile'] = df.upper

In [30]:
A = df[(df.value_description=="percent of population eating vehicle")].rename(
    columns={
        'value_mean':"A",
        'estimation_status':'A_estimation_status',
        'value_025_percentile':'A_lower',
        'value_975_percentile':'A_upper'
    })
B = df[(df.value_description=="percent of population eating industrially produced vehicle")].rename(
    columns={
        'value_mean':"B",
        'estimation_status':'B_estimation_status',
        'value_025_percentile':'B_lower',
        'value_975_percentile':'B_upper'
    })

C = df[(df.value_description=="percent of population eating fortified vehicle")].rename(
    columns={
        'value_mean':"C",
        'estimation_status':'C_estimation_status',
        'value_025_percentile':'C_lower',
        'value_975_percentile':'C_upper'
    })

In [31]:
# B = df[(df.value_description=="percent of population eating industrially produced vehicle")].rename(columns={'value_mean':"B",'estimation_status':'B_estimation_status'})
# C = df[(df.value_description=="percent of population eating fortified vehicle")].rename(columns={'value_mean':"C",'estimation_status':'C_estimation_status'})

A = A.drop(columns=['value_description','nutrient'])
B = B.drop(columns=['value_description','nutrient'])
C = C.drop(columns=['value_description'])

ref = A.merge(B, on = ['location_id','location_name','vehicle'], how = 'left').merge(C, on = ['location_id','location_name','vehicle'], how = 'outer') #[['location_name','vehicle','nutrient','A','A_estimation_status','B','B_estimation_status','C','C_estimation_status']]

In [32]:
ref = ref[['location_name','vehicle','nutrient',
           'A','A_lower','A_upper','A_estimation_status',
           'B','B_lower','B_upper','B_estimation_status',
           'C','C_lower','C_upper','C_estimation_status']]

In [33]:
ref

Unnamed: 0,location_name,vehicle,nutrient,A,A_lower,A_upper,A_estimation_status,B,B_lower,B_upper,B_estimation_status,C,C_lower,C_upper,C_estimation_status
0,Ethiopia,maize flour,folic acid,59.263848,24.950317,93.577380,regression,27.167514,11.437632,42.897396,regression,0.000000,0.000000,0.000000,regression
1,Ethiopia,maize flour,iron,59.263848,24.950317,93.577380,regression,27.167514,11.437632,42.897396,regression,0.000000,0.000000,0.000000,regression
2,Ethiopia,maize flour,vitamin a,59.263848,24.950317,93.577380,regression,27.167514,11.437632,42.897396,regression,0.000000,0.000000,0.000000,regression
3,Ethiopia,maize flour,zinc,59.263848,24.950317,93.577380,regression,27.167514,11.437632,42.897396,regression,0.000000,0.000000,0.000000,regression
4,Ethiopia,oil,vitamin a,29.350000,27.304246,31.395754,regression,70.000000,64.070691,75.929309,na,0.000000,0.000000,0.000000,regression
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,Sudan,oil,vitamin a,50.646725,47.116547,54.176903,regression,80.000000,74.423840,85.576160,na,80.000000,74.423840,85.576160,na
212,Sudan,wheat flour,folic acid,88.823684,43.701786,100.000000,regression,57.902255,28.488257,87.316254,regression,6.525113,3.210395,9.839831,regression
213,Sudan,wheat flour,iron,88.823684,43.701786,100.000000,regression,57.902255,28.488257,87.316254,regression,6.525113,3.210395,9.839831,regression
214,Sudan,wheat flour,vitamin a,88.823684,43.701786,100.000000,regression,57.902255,28.488257,87.316254,regression,6.525113,3.210395,9.839831,regression


In [34]:
ref[(ref.B < ref.C)][['location_name','A','B','C','B_upper','C_lower']]

Unnamed: 0,location_name,A,B,C,B_upper,C_lower
73,Burkina Faso,92.23,51.975594,75.0,77.976411,37.481212
87,Kenya,36.58,30.95,34.703333,48.296257,15.25348
88,Kenya,36.58,30.95,33.83575,48.296257,14.872143
89,Kenya,36.58,30.95,34.703333,48.296257,15.25348
90,Kenya,36.58,30.95,34.703333,48.296257,15.25348
130,Côte d'Ivoire,54.7,10.2,13.4015,13.1,9.082841
143,Indonesia,75.761649,29.655911,73.1,44.720935,49.543388
144,Indonesia,75.761649,29.655911,73.1,44.720935,49.543388
146,Indonesia,75.761649,29.655911,73.1,44.720935,49.543388
156,Angola,51.100278,9.158853,9.191133,14.461792,3.869504


In [35]:
ref[(ref.B.isna()) | (ref.C.isna())][['location_name','vehicle','nutrient','A_estimation_status','A','B_estimation_status','B','C_estimation_status','C']]

Unnamed: 0,location_name,vehicle,nutrient,A_estimation_status,A,B_estimation_status,B,C_estimation_status,C
145,Indonesia,wheat flour,vitamin a,regression,75.761649,regression,29.655911,regression,
165,China,oil,vitamin a,regression,57.651373,regression,63.577005,regression,
174,Egypt,oil,vitamin a,regression,54.973297,regression,70.101964,regression,


In [36]:
ref[(ref.B_upper < ref.C_lower)][['location_name','vehicle','nutrient','A_estimation_status','A','B_estimation_status','B','C_estimation_status','C']]

Unnamed: 0,location_name,vehicle,nutrient,A_estimation_status,A,B_estimation_status,B,C_estimation_status,C
143,Indonesia,wheat flour,folic acid,regression,75.761649,regression,29.655911,na,73.1
144,Indonesia,wheat flour,iron,regression,75.761649,regression,29.655911,na,73.1
146,Indonesia,wheat flour,zinc,regression,75.761649,regression,29.655911,na,73.1


In [37]:
# these are the locations rejection sampling should be able to fix
sub = ref.loc[(ref.B < ref.C) & ~(ref.B_upper < ref.C_lower)]

In [38]:
sub

Unnamed: 0,location_name,vehicle,nutrient,A,A_lower,A_upper,A_estimation_status,B,B_lower,B_upper,B_estimation_status,C,C_lower,C_upper,C_estimation_status
73,Burkina Faso,oil,vitamin a,92.23,46.091896,100.0,na,51.975594,25.974777,77.976411,multiplicative,75.0,37.481212,100.0,na
87,Kenya,wheat flour,folic acid,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,34.703333,15.25348,54.153186,multiplicative
88,Kenya,wheat flour,iron,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,33.83575,14.872143,52.799357,multiplicative
89,Kenya,wheat flour,vitamin a,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,34.703333,15.25348,54.153186,multiplicative
90,Kenya,wheat flour,zinc,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,34.703333,15.25348,54.153186,multiplicative
130,Côte d'Ivoire,wheat flour,iron,54.7,50.1,59.6,na,10.2,7.5,13.1,na,13.4015,9.082841,17.720159,multiplicative
156,Angola,maize flour,folic acid,51.100278,21.513422,80.687135,regression,9.158853,3.855914,14.461792,regression,9.191133,3.869504,14.512762,regression
157,Angola,maize flour,iron,51.100278,21.513422,80.687135,regression,9.158853,3.855914,14.461792,regression,9.191133,3.869504,14.512762,regression
158,Angola,maize flour,vitamin a,51.100278,21.513422,80.687135,regression,9.158853,3.855914,14.461792,regression,9.191133,3.869504,14.512762,regression
159,Angola,maize flour,zinc,51.100278,21.513422,80.687135,regression,9.158853,3.855914,14.461792,regression,9.191133,3.869504,14.512762,regression


In [39]:
fortifiable = sub[['location_name','vehicle','nutrient','B','B_lower','B_upper','B_estimation_status']]
fortifiable['value_description'] = 'B'

fortified = sub[['location_name','vehicle','nutrient','C','C_lower','C_upper','C_estimation_status']]
fortified['value_description'] = 'C'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [40]:
import scipy.stats
import scipy.integrate as integrate

In [41]:
sub.head()

Unnamed: 0,location_name,vehicle,nutrient,A,A_lower,A_upper,A_estimation_status,B,B_lower,B_upper,B_estimation_status,C,C_lower,C_upper,C_estimation_status
73,Burkina Faso,oil,vitamin a,92.23,46.091896,100.0,na,51.975594,25.974777,77.976411,multiplicative,75.0,37.481212,100.0,na
87,Kenya,wheat flour,folic acid,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,34.703333,15.25348,54.153186,multiplicative
88,Kenya,wheat flour,iron,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,33.83575,14.872143,52.799357,multiplicative
89,Kenya,wheat flour,vitamin a,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,34.703333,15.25348,54.153186,multiplicative
90,Kenya,wheat flour,zinc,36.58,16.078349,57.081651,na,30.95,13.603743,48.296257,na,34.703333,15.25348,54.153186,multiplicative


In [42]:
replacement = pd.DataFrame()
for i, row in ref.loc[(ref.B < ref.C) & ~(ref.B_upper < ref.C_lower)].iterrows():
    std_b = np.max([(row.B_upper - row.B),(row.B - row.B_lower)]) / 1.96
    b_a = (0 - row.B) / std_b
    b_b = (100 - row.B) / std_b
    
    b_draws = scipy.stats.truncnorm.rvs(b_a, b_b, row.B, std_b, size = 500_000).tolist()
    
    std_c = np.max([(row.C_upper - row.C),(row.C - row.C_lower)]) / 1.96
    c_a = (0 - row.C) / std_c
    c_b = (100 - row.C) / std_c
    
    c_draws = scipy.stats.truncnorm.rvs(c_a, c_b, row.C, std_c, size = 500_000).tolist()
    
    all_draws = pd.DataFrame({
    'b':b_draws,
    'c':c_draws
    })

    all_draws = all_draws[all_draws.b > all_draws.c].T #only keep logical draws
    means = all_draws.mean(axis=1)
    lowers = all_draws.quantile(.025, axis=1)
    uppers = all_draws.quantile(.975, axis=1)
    
    
    new = pd.DataFrame({
    'location_name': [row.location_name] * 2,
    'vehicle': [row.vehicle] * 2,
    'nutrient': ['na',row.nutrient],
    'value_description':['percent of population eating industrially produced vehicle','percent of population eating fortified vehicle'],
    'value_mean':means,
    'value_025_percentile':lowers,
    'value_975_percentile':uppers,
    'estimation_status': [" + ".join([row.B_estimation_status,"rejection sampling"])," + ".join([row.C_estimation_status,"rejection sampling"])]
    })
    
    replacement = replacement.append(new)

In [43]:
replacement = replacement.groupby(['location_name','vehicle','value_description','nutrient','estimation_status']).mean().reset_index()

In [44]:
rename_repl = {
    'value_mean': 'new_value_mean',
    'value_025_percentile': 'new_value_025_percentile',
    'value_975_percentile': 'new_value_975_percentile',
    'estimation_status':'new_estimation_status'
}

replacement = replacement.rename(columns=rename_repl)

In [45]:
replacement

Unnamed: 0,location_name,vehicle,value_description,nutrient,new_estimation_status,new_value_mean,new_value_025_percentile,new_value_975_percentile
0,Angola,maize flour,percent of population eating fortified vehicle,folic acid,regression + rejection sampling,7.65849,3.142445,11.893078
1,Angola,maize flour,percent of population eating fortified vehicle,iron,regression + rejection sampling,7.658707,3.126008,11.900065
2,Angola,maize flour,percent of population eating fortified vehicle,vitamin a,regression + rejection sampling,7.651943,3.164621,11.878758
3,Angola,maize flour,percent of population eating fortified vehicle,zinc,regression + rejection sampling,7.649797,3.101712,11.88093
4,Angola,maize flour,percent of population eating industrially prod...,na,regression + rejection sampling,10.69286,6.464823,15.230748
5,Burkina Faso,oil,percent of population eating fortified vehicle,vitamin a,na + rejection sampling,51.122017,24.448594,75.432995
6,Burkina Faso,oil,percent of population eating industrially prod...,na,multiplicative + rejection sampling,63.417476,41.247528,85.805233
7,Côte d'Ivoire,wheat flour,percent of population eating fortified vehicle,iron,multiplicative + rejection sampling,10.307836,7.337757,12.991637
8,Côte d'Ivoire,wheat flour,percent of population eating industrially prod...,na,na + rejection sampling,11.597039,9.091318,14.108237
9,Kenya,wheat flour,percent of population eating fortified vehicle,folic acid,multiplicative + rejection sampling,27.417595,11.588142,42.076972


In [46]:
replacement['new_value_025_percentile'] = np.clip(replacement.new_value_025_percentile - 0.75*(replacement.new_value_mean - replacement.new_value_025_percentile), 0, 100)
replacement['new_value_975_percentile'] = np.clip(replacement.new_value_975_percentile + 0.75*(replacement.new_value_975_percentile - replacement.new_value_mean), 0, 100)

In [47]:
output = df.merge(replacement, on = ['location_name','vehicle','value_description','nutrient'], how = 'left')

In [48]:
output.loc[(output.new_value_mean.notna()),'value_mean'] = output.new_value_mean
output.loc[(output.new_value_mean.notna()),'value_025_percentile'] = output.new_value_025_percentile
output.loc[(output.new_value_mean.notna()),'value_975_percentile'] = output.new_value_975_percentile

In [49]:
output.loc[(output.new_value_mean.notna()),'estimation_status'] = output.new_estimation_status

In [50]:
final_cols = ['location_id', 'location_name', 'sub_population', 'vehicle',
       'value_description', 'nutrient', 'value_mean', 'value_025_percentile',
       'value_975_percentile']

In [51]:
all_inputted_data_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_input_coverage_data.csv'

In [52]:
output[output.value_mean.isna()]

Unnamed: 0,location_id,location_name,sub_population,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,u5_applicable,...,reg_A,updated,reg_B,reg_C,lower,upper,new_estimation_status,new_value_mean,new_value_025_percentile,new_value_975_percentile
218,11,Indonesia,,wheat flour,percent of population eating fortified vehicle,vitamin a,,25.986776,100.0,True,...,75.761649,False,29.655911,,,,,,,
252,6,China,,oil,percent of population eating fortified vehicle,vitamin a,,,,True,...,57.651373,False,63.577005,,,,,,,
267,141,Egypt,,oil,percent of population eating fortified vehicle,vitamin a,,,,True,...,54.973297,False,70.101964,,,,,,,


In [53]:
## checkout if anything missing citations

In [54]:
pd.set_option('display.max_rows', 500)
output[['estimation_status','source_citation','source_link','CI_source','data_choice_notes']].drop_duplicates().sort_values('estimation_status')

Unnamed: 0,estimation_status,source_citation,source_link,CI_source,data_choice_notes
56,lack of evidence,,na,extraction,
54,multiplicative,,na,modeling,
123,multiplicative,na,na,modeling,
41,multiplicative,,,modeling,
132,multiplicative + rejection sampling,na,na,modeling,
112,multiplicative + rejection sampling,,,modeling,
226,na,GFDX,https://fortificationdata.org/country-fortific...,modeling,Keeping GFDx value (95.2%) for total populatio...
114,na,"Hess, S. Y., Brown, K. H., Sablah, M., Engle-S...",https://journals.sagepub.com/doi/pdf/10.1177/1...,modeling,Only one source; discarding the urban- and rur...
121,na,"Ferguson, Elaine et al. “Zinc, iron and calciu...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,modeling,Excluded central bureau of stats total pop num...
122,na,"Central Bureau of Statistics (Kenya), UK Depar...",NID 7375,modeling,Only one source.


In [55]:
output.loc[(output.estimation_status=="na"),['value_mean','source_citation','source_link','CI_source','data_choice_notes']].drop_duplicates()

Unnamed: 0,value_mean,source_citation,source_link,CI_source,data_choice_notes
7,70.0,"Hafebo AS, Ndao PBL, Wuehler S, et al. Overvie...",http://www.journalejnfs.com/index.php/EJNFS/ar...,modeling,kept most recent year / representative populat...
8,50.0,"Hafebo AS, Ndao PBL, Wuehler S, et al. Overvie...",http://www.journalejnfs.com/index.php/EJNFS/ar...,modeling,kept most recent year / representative populat...
11,27.0,"Hafebo AS, Ndao PBL, Wuehler S, et al. Overvie...",http://www.journalejnfs.com/index.php/EJNFS/ar...,modeling,Excluding 2000 overnment of the Federal Democr...
12,20.0,"Hafebo AS, Ndao PBL, Wuehler S, et al. Overvie...",http://www.journalejnfs.com/index.php/EJNFS/ar...,modeling,Excluding 2000 overnment of the Federal Democr...
17,99.999,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,extraction,Only one source. Note source had two numbers: ...
18,89.4,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,extraction,Only one source. Note source had two numbers: ...
19,24.3,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,extraction,Only one source. Note source had two numbers: ...
20,83.2,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,extraction,Only one source. Note source had two numbers: ...
21,7.1,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,extraction,Only one source. Note source had two numbers: ...
22,6.3,"Grant J Aaron, Valerie M Friesen, Svenja Jungj...",https://doi.org/10.3945/jn.116.245753,extraction,Only one source. Note source had two numbers: ...


In [56]:
output.loc[
    (output.estimation_status=="na") &
    (output.source_citation.isna()) & 
    (output.source_link.isna()) & 
    (output.data_choice_notes.isna()),
]

Unnamed: 0,location_id,location_name,sub_population,vehicle,value_description,nutrient,value_mean,value_025_percentile,value_975_percentile,u5_applicable,...,reg_A,updated,reg_B,reg_C,lower,upper,new_estimation_status,new_value_mean,new_value_025_percentile,new_value_975_percentile


In [57]:
temp_save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/427b_lsff_input_coverage_data.csv'
output[final_cols + ['u5_applicable','wra_applicable']].to_csv(temp_save_path, index = False)

In [63]:
wmeta_cols = ['location_id', 'location_name', 'sub_population', 'vehicle',
       'value_description', 'nutrient', 'value_mean', 'value_025_percentile',
       'value_975_percentile', 'u5_applicable', 'wra_applicable',
       'estimation_status', 'source_citation', 'source_link', 'CI_source',
       'data_choice_notes']

wmeta_save_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/04_27_lsff_input_coverage_data_with_metadata.csv'
output[wmeta_cols].to_csv(wmeta_save_path, index = False)

In [62]:
output[['estimation_status','source_citation','source_link','CI_source','data_choice_notes']].sort_values('estimation_status').drop_duplicates()

Unnamed: 0,estimation_status,source_citation,source_link,CI_source,data_choice_notes
56,lack of evidence,,na,extraction,
162,multiplicative,,,modeling,
54,multiplicative,,na,modeling,
123,multiplicative,na,na,modeling,
193,multiplicative + rejection sampling,,,modeling,
134,multiplicative + rejection sampling,na,na,modeling,
311,na,"Hess, S. Y., Brown, K. H., Sablah, M., Engle-S...",https://journals.sagepub.com/doi/pdf/10.1177/1...,modeling,Only one source. Discarded urban and rural spe...
127,na,"Ferguson, Elaine et al. “Zinc, iron and calciu...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,modeling,Excluded central bureau of stats total pop num...
130,na,"Central Bureau of Statistics (Kenya), UK Depar...",NID 7375,modeling,Only one source.
226,na,GFDX,https://fortificationdata.org/country-fortific...,modeling,Keeping GFDx value (95.2%) for total populatio...


In [None]:
# output[final_cols + ['u5_applicable','wra_applicable']].to_csv(all_inputted_data_path)

# output[final_cols + ['u5_applicable','wra_applicable']].to_csv('/ihme/homes/beatrixh/repos/scratch/lsff_input_coverage_data.csv')

In [None]:
A = output[(output.value_description=="percent of population eating vehicle")].rename(columns={'value_mean':"A",'estimation_status':'A_estimation_status'})
B = output[(output.value_description=="percent of population eating industrially produced vehicle")].rename(columns={'value_mean':"B",'estimation_status':'B_estimation_status'})
C = output[(output.value_description=="percent of population eating fortified vehicle")].rename(columns={'value_mean':"C",'estimation_status':'C_estimation_status'})

A = A.drop(columns=['value_description','nutrient'])
B = B.drop(columns=['value_description','nutrient'])
C = C.drop(columns=['value_description'])

ref2 = A.merge(B, on = ['location_id','location_name','vehicle'], how = 'left').merge(C, on = ['location_id','location_name','vehicle'], how = 'outer')[['location_name','vehicle','nutrient','A','A_estimation_status','B','B_estimation_status','C','C_estimation_status']]


In [None]:
ref2[(ref2.B < ref2.C)]

In [None]:
ref2[(ref2.B.isna() | ref2.C.isna())]

In [None]:
## bug fixes

In [None]:
output.loc[(output.location_name=="Niger"),'location_id'] = 213

In [None]:
output.shape, output.drop_duplicates().shape

In [None]:
output = output.drop_duplicates()

In [None]:
output[(output.value_mean.isna()) | (output.value_025_percentile.isna()) | (output.value_975_percentile.isna())]

In [None]:
all_inputted_data_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_input_coverage_data.csv'

In [None]:
break

In [None]:
output[(output.location_name=="Kenya") & (output.vehicle.isin(['maize flour','wheat flour'])) & ((output.nutrient=="folic acid") | (output.value_description=="percent of the population eating industrially produced vehicle"))]

In [None]:
output[(output.location_name=="Kenya") & (output.vehicle.isin(['maize flour','wheat flour'])) & ((output.value_description=="percent of population eating industrially produced vehicle"))]

In [None]:
output[final_cols + ['u5_applicable','wra_applicable']].to_csv(all_inputted_data_path, index = False)
output[final_cols + ['u5_applicable','wra_applicable']].to_csv('/ihme/homes/beatrixh/repos/scratch/lsff_input_coverage_data.csv', index = False)

## generate data richness scores

In [None]:
ref2[['A_estimation_status','B_estimation_status','C_estimation_status']].drop_duplicates()

In [None]:
ref2['score_from'] = ref2.A_estimation_status + '/' + ref2.B_estimation_status + '/' + ref2.C_estimation_status

In [None]:
score_map = {
    'regression/regression/regression': 0,
     'regression/na/regression': 1,
     'na/na/na': 5,
     'na/na/regression': 3,
     'na/na/multiplicative': 4,
     'na/multiplicative/multiplicative': 3,
     'na/na/lack of evidence': 4,
     'na/multiplicative/regression': 1,
     'na/multiplicative + rejection sampling/na + rejection sampling': 0,
     'na/na + rejection sampling/multiplicative + rejection sampling': 0,
     'regression/na/na': 5,
     'na/multiplicative/na': 4,
     'na/regression/multiplicative': 3,
     'na/regression/regression': 1,
     'na/na + rejection sampling/regression': 2,
     'na/na + rejection sampling/multiplicative': 2,
     'regression/regression/na': 2,
     'regression/regression + rejection sampling/regression + rejection sampling': 0,
     'na/regression/na': 4
    }

In [None]:
convert_4_pts = {
    0:0,
    1:1,
    2:2,
    3:2,
    4:3,
    5:4
}

In [None]:
convert_4_pts[score_map['na/na/na']]

In [None]:
ref2['score'] = ref2.score_from.map(score_map).map(convert_4_pts)

In [None]:
break

In [None]:
ref2.A_estimation_status.unique()

In [None]:
ref2.B_estimation_status.unique()

In [None]:
ref2.C_estimation_status.unique()

In [None]:
df[(df.value_mean.isna())]

In [None]:
break

In [None]:
## rejection sampling

In [None]:
df.columns

In [None]:
df.loc[(df.location_name.isin(['Angola', 'China', 'Ghana', 'Niger', 'Egypt', 'Sudan', 'Madagascar']))][['location_name', 'sub_population', 'vehicle',
       'value_description', 'nutrient', 'value_mean', 'value_025_percentile',
       'value_975_percentile', 
       'estimation_status', 'source_citation', 'source_link', 'CI_source',]].to_csv('/ihme/homes/beatrixh/repos/scratch/tier5_3_31_2021.csv', index = False)

In [None]:
break

In [None]:
# df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
#    (df.value_description=="percent of population eating indusrially produced vehicle") & 
#    (df.reg_A.notna()),'estimation_status'] = 'regression'

# df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
#    (df.value_description=="percent of population eating vehicle") & 
#    (df.reg_A.notna()),'value_mean'] = df.reg_A

In [None]:
break

In [None]:
# impute A
ref = ref.merge(eats, on = ['location_name','vehicle'], how = 'outer')
ref.loc[(ref.A.isna()),'A'] = ref.reg_A

In [None]:
df = ref.copy()

In [None]:
# merge on B and filter
est = pd.concat([pd.read_csv(path) for path in paths_B])

est = est.groupby(['location_name','vehicle']).mean().reset_index()

df = df.merge(est, on = ['location_name','vehicle'], how = 'outer')

draws = [f'draw_{i}' for i in range(500)]

for draw in draws:
    df.loc[(df.A < df[draw]) | 
                 (df.C > df[draw]),draw] = np.nan

df['reg_B'] = df[draws].mean(axis=1)
df = df.drop(columns = draws)

# these are rows you still dont have estimates for
# df[(df.B.isna()) & (df.reg_B.isna())]

df.loc[(df.B.isna()) | (df.B_estimation_status=='regression'),'B'] = df.reg_B

In [None]:
df[(df.A < df.B) | (df.B < df.C)]

In [None]:
df[(df.B < df.C) & (df.C_estimation_status=="regression")]

In [None]:
# merge on C and filter
est = pd.concat([pd.read_csv(path) for path in paths_C])

est = est.groupby(['location_name','vehicle']).mean().reset_index()

df = df.merge(est, on = ['location_name','vehicle'], how = 'outer')

draws = [f'draw_{i}' for i in range(500)]

for draw in draws:
    df.loc[(df.A < df[draw]) | 
                 (df.B < df[draw]),draw] = np.nan

df['reg_C'] = df[draws].mean(axis=1)
df = df.drop(columns = draws)

# these are rows you still dont have estimates for
# df[(df.B.isna()) & (df.reg_B.isna())]

df.loc[(df.C.isna()) | (df.C_estimation_status=='regression'),'C'] = df.reg_C

In [None]:
df[(df.B < df.C) & (df.C_estimation_status=="regression")]

In [None]:
df[(df.A < df.B) | (df.B < df.C)].shape

In [None]:
ref[(ref.A < ref.B) | (ref.B < ref.C)].shape

In [None]:
df[(df.A < df.B)]

In [None]:
df[(df.B < df.C)]

In [None]:
df[(df.A.isna() | df.B.isna() | df.C.isna())]

In [None]:
df

In [None]:
temp_a = df[['location_name','vehicle','nutrient','A']].rename(columns = {'A':'value_mean_est'}).drop_duplicates()
temp_a['value_description'] = "percent of the population eating vehicle"

temp_b = df[['location_name','vehicle','nutrient','B']].rename(columns = {'B':'value_mean_est'}).drop_duplicates()
temp_b['value_description'] = "percent of the population eating industrially produced vehicle"

temp_c = df[['location_name','vehicle','nutrient','C']].rename(columns = {'C':'value_mean_est'}).drop_duplicates()
temp_c['value_description'] = "percent of the population eating fortified vehicle"

In [None]:
imputed = pd.concat([temp_a,temp_b,temp_c])

In [None]:
imputed.head()

In [None]:
original = pd.read_csv(w_metadata_path)

In [None]:
test = original.merge(imputed, on = ['location_name','vehicle','value_description','nutrient'], how = 'outer')

In [None]:
test.loc[((test.value_mean.isna()) | (test.estimation_status=="regression")) & (test.value_mean_est.notna())][['location_name','vehicle','value_description','nutrient','value_mean_est']]

In [None]:
imputed.shape, original.shape

In [None]:
original.location_name.nunique()

In [None]:
imputed.location_name.nunique()

In [None]:
imputed

In [None]:
imputed['ac'] = imputed.groupby(['location_name','vehicle','value_description','nutrient']).transform('count').value_mean_est

In [None]:
imputed[(imputed.ac > 1)]

In [None]:
imputed[(imputed.ac > 1)].drop_duplicates()

In [None]:
test[test.value_mean_est > 1]

In [None]:
test.value_mean_est.min(), test.value_mean_est.max()