In [1]:
import pandas as pd, numpy as np

# Compile regression estimates / examine for logical consistency

In [2]:
w_metadata_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_input_coverage_data_with_metadata.csv'


In [3]:
# Path of the input coverage CSV.
# NOTE(review): this assignment repeats the one in the previous cell verbatim; one of the two is redundant.
w_metadata_path = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs/lsff_input_coverage_data_with_metadata.csv'
# Long-format table: later cells filter it on `value_description` and read/write
# `value_mean` and `estimation_status`.
df = pd.read_csv(w_metadata_path)

In [4]:
df.estimation_status.unique()

array(['regression', 'na', 'multiplicative', 'lack of evidence'],
      dtype=object)

In [5]:
# Reshape the long-format coverage table so the three nested coverage levels sit side by side:
#   A = percent eating the vehicle
#   B = percent eating the industrially produced vehicle
#   C = percent eating the fortified vehicle
# Logical consistency requires A >= B >= C on every row of `ref`.
A = (
    df.loc[df.value_description == "percent of population eating vehicle"]
    .rename(columns={'value_mean': "A", 'estimation_status': 'A_estimation_status'})
    .drop(columns=['value_description', 'nutrient'])
)
B = (
    df.loc[df.value_description == "percent of population eating industrially produced vehicle"]
    .rename(columns={'value_mean': "B", 'estimation_status': 'B_estimation_status'})
    .drop(columns=['value_description', 'nutrient'])
)
# Only the C frame keeps `nutrient`, so it is the single source of that column in `ref`.
C = (
    df.loc[df.value_description == "percent of population eating fortified vehicle"]
    .rename(columns={'value_mean': "C", 'estimation_status': 'C_estimation_status'})
    .drop(columns=['value_description'])
)

# B is attached to A with a left join; C is then outer-joined so C rows without an A row survive.
ref = A.merge(B, on=['location_id', 'location_name', 'vehicle'], how='left')
ref = ref.merge(C, on=['location_id', 'location_name', 'vehicle'], how='outer')
ref = ref[[
    'location_name', 'vehicle', 'nutrient',
    'A', 'A_estimation_status',
    'B', 'B_estimation_status',
    'C', 'C_estimation_status',
]]


In [6]:
ref

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status
0,Ethiopia,maize flour,folic acid,59.259825,regression,27.169813,regression,0.000000,regression
1,Ethiopia,maize flour,iron,59.259825,regression,27.169813,regression,0.000000,regression
2,Ethiopia,maize flour,vitamin a,59.259825,regression,27.169813,regression,0.000000,regression
3,Ethiopia,maize flour,zinc,59.259825,regression,27.169813,regression,0.000000,regression
4,Ethiopia,oil,vitamin a,29.350000,regression,70.000000,na,0.000000,regression
...,...,...,...,...,...,...,...,...,...
220,Sudan,oil,vitamin a,50.646725,regression,80.000000,na,80.000000,na
221,Sudan,wheat flour,folic acid,88.758036,regression,35.365713,regression,1.760469,regression
222,Sudan,wheat flour,iron,88.758036,regression,35.365713,regression,1.760469,regression
223,Sudan,wheat flour,vitamin a,88.758036,regression,35.365713,regression,1.760469,regression


In [7]:
ref[(ref.A < ref.B) | (ref.B < ref.C)]

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status
4,Ethiopia,oil,vitamin a,29.350000,regression,70.000000,na,0.00000,regression
5,Ethiopia,oil,vitamin a,29.350000,regression,50.000000,na,0.00000,regression
73,Burkina Faso,oil,vitamin a,92.230000,na,51.975594,multiplicative,75.00000,na
74,Burkina Faso,wheat flour,folic acid,48.000000,na,52.300000,multiplicative,32.16450,multiplicative
75,Burkina Faso,wheat flour,iron,48.000000,na,52.300000,multiplicative,17.38975,multiplicative
...,...,...,...,...,...,...,...,...,...
212,Niger,wheat flour,folic acid,41.464396,regression,44.595706,regression,30.00000,na
213,Niger,wheat flour,iron,41.464396,regression,44.595706,regression,30.00000,na
214,Niger,wheat flour,vitamin a,41.464396,regression,44.595706,regression,30.00000,regression
215,Niger,wheat flour,zinc,41.464396,regression,44.595706,regression,30.00000,regression


In [8]:
ref.location_name.nunique()

25

In [9]:
# Directory holding every regression-estimate CSV read below. Keeping the prefix in one
# place means relocating the outputs is a one-line change.
# NOTE(review): this is an absolute, user-specific path; the notebook only runs where it exists.
estimates_dir = '/ihme/homes/beatrixh/vivarium_research_lsff/data_prep/outputs'

# Level A: percent of population eating the vehicle.
oil_est_path = f'{estimates_dir}/pct_eating_oil_regression_estimates_3_23_2021.csv'
wheat_est_path = f'{estimates_dir}/pct_eating_wheat_regression_estimates_3_22_2021.csv'
maize_est_path = f'{estimates_dir}/pct_eating_maize_regression_estimates_3_22_2021.csv'

# Level B: percent eating the industrially produced ("fortifiable") vehicle.
ip_wheat_path = f'{estimates_dir}/pct_eating_fortifiable_wheat_regression_estimates_3_31_2021.csv'
ip_oil_path = f'{estimates_dir}/pct_eating_fortifiable_oil_regression_estimates_3_31_2021.csv'
ip_maize_path = f'{estimates_dir}/pct_eating_fortifiable_maize_regression_estimates_3_31_2021.csv'

# Level C: percent eating the fortified vehicle.
fort_wheat_path = f'{estimates_dir}/pct_eating_fortified_wheat_regression_estimates_3_31_2021.csv'
fort_maize_path = f'{estimates_dir}/pct_eating_fortified_maize_regression_estimates_3_31_2021.csv'
fort_oil_path = f'{estimates_dir}/pct_eating_fortified_oil_regression_estimates_3_31_2021.csv'

In [10]:
# Regression-estimate files grouped by coverage level; each list is handed to format_data.
# A = eating the vehicle, B = eating the industrially produced ("fortifiable") vehicle,
# C = eating the fortified vehicle. The order of files within a list does not matter:
# format_data concatenates them and groups by location and vehicle.
paths_A = [oil_est_path,wheat_est_path,maize_est_path]
paths_B = [ip_wheat_path,ip_oil_path,ip_maize_path]
paths_C = [fort_wheat_path, fort_maize_path, fort_oil_path]

def format_data(paths, var, n):
    """Collapse regression draws into one mean estimate per location and vehicle.

    Parameters
    ----------
    paths : iterable of str or file-like
        CSV sources read with ``pd.read_csv`` and stacked. Each must contain
        ``location_name``, ``vehicle`` and the columns ``draw_0`` .. ``draw_{n-1}``.
    var : str
        Name of the output column that holds the mean across the draw columns.
    n : int
        Number of draw columns to average.

    Returns
    -------
    pandas.DataFrame
        One row per (``location_name``, ``vehicle``) with ``var`` and the mean of
        every other numeric column; the draw columns are removed.

    Raises
    ------
    KeyError
        If any of the ``n`` expected draw columns is missing.
    """
    draws = [f'draw_{i}' for i in range(n)]
    stacked = pd.concat([pd.read_csv(path) for path in paths])
    stacked[var] = stacked[draws].mean(axis=1)
    stacked = stacked.drop(columns=draws)

    # numeric_only=True states explicitly that non-numeric columns are dropped from the
    # group means. Older pandas did this silently; pandas >= 2.0 raises TypeError instead.
    return (
        stacked.groupby(['location_name', 'vehicle'])
        .mean(numeric_only=True)
        .reset_index()
    )

In [11]:
# Per-vehicle ratio used to size the uncertainty band placed around value_mean.
# NOTE(review): the provenance of these constants is not shown in this notebook —
# presumably estimated elsewhere; confirm and cite the source.
scale_over_mean_map = {
    "maize flour":0.578996, 
    "oil":0.069702,
    "wheat flour":0.507994
}

In [12]:
# Attach twice the per-vehicle ratio to every row. The bounds are later computed as
# value_mean -/+ (scale_over_mean * value_mean) / 2, so the half-width of the band
# equals (mapped ratio) * value_mean. Vehicles missing from the map get NaN.
df['scale_over_mean'] = 2 * df.vehicle.map(scale_over_mean_map)

In [13]:
# Collapse each set of regression draws to one mean per (location_name, vehicle).
# The level-A files are read with 1,000 draw columns; the B and C files with 500.
eats = format_data(paths_A, 'reg_A', n = 1_000)
ip = format_data(paths_B, 'reg_B', n = 500)
fort = format_data(paths_C, 'reg_C',n = 500)

In [14]:
df = df.merge(eats, on = ['location_name','vehicle'], how = 'outer')

In [15]:
df['updated'] = False

In [16]:
# Level A ("percent of population eating vehicle"): take the regression mean reg_A for rows
# that are already regression-based or have no value, provided a regression estimate exists.
eats_rows = (
    (df.value_description == "percent of population eating vehicle")
    & ((df.estimation_status == "regression") | df.value_mean.isna())
    & df.reg_A.notna()
)

# One selection serves all three writes: relabelling the selected rows as 'regression'
# cannot add or remove rows from the selection.
df.loc[eats_rows, 'updated'] = True
df.loc[eats_rows, 'estimation_status'] = 'regression'
df.loc[eats_rows, 'value_mean'] = df.reg_A

In [17]:
df = df.merge(ip, on = ['location_name','vehicle'], how = 'outer')

In [18]:
# Level B ("percent of population eating industrially produced vehicle"): take the
# regression mean reg_B for rows that are already regression-based or have no value.
#
# Every write, including the `updated` flag, is gated on reg_B being present. A row must
# only be flagged `updated` when its value really came from reg_B, because `updated`
# later decides which rows get their percentile bounds overwritten.
ip_rows = (
    (df.value_description == "percent of population eating industrially produced vehicle")
    & ((df.estimation_status == "regression") | df.value_mean.isna())
    & df.reg_B.notna()
)

df.loc[ip_rows, 'updated'] = True
df.loc[ip_rows, 'estimation_status'] = 'regression'
df.loc[ip_rows, 'value_mean'] = df.reg_B

In [19]:
df = df.merge(fort, on = ['location_name','vehicle'], how = 'outer')

In [20]:
# Level C ("percent of population eating fortified vehicle"): take the regression mean
# reg_C for rows that are already regression-based or have no value.
#
# Every write is gated on reg_C, the column actually being copied. Gating on another
# level's estimate (reg_A / reg_B) would overwrite value_mean with NaN, and still mark
# the row as an updated regression value, wherever reg_C is missing.
fort_rows = (
    (df.value_description == "percent of population eating fortified vehicle")
    & ((df.estimation_status == "regression") | df.value_mean.isna())
    & df.reg_C.notna()
)

df.loc[fort_rows, 'updated'] = True
df.loc[fort_rows, 'estimation_status'] = 'regression'
df.loc[fort_rows, 'value_mean'] = df.reg_C

In [21]:
df[(df.updated)].estimation_status.unique()

array(['regression'], dtype=object)

In [22]:
# Symmetric band around value_mean, truncated to the valid percentage range [0, 100].
half_width = (df.scale_over_mean * df.value_mean) / 2
df['lower'] = np.clip(df.value_mean - half_width, 0, 100)
df['upper'] = np.clip(df.value_mean + half_width, 0, 100)

In [23]:
# Replace the percentile bounds only on rows flagged `updated`; all other rows keep
# whatever bounds they already had.
df.loc[(df.updated),'value_025_percentile'] = df.lower
df.loc[(df.updated),'value_975_percentile'] = df.upper

In [28]:
# Rebuild the side-by-side A/B/C table from the current `df` so the consistency check
# (A >= B >= C) can be repeated on the updated values.
#   A = percent eating the vehicle
#   B = percent eating the industrially produced vehicle
#   C = percent eating the fortified vehicle
A = (
    df.loc[df.value_description == "percent of population eating vehicle"]
    .rename(columns={'value_mean': "A", 'estimation_status': 'A_estimation_status'})
    .drop(columns=['value_description', 'nutrient'])
)
B = (
    df.loc[df.value_description == "percent of population eating industrially produced vehicle"]
    .rename(columns={'value_mean': "B", 'estimation_status': 'B_estimation_status'})
    .drop(columns=['value_description', 'nutrient'])
)
# Only the C frame keeps `nutrient`, so it is the single source of that column in `ref`.
C = (
    df.loc[df.value_description == "percent of population eating fortified vehicle"]
    .rename(columns={'value_mean': "C", 'estimation_status': 'C_estimation_status'})
    .drop(columns=['value_description'])
)

# B is attached to A with a left join; C is then outer-joined so C rows without an A row survive.
ref = A.merge(B, on=['location_id', 'location_name', 'vehicle'], how='left')
ref = ref.merge(C, on=['location_id', 'location_name', 'vehicle'], how='outer')
ref = ref[[
    'location_name', 'vehicle', 'nutrient',
    'A', 'A_estimation_status',
    'B', 'B_estimation_status',
    'C', 'C_estimation_status',
]]


In [29]:
ref[(ref.A < ref.B) | (ref.B < ref.C)]

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status
4,Ethiopia,oil,vitamin a,29.35,regression,70.0,na,0.0,regression
5,Ethiopia,oil,vitamin a,29.35,regression,50.0,na,0.0,regression
73,Burkina Faso,oil,vitamin a,92.23,na,51.975594,multiplicative,75.0,na
74,Burkina Faso,wheat flour,folic acid,48.0,na,52.3,multiplicative,32.1645,multiplicative
75,Burkina Faso,wheat flour,iron,48.0,na,52.3,multiplicative,17.38975,multiplicative
76,Burkina Faso,wheat flour,vitamin a,48.0,na,52.3,multiplicative,12.388562,regression
77,Burkina Faso,wheat flour,zinc,48.0,na,52.3,multiplicative,12.388562,regression
87,Kenya,wheat flour,folic acid,36.58,na,30.95,na,34.703333,multiplicative
88,Kenya,wheat flour,iron,36.58,na,30.95,na,33.83575,multiplicative
89,Kenya,wheat flour,vitamin a,36.58,na,30.95,na,34.703333,multiplicative


In [30]:
ref[(ref.A.isna() | ref.B.isna() | ref.C.isna())]

Unnamed: 0,location_name,vehicle,nutrient,A,A_estimation_status,B,B_estimation_status,C,C_estimation_status
145,Indonesia,wheat flour,vitamin a,75.761649,regression,29.655911,regression,,regression
165,China,oil,vitamin a,57.651373,regression,63.577005,regression,,regression
174,Egypt,oil,vitamin a,54.973297,regression,70.101964,regression,,regression


In [31]:
## rejection sampling

In [56]:
# Force one location_id (213) onto every Niger row.
# NOTE(review): presumably some Niger rows carry a missing or different id (for example
# rows introduced by the outer merges) — confirm why this override is needed.
df.loc[(df.location_name=="Niger"),'location_id'] = 213

In [57]:
df.shape, df.drop_duplicates().shape

((350, 24), (335, 24))

In [58]:
df = df.drop_duplicates()

In [None]:
# Columns kept in the exported table; the export cell appends 'u5_applicable' and
# 'wra_applicable' to this list.
final_cols = ['location_id', 'location_name', 'sub_population', 'vehicle',
       'value_description', 'nutrient', 'value_mean', 'value_025_percentile',
       'value_975_percentile']

In [None]:
# Write the final table to CSV.
# NOTE(review): `all_inputted_data_path` is not assigned in any cell visible in this
# notebook; it must be defined before this cell can run.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is written
# as an extra, unnamed leading column.
df[final_cols + ['u5_applicable','wra_applicable']].to_csv(all_inputted_data_path)

In [None]:
# `break` outside a loop is a SyntaxError — presumably a deliberate stopper so that
# "Run All" halts here and the exploratory cells below are not executed.
break

In [None]:
# df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
#    (df.value_description=="percent of population eating indusrially produced vehicle") & 
#    (df.reg_A.notna()),'estimation_status'] = 'regression'

# df.loc[((df.estimation_status=="regression") | (df.value_mean.isna())) &
#    (df.value_description=="percent of population eating vehicle") & 
#    (df.reg_A.notna()),'value_mean'] = df.reg_A

In [None]:
# `break` outside a loop is a SyntaxError — presumably a deliberate stopper so that
# "Run All" does not continue into the rejection-sampling scratch cells below.
break

In [None]:
# impute A
ref = ref.merge(eats, on = ['location_name','vehicle'], how = 'outer')
ref.loc[(ref.A.isna()),'A'] = ref.reg_A

In [None]:
df = ref.copy()

In [None]:
# merge on B and filter
est = pd.concat([pd.read_csv(path) for path in paths_B])

est = est.groupby(['location_name','vehicle']).mean().reset_index()

df = df.merge(est, on = ['location_name','vehicle'], how = 'outer')

draws = [f'draw_{i}' for i in range(500)]

for draw in draws:
    df.loc[(df.A < df[draw]) | 
                 (df.C > df[draw]),draw] = np.nan

df['reg_B'] = df[draws].mean(axis=1)
df = df.drop(columns = draws)

# these are rows you still dont have estimates for
# df[(df.B.isna()) & (df.reg_B.isna())]

df.loc[(df.B.isna()) | (df.B_estimation_status=='regression'),'B'] = df.reg_B

In [None]:
df[(df.A < df.B) | (df.B < df.C)]

In [None]:
df[(df.B < df.C) & (df.C_estimation_status=="regression")]

In [None]:
# merge on C and filter
est = pd.concat([pd.read_csv(path) for path in paths_C])

est = est.groupby(['location_name','vehicle']).mean().reset_index()

df = df.merge(est, on = ['location_name','vehicle'], how = 'outer')

draws = [f'draw_{i}' for i in range(500)]

for draw in draws:
    df.loc[(df.A < df[draw]) | 
                 (df.B < df[draw]),draw] = np.nan

df['reg_C'] = df[draws].mean(axis=1)
df = df.drop(columns = draws)

# these are rows you still dont have estimates for
# df[(df.B.isna()) & (df.reg_B.isna())]

df.loc[(df.C.isna()) | (df.C_estimation_status=='regression'),'C'] = df.reg_C

In [None]:
df[(df.B < df.C) & (df.C_estimation_status=="regression")]

In [None]:
df[(df.A < df.B) | (df.B < df.C)].shape

In [None]:
ref[(ref.A < ref.B) | (ref.B < ref.C)].shape

In [None]:
df[(df.A < df.B)]

In [None]:
df[(df.B < df.C)]

In [None]:
df[(df.A.isna() | df.B.isna() | df.C.isna())]

In [None]:
df

In [None]:
# Reshape the wide A/B/C table back to long format: one frame per coverage level, each
# carrying its estimate in `value_mean_est` plus the matching value_description label.
#
# The labels must be spelled exactly as in the input table ("percent of population ...",
# the spelling the filters on the input data use), because the later merge with the
# original data joins on value_description; any other spelling leaves every row unmatched.
temp_a = df[['location_name','vehicle','nutrient','A']].rename(columns = {'A':'value_mean_est'}).drop_duplicates()
temp_a['value_description'] = "percent of population eating vehicle"

temp_b = df[['location_name','vehicle','nutrient','B']].rename(columns = {'B':'value_mean_est'}).drop_duplicates()
temp_b['value_description'] = "percent of population eating industrially produced vehicle"

temp_c = df[['location_name','vehicle','nutrient','C']].rename(columns = {'C':'value_mean_est'}).drop_duplicates()
temp_c['value_description'] = "percent of population eating fortified vehicle"

In [None]:
imputed = pd.concat([temp_a,temp_b,temp_c])

In [None]:
imputed.head()

In [None]:
original = pd.read_csv(w_metadata_path)

In [None]:
# Outer-join the original input table with the imputed estimates. Rows pair up only when
# location_name, vehicle, value_description and nutrient are all identical on both sides;
# anything else is kept as an unmatched row from one side.
test = original.merge(imputed, on = ['location_name','vehicle','value_description','nutrient'], how = 'outer')

In [None]:
test.loc[((test.value_mean.isna()) | (test.estimation_status=="regression")) & (test.value_mean_est.notna())][['location_name','vehicle','value_description','nutrient','value_mean_est']]

In [None]:
imputed.shape, original.shape

In [None]:
original.location_name.nunique()

In [None]:
imputed.location_name.nunique()

In [None]:
imputed

In [None]:
# Count the non-null estimates sharing each (location, vehicle, description, nutrient)
# key; a count above 1 flags a key with more than one estimate.
key_cols = ['location_name', 'vehicle', 'value_description', 'nutrient']
imputed['ac'] = imputed.groupby(key_cols)['value_mean_est'].transform('count')

In [None]:
imputed[(imputed.ac > 1)]

In [None]:
imputed[(imputed.ac > 1)].drop_duplicates()

In [None]:
test[test.value_mean_est > 1]

In [None]:
test.value_mean_est.min(), test.value_mean_est.max()