# V&V Delivery Facility Choice Model

## Setup

In [None]:
import pandas as pd, numpy as np, os
from vivarium import Artifact
import db_queries
import matplotlib.pyplot as plt
from pathlib import Path
import yaml

In [None]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning) 

In [None]:
locations = ['Pakistan', 'Ethiopia', 'Nigeria']

In [None]:
# Parameters cell for papermill
model_dir = "model27.0"

In [None]:
base_results_dir = Path("/mnt/team/simulation_science/pub/models/vivarium_gates_mncnh/results/") / model_dir

In [None]:
# Map each location name to the `results` directory of the last run (in sorted
# order) found under that location's directory.
results_dirs = {}
found_dirs = {p.stem for p in base_results_dir.iterdir()}
expected_dirs = {location.lower() for location in locations}
assert found_dirs == expected_dirs, (
    f'Expected one directory per location {sorted(expected_dirs)} in {base_results_dir}, '
    f'found {sorted(found_dirs)}'
)
for location in locations:
    location_dir = base_results_dir / location.lower()
    timestamps = sorted(location_dir.iterdir())
    if not timestamps:
        raise FileNotFoundError(f'No run directories found in {location_dir}')
    last_timestamp = timestamps[-1]
    if len(timestamps) > 1:
        print(f'Multiple timestamps: {timestamps}, using {last_timestamp}')
    # iterdir() yields paths that already include location_dir, so joining it
    # again is redundant (and would duplicate the prefix for a relative base).
    results_dirs[location] = last_timestamp / 'results'

results_dirs

In [None]:
location_ids = db_queries.get_ids('location')
location_ids = location_ids.loc[location_ids.location_name.str.lower().isin([x.lower() for x in results_dirs.keys()])]
location_ids

In [None]:
def load_yaml_file(path):
    """Open the file at ``path`` and return its contents parsed by ``yaml.safe_load``."""
    with open(path) as yaml_file:
        contents = yaml.safe_load(yaml_file)
    return contents

In [None]:
# Read each location's artifact path from the model specification stored in the
# parent directory of its results directory.
artifact_paths = {}
for location, result_dir in results_dirs.items():
    model_spec = load_yaml_file(result_dir.parent / 'model_specification.yaml')
    artifact_paths[location] = model_spec['configuration']['input_data']['artifact_path']
artifact_paths

In [None]:
def read_artifact(key, filter_terms=None):
    """Load ``key`` from every location's artifact and return draw-level data.

    Parameters
    ----------
    key
        Artifact key, passed to ``Artifact.load`` for each location.
    filter_terms
        Filter terms passed to ``Artifact``. When omitted, defaults to
        ``['sex == Female', 'age_start > 5', 'age_end < 60']``.

    Returns
    -------
    pandas.DataFrame
        Data for every entry of the module-level ``locations``, concatenated and
        indexed by ``location`` followed by any named index levels of the loaded
        data. If every ``draw_<n>`` column for the module-level ``draws`` is
        present, only those columns are returned; otherwise the ``value``
        column is copied into one ``draw_<n>`` column per draw and dropped.

    Raises
    ------
    KeyError
        If the loaded data has neither all required draw columns nor a
        ``value`` column.
    """
    # Build the default inside the function so no list is shared across calls.
    if filter_terms is None:
        filter_terms = ['sex == Female' , 'age_start > 5', 'age_end < 60']

    all_locations_data = []
    for location in locations:
        art = Artifact(artifact_paths[location], filter_terms=filter_terms)
        location_data = art.load(key)
        if not isinstance(location_data, pd.DataFrame):
            # Anything that is not a DataFrame becomes a one-row frame with a 'value' column.
            location_data = pd.DataFrame({'value': location_data, 'location': location}, index=[0]).set_index('location')
        else:
            index_levels = [c for c in location_data.index.names if c is not None]
            # reset_index() returns a new frame, so the object returned by the
            # artifact is not modified when the location column is added.
            location_data = location_data.reset_index()
            location_data['location'] = location
            location_data = location_data.set_index(['location'] + index_levels)
        all_locations_data.append(location_data)

    all_locations_data = pd.concat(all_locations_data)

    # Check for every required draw column rather than inspecting only the first column.
    draw_columns = [f'draw_{draw}' for draw in draws]
    if all(column in all_locations_data.columns for column in draw_columns):
        return all_locations_data[draw_columns]

    if 'value' not in all_locations_data.columns:
        raise KeyError(
            f"Data for '{key}' has neither all draw columns nor a 'value' column; "
            f"found columns {list(all_locations_data.columns)}"
        )
    # No draw-level data: repeat the single value for every draw.
    for column in draw_columns:
        all_locations_data[column] = all_locations_data['value']
    return all_locations_data.drop(columns='value')

In [None]:
# TODO: Rename this variable to make clearer that it is Ethiopia only
art = Artifact(artifact_paths['Ethiopia'])
keys = art.keys

In [None]:
from vivarium_gates_mncnh.data.utilities import get_facility_choice_validation_targets

targets = get_facility_choice_validation_targets()
targets.head()

In [None]:
def read_results(result_file_name, baseline_only=True):
    """Read one results parquet file per location and stack them into a single frame.

    For each entry of the module-level ``results_dirs``, reads
    ``<result_file_name>.parquet``, drops the ``measure``, ``entity``,
    ``sub_entity`` and ``entity_type`` columns, and adds a ``location`` column.
    A ``scenario`` column of ``'baseline'`` is added when the file has none.

    Parameters
    ----------
    result_file_name
        File name (without the ``.parquet`` extension) inside each results directory.
    baseline_only
        If true, keep only rows whose ``scenario`` is ``'baseline'``.

    Returns
    -------
    pandas.DataFrame
        All locations concatenated with a fresh integer index. When a
        ``random_seed`` column is present, values are summed over it and the
        column is removed.
    """
    metadata_columns = ['measure','entity','sub_entity','entity_type']
    per_location = []
    for location, result_dir in results_dirs.items():
        results = pd.read_parquet(result_dir / f'{result_file_name}.parquet')
        results = results.drop(columns=metadata_columns)
        results['location'] = location
        if 'scenario' not in results.columns:
            results['scenario'] = 'baseline'
        if baseline_only:
            is_baseline = results.scenario == 'baseline'
            results = results.loc[is_baseline]

        # Aggregate over random seeds (if necessary)
        # TODO: Could use vivarium_helpers marginalize method here
        if 'random_seed' in results.columns:
            strata = [c for c in results.columns if c != 'random_seed' and c != 'value']
            summed = results.groupby(strata).sum()
            results = summed.reset_index().drop(columns='random_seed')

        per_location.append(results)
    return pd.concat(per_location, ignore_index=True)

In [None]:
# TODO: Clarify this observer name -- why is it called this? What does it match in the docs?
# TODO: Give this a more descriptive name than `data`
data = read_results('anc_other', baseline_only=True)
draws = data.input_draw.unique()
data

In [None]:
# NOTE: It looks like the only thing we're missing from the previous observer is child sex,
# which is why we need this one. We should deduplicate.
births_df = read_results('births', baseline_only=True)
births_df

In [None]:
# TODO: Give this a more descriptive name than `df`
# Clean dataframe: keep only live births and stillbirths, then drop the
# (empty) 'none' delivery facility rows.
df = data.loc[data.pregnancy_outcome.isin(['live_birth', 'stillbirth'])] # eliminate "other" birth outcomes
assert df.loc[df.delivery_facility_type == 'none']['value'].sum() == 0, "Unexpected presence of delivery facility == 'none'"
# Take an explicit copy after filtering so the column assignments below act on an
# independent frame and do not raise SettingWithCopyWarning.
df = df.loc[df.delivery_facility_type != 'none'].copy() # get rid of these rows because they are not relevant
# add in columns for ANC1 and IFD
df['anc1'] = df.anc_coverage != 'none'
df['ifd'] = df.delivery_facility_type != 'home'
df

## Univariate checks


**Baseline in-facility delivery proportion compared to GBD covariate ID 51**

In [None]:
# NOTE: I think there is a vivarium_helpers method for this
baseline_ifd = (df.groupby(['input_draw','location','ifd'])['value'].sum()
                / df.groupby(['input_draw','location'])['value'].sum())
baseline_ifd = baseline_ifd.groupby(['location','ifd']).describe(percentiles=[0.025,0.975]).reset_index()
baseline_ifd = baseline_ifd.loc[baseline_ifd.ifd]
baseline_ifd

In [None]:
def return_target_data(value):
    """Return the rows of the module-level ``targets`` for ``value`` in long format.

    Keeps the rows whose ``probability_of`` equals ``value``, then unstacks the
    remaining columns (presumably one per location -- confirm against
    ``get_facility_choice_validation_targets``) into rows. The result has the
    columns ``location``, ``probability_of`` and ``target_value``.
    """
    matching_rows = targets.loc[targets.probability_of == value]
    wide = matching_rows.set_index('probability_of').rename_axis('location', axis=1)
    long = wide.unstack().rename('target_value')
    return long.reset_index()

target_ifd = return_target_data('in_facility')
target_ifd

In [None]:
ifd_plot_data = (target_ifd
                 .merge(baseline_ifd, on='location'))
ifd_plot_data


In [None]:
def plot_proportion_by_location(data):
    """Draw side-by-side target and simulation bars for each location.

    Parameters
    ----------
    data
        DataFrame with the columns ``location``, ``target_value`` and
        ``probability_of`` (its first value labels the y-axis and title), plus
        the simulated value in ``mean`` (or ``value`` when ``mean`` is absent).
        When both ``2.5%`` and ``97.5%`` are present they are drawn as error
        bars on the simulation bars.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError('No rows to plot; check that the target and simulation data share locations')

    # Copy so that filling in 'mean' never modifies the caller's frame.
    data = data.copy()
    if 'mean' not in data.columns:
        data['mean'] = data['value']

    fig, ax = plt.subplots(figsize=(8, 5))

    # Bar positions
    x = np.arange(len(data['location']))
    width = 0.35

    # Plot bars
    ax.bar(x - width/2, data['target_value'], width, label='Target')
    ax.bar(x + width/2, data['mean'], width, label='Simulation')

    # X-axis labels and ticks
    quantity = data.probability_of.values[0]
    ax.set_xticks(x)
    ax.set_xticklabels(data['location'])
    ax.set_ylabel(quantity)
    ax.set_title(f'Target vs Simulation {quantity} by Location')
    ax.legend()

    # Error bars need both percentile columns; skip them when only a point value is given.
    if '2.5%' in data.columns and '97.5%' in data.columns:
        # errorbar() rejects negative lengths, so clip in case the mean falls
        # outside the percentile interval.
        lower = (data['mean'] - data['2.5%']).clip(lower=0)
        upper = (data['97.5%'] - data['mean']).clip(lower=0)
        ax.errorbar(x + width/2, data['mean'],
                    yerr=[lower, upper],
                    fmt='none', ecolor='gray', capsize=5)
    ax.grid()
    plt.tight_layout()
    plt.show()

In [None]:
plot_proportion_by_location(ifd_plot_data)

# looks good, maybe a tiny bit high

In [None]:
# check the BEMONC/CEMONC split
baseline_bemonc_frac = (df.loc[df.delivery_facility_type == 'BEmONC'].groupby(['input_draw','location'])['value'].sum()
                        / df.loc[df.ifd].groupby(['input_draw','location'])['value'].sum())
baseline_bemonc_frac = baseline_bemonc_frac.groupby('location').mean().sort_index()
baseline_bemonc_frac

In [None]:
# List the artifact keys related to facility choice, using the `keys` list read from `art`
[x for x in keys if 'facility_choice' in x]

In [None]:
target_bemonc_frac = read_artifact('cause.facility_choice.bemonc_facility_fraction').mean(axis=1).sort_index()
# note no draw-level variation

In [None]:
# TODO: refactor for duplication with the above
fig, ax = plt.subplots(figsize=(8, 5))

# Bar positions
x = np.arange(len(baseline_bemonc_frac))
width = 0.35

# Plot bars
ax.bar(x - width/2, target_bemonc_frac, width, label='Target')
ax.bar(x + width/2, baseline_bemonc_frac, width, label='Simulation')

# X-axis labels and ticks
ax.set_xticks(x)
ax.set_xticklabels(baseline_bemonc_frac.index)
ax.set_ylabel('BEmONC as a fraction of in-facility delivery')
ax.set_title(f'Target vs Simulation BEmONC fraction of In-Facility Deliveries by Location')
ax.legend()

plt.grid()
plt.tight_layout()
plt.show()

# BEmONC/CEmONC split still looking good

In [None]:
for delivery_facility in ['home','bemonc','cemonc']:
    sim = df.loc[df.delivery_facility_type.str.lower() == delivery_facility].groupby(['input_draw','location'])['value'].sum()
    sim = (sim / df.groupby(['input_draw','location'])['value'].sum()).groupby(['location']).describe(percentiles=[0.025,0.975]).reset_index()
    key = f'cause.facility_choice.probability_{delivery_facility}_birth'
    artifact_delivery_facility = read_artifact(key=key).mean(axis=1).rename('target_value').reset_index()
    plot_data = artifact_delivery_facility.merge(sim, on='location')
    plot_data['probability_of'] = f'Proportion of births at {delivery_facility}'
    plot_proportion_by_location(plot_data)

# The "home" plot is just the inverse of the IFD plot above, so it looks good too
# BEmONC and CEmONC look good
# TODO: Reduce duplication here

**Baseline ANC1 attendance vs GBD covariate ID 7**

In [None]:
baseline_anc = (df.groupby(['input_draw','location','anc1'])['value'].sum()
                / df.groupby(['input_draw','location'])['value'].sum())
baseline_anc = baseline_anc.groupby(['location','anc1']).describe(percentiles=[0.025,0.975]).reset_index()
baseline_anc = baseline_anc.loc[baseline_anc.anc1]
baseline_anc

In [None]:
target_anc = return_target_data('anc1')
target_anc

In [None]:
anc_plot_data = (target_anc
                 .merge(baseline_anc, on='location'))
anc_plot_data


In [None]:
plot_proportion_by_location(anc_plot_data)

# Looks good

In [None]:
art.load('covariate.antenatal_care_first_trimester_visit_coverage_proportion.estimate').apply(pd.DataFrame.describe, percentiles=[0.025,0.975], axis=1)
# so we have draw-level data for ANC attendance... could this be throwing us off?
# we also have draw-level data for LBWSG exposure/preterm prevalence obviously
# TODO: I don't understand what the above comment means, and we don't appear to be off to me

**Baseline ultrasound coverage at ANC**

In [None]:
baseline_us = (df.groupby(['input_draw','location','anc1','ultrasound_type'])['value'].sum()
                / df.groupby(['input_draw','location','anc1'])['value'].sum()).reset_index()
assert baseline_us.loc[(baseline_us.anc1 == False) & (baseline_us.ultrasound_type != 'no_ultrasound')]['value'].sum() == 0, "Unexpected ultrasound when no ANC"
assert baseline_us.loc[baseline_us.ultrasound_type == 'AI_assisted']['value'].sum() == 0, "Unexpected presence of AI-assisted ultrasound at baseline"
baseline_us = baseline_us.groupby(['location','anc1','ultrasound_type'])['value'].describe(percentiles=[0.025,0.975]).reset_index()
baseline_us


In [None]:
us_target = return_target_data('standard_ultrasound_given_anc1')
us_target

In [None]:
us_plot_data = us_target.merge(baseline_us.loc[baseline_us.anc1 & (baseline_us.ultrasound_type == 'standard')], on='location')
plot_proportion_by_location(us_plot_data)

# Looks good

**Baseline prevalence of preterm birth**

In [None]:
baseline_preterm = (df.groupby(['input_draw','location','preterm_birth'])['value'].sum()
                / df.groupby(['input_draw','location'])['value'].sum())
baseline_preterm = baseline_preterm.groupby(['location','preterm_birth']).describe(percentiles=[0.025,0.975]).reset_index()
baseline_preterm = baseline_preterm.loc[baseline_preterm.preterm_birth]
baseline_preterm

# This shows very little variation across draws, which is surprising to me
# TODO: understand this

In [None]:
preterm_target = return_target_data('preterm')
preterm_target

In [None]:
# List the artifact keys related to preterm birth, using the `keys` list read from `art`
[x for x in keys if 'preterm' in x]

In [None]:
art.load('cause.neonatal_preterm_birth.prevalence')[[f'draw_{draw}' for draw in draws]].mean(axis=1).groupby('child_age_start').mean()

# This is very similar to the target value, but not identical -- PAF sim stochastic uncertainty?

In [None]:
preterm_plot_data = preterm_target.merge(baseline_preterm, on=['location'])
preterm_plot_data

In [None]:
plot_proportion_by_location(preterm_plot_data)

# Looks good, or at least very close

**Sex ratio of live births (proportion male) in simulation vs. GBD covariate ID 1106**

In [None]:
baseline_sex_ratio = (births_df.loc[births_df.pregnancy_outcome=='live_birth']
                      .groupby(['input_draw','location','child_sex'])['value'].sum()
                      / births_df.loc[births_df.pregnancy_outcome=='live_birth']
                      .groupby(['input_draw','location'])['value'].sum())
baseline_sex_ratio = baseline_sex_ratio.groupby(['location','child_sex']).describe(percentiles=[0.025,0.975]).reset_index()
baseline_sex_ratio

In [None]:
target_sex_ratio = return_target_data('male')

In [None]:
sex_ratio_plot_data = target_sex_ratio.merge(baseline_sex_ratio.loc[baseline_sex_ratio.child_sex=='Male'], on=['location'])
sex_ratio_plot_data

In [None]:
plot_proportion_by_location(sex_ratio_plot_data)

# Looks good

## Bivariate checks

**Check 1:** Validate rates of preterm birth given in-facility status against optimization targets (calculated in facility choice validation targets notebook: https://github.com/ihmeuw/vivarium_research_mncnh_portfolio/blob/main/facility_choice/facility_choice_validation_targets.ipynb)

In [None]:
target_preterm_given_home = return_target_data('preterm_given_at_home')
target_preterm_given_facility = return_target_data('preterm_given_in_facility')

target_preterm_given_home

In [None]:
# TODO: extract the logic here with `cols` into a function
cols = ['input_draw','location','ifd']
delivery_facility_by_preterm = (df.groupby(cols + ['preterm_birth'])['value'].sum() 
                            / df.groupby(cols)['value'].sum()).reset_index()
delivery_facility_by_preterm = delivery_facility_by_preterm.loc[delivery_facility_by_preterm.preterm_birth]
delivery_facility_by_preterm = delivery_facility_by_preterm.groupby([x for x in cols if x != 'input_draw'])['value'].describe(percentiles=[0.025,0.975]).reset_index()
delivery_facility_by_preterm

In [None]:
plot_proportion_by_location(target_preterm_given_home
                            .merge(delivery_facility_by_preterm.loc[delivery_facility_by_preterm.ifd==False], on='location'))

# Looks pretty close, maybe a bit low in Ethiopia and high in Nigeria?

In [None]:
plot_proportion_by_location(target_preterm_given_facility
                            .merge(delivery_facility_by_preterm.loc[delivery_facility_by_preterm.ifd==True], on='location'))

# Looks good, maybe a bit high

**Check 2:** Validate rates of in-facility delivery given ANC status against optimization targets 

In [None]:
cols = ['input_draw','location','anc1']
delivery_facility_by_anc = (df.groupby(cols + ['ifd'])['value'].sum() 
                            / df.groupby(cols)['value'].sum()).reset_index()
delivery_facility_by_anc = delivery_facility_by_anc.loc[delivery_facility_by_anc.ifd].drop(columns='ifd')
delivery_facility_by_anc = delivery_facility_by_anc.groupby([x for x in cols if x != 'input_draw'])['value'].describe(percentiles=[0.025,0.975]).reset_index()
delivery_facility_by_anc

In [None]:
targets.loc[targets.probability_of.isin(['anc1','in_facility'])]

In [None]:
plot = return_target_data('in_facility_given_anc0').merge(delivery_facility_by_anc.loc[delivery_facility_by_anc.anc1==False], on='location')
plot_proportion_by_location(plot)

# Looks good, maybe a bit high

In [None]:
plot = return_target_data('in_facility_given_anc1').merge(delivery_facility_by_anc.loc[delivery_facility_by_anc.anc1], on='location')
plot_proportion_by_location(plot)

# Looks good

**Check 3:** Validate observed probabilities of IFD given believed preterm status against observed probabilities in facility choice nanosim (https://github.com/ihmeuw/vivarium_research_mncnh_portfolio/blob/main/facility_choice/facility_choice_validation_targets.ipynb)

In [None]:
cols = ['input_draw','location','believed_preterm']
delivery_facility_by_believed_preterm = (df.groupby(cols + ['ifd'])['value'].sum() 
                            / df.groupby(cols)['value'].sum()).reset_index()
delivery_facility_by_believed_preterm = delivery_facility_by_believed_preterm.loc[delivery_facility_by_believed_preterm.ifd == False]
delivery_facility_by_believed_preterm = delivery_facility_by_believed_preterm.groupby([x for x in cols if x != 'input_draw'])['value'].describe(percentiles=[0.025,0.975]).reset_index()
delivery_facility_by_believed_preterm

In [None]:
p_home_believed_preterm = return_target_data('at_home_given_believed_preterm')
plot = p_home_believed_preterm.merge(delivery_facility_by_believed_preterm.loc[delivery_facility_by_believed_preterm.believed_preterm==True], 
                                     on=['location'])
plot['probability_of'] = 'Home delivery rate among believed preterm babies'
plot_proportion_by_location(plot)

# Looks good

In [None]:
p_home_believed_term = return_target_data('at_home_given_believed_term')
# The describe() output already has a 'mean' column (and no 'value' column),
# so the rows can be merged as-is, matching the believed-preterm cell.
believed_term_rows = delivery_facility_by_believed_preterm.loc[delivery_facility_by_believed_preterm.believed_preterm==False]
plot = p_home_believed_term.merge(believed_term_rows, on=['location'])
plot['probability_of'] = 'Home delivery rate among believed full term babies'
plot_proportion_by_location(plot)

# Looks good

**Check 4:** Validate confusion matrix of preterm status vs. believed preterm status against observed probabilities in facility choice nanosim (https://github.com/ihmeuw/vivarium_research_mncnh_portfolio/blob/main/facility_choice/facility_choice_validation_targets.ipynb)

In [None]:
confusion = (df.groupby(['location','input_draw','preterm_birth','believed_preterm'])['value'].sum()
             / df.groupby(['location','input_draw','preterm_birth'])['value'].sum())
confusion = confusion.groupby(['location','preterm_birth','believed_preterm']).describe(percentiles=[0.025,0.975]).reset_index()
confusion

In [None]:
# Plot every (true status, believed status) cell of the confusion matrix against its target.
status_labels = {True: 'preterm', False: 'term'}
for preterm_birth, true_val in status_labels.items():
    for believed_preterm_birth, believed_val in status_labels.items():
        target_title = f'believed_{believed_val}_given_{true_val}'
        target_data = return_target_data(target_title)
        cell_rows = confusion.loc[(confusion.preterm_birth == preterm_birth)
                                  & (confusion.believed_preterm == believed_preterm_birth)]
        plot_data = target_data.merge(cell_rows, on='location')
        plot_proportion_by_location(plot_data)

# Looks good

**Check 5:** Validate P( believed preterm | preterm status, ultrasound type ) against observed probabilities in facility choice nanosim (https://github.com/ihmeuw/vivarium_research_mncnh_portfolio/blob/main/facility_choice/facility_choice_validation_targets.ipynb)

In [None]:
check5 = (df.groupby(['location','input_draw','ultrasound_type','preterm_birth','believed_preterm'])['value'].sum()
             / df.groupby(['location','input_draw','ultrasound_type','preterm_birth'])['value'].sum())
check5 = check5.groupby(['location','ultrasound_type','preterm_birth','believed_preterm']).describe(percentiles=[0.025,0.975]).reset_index()
check5['ultrasound_type'] = np.where(check5.ultrasound_type == 'standard', 'standard_ultrasound', check5.ultrasound_type)
check5

In [None]:
# Plot every (true status, believed status, ultrasound type) combination against its target.
status_labels = {True: 'preterm', False: 'term'}
for preterm_birth, true_val in status_labels.items():
    for believed_preterm_birth, believed_val in status_labels.items():
        for ultrasound_type in ['no_ultrasound', 'standard_ultrasound']:
            target_title = f'believed_{believed_val}_given_{true_val}_and_{ultrasound_type}'
            target_data = return_target_data(target_title)
            matching_rows = check5.loc[(check5.preterm_birth == preterm_birth)
                                       & (check5.believed_preterm == believed_preterm_birth)
                                       & (check5.ultrasound_type == ultrasound_type)]
            plot_data = target_data.merge(matching_rows, on='location')
            plot_proportion_by_location(plot_data)