In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pymysql 
import random
import seaborn as sns
import sys
from scipy.special import logit, expit
from scipy.stats import norm
from statsmodels.stats.proportion import proportion_confint

from db_queries import get_age_metadata, get_cod_data, get_covariate_estimates, get_demographics, get_location_metadata
from elmo import get_crosswalk_version


# ---------------------------------------------------------------------------
# Run parameters -- edit these for each run
# ---------------------------------------------------------------------------
RELEASE = 16         # GBD release id to run against
LEVEL_3 = 'ints'     # Which cause to process: 'ints' or 'intest'
N_DRAWS = 100        # Number of draws used to propagate uncertainty to each data point
                     # (draws are not exported, so this is independent of GBD)
YEAR_BIN_WIDTH = 10  # Number of years of deaths collapsed together to reduce noise and output size


# ---------------------------------------------------------------------------
# Fixed inputs -- only change these if other pieces of the pipeline change
# ---------------------------------------------------------------------------
INPUT_DIR = 'FILEPATH'
HIV_RR_FILE = os.path.join(INPUT_DIR, 'ints_hiv_rr_estimates.csv')
CFR_FILE = os.path.join(INPUT_DIR, 'cfr_estimates.csv')

DEMOG_VARS = ['location_id', 'year_id', 'age_group_id', 'sex_id']

# Bundle id and cause id(s) for each supported value of LEVEL_3
_LEVEL_3_SETTINGS = {
    'ints':   (3023, 959),
    'intest': (555, [319, 320]),
}

if LEVEL_3 not in _LEVEL_3_SETTINGS:
    sys.exit('Value of LEVEL_3 must be either "ints" or "intest"')

BUNDLE, CID = _LEVEL_3_SETTINGS[LEVEL_3]

In [32]:
# Demographic levels for this release; build the full run of consecutive years
# spanning the first through last estimation year
demog = get_demographics('epi', release_id = RELEASE)
first_year = np.min(demog['year_id'])
last_year = np.max(demog['year_id'])
gbd_years = list(range(first_year, last_year + 1))

# Age-group metadata, reduced to the id plus start/end ages under shorter names
age_meta = get_age_metadata(release_id = RELEASE)
age_meta = age_meta[['age_group_id', 'age_group_years_start', 'age_group_years_end']]
age_meta = age_meta.rename(columns = {'age_group_years_start': 'age_start',
                                      'age_group_years_end': 'age_end'})

In [33]:
# Restrict to 'data-rich' countries: the locations whose parent_id is 44640 in location_set 43
dr_loc_meta = get_location_metadata(43, release_id = RELEASE)
is_data_rich = dr_loc_meta.parent_id == 44640
dr_locs = dr_loc_meta.loc[is_data_rich, 'location_id'].tolist()

# Names, region info, etc. come from location_set 35
# (location_set 43 does not carry useful values in these columns, hence the separate pull)
LOC_META_COLS = ['location_id', 'location_name', 'is_estimate', 'region_id', 'region_name',
                 'super_region_name', 'super_region_id', 'ihme_loc_id', 'level']
loc_meta = get_location_metadata(35, release_id = RELEASE)[LOC_META_COLS]

In [34]:
# Mortality data from the CoD database, data-rich locations only
cod_data = get_cod_data(cause_id = CID, release_id = RELEASE, location_id = dr_locs, year_id = gbd_years)

# Row filters: vital-registration data (data_type 9), sample size of at least 10,
# and no aggregate age groups (22 and 27)
is_vital_registration = cod_data['data_type_id'] == 9
has_min_sample = cod_data['sample_size'] >= 10
is_aggregate_age = cod_data['age_group_id'].isin([22, 27])
cod_data = cod_data[is_vital_registration & has_min_sample & ~is_aggregate_age]

# Column filter: demographics plus the measures used downstream
keep_cols = DEMOG_VARS + ['pop', 'env', 'cf', 'sample_size', 'source_id', 'nid', 'cause_id']
cod_data = cod_data[keep_cols]

In [35]:
# Locations are only kept if they reported cause-specific deaths for at least 5 years.
# Step one: count the distinct years of data for each location/cause pair.
loc_cause_years = cod_data[['location_id', 'year_id', 'cause_id']].drop_duplicates()
n_years_by_loc = (loc_cause_years[['location_id', 'cause_id']]
                  .value_counts()
                  .rename('n_years')
                  .reset_index())


In [36]:
# Merge the number of years of data by location with cod_data, and drop locations with < 5 years
cod_data = cod_data.merge(n_years_by_loc, on = ['location_id', 'cause_id'], how = 'left')
# .copy() so the column assignments below act on an independent frame rather than a filtered slice
cod_data = cod_data[(cod_data['n_years'] >= 5)].copy()

# Calculate the number of deaths as the product of the cause-fraction and total mortality envelope
cod_data['deaths'] = cod_data['cf'] * cod_data['env']

# Round year to the nearest multiple of YEAR_BIN_WIDTH; rows are collapsed by this rounded year later.
# Round half up explicitly: Series.round() rounds halves to the nearest even value, which for an even
# bin width (e.g. 10) sends 1995 and 2005 to the same bin and makes bins alternate between 11 and 9 years.
# For widths where no year falls exactly halfway (e.g. 5) this gives the same result as plain rounding.
cod_data['year_bin'] = (np.floor(cod_data['year_id'] / YEAR_BIN_WIDTH + 0.5) * YEAR_BIN_WIDTH).astype(int)

In [37]:
# Aggregate like data points within each year bin.
# Annual resolution is not needed, and aggregating shrinks the file, speeds up model
# runtimes, and reduces noise by increasing counts.
def _mean_year(years):
    """Return the mean of the years in a group, truncated to an integer."""
    return years.mean().astype(int)


collapse_keys = ['location_id', 'year_bin', 'age_group_id', 'sex_id', 'source_id', 'cause_id']

cod_data = (
    cod_data
    .groupby(collapse_keys)
    .agg(pop = ('pop', 'sum'),
         env = ('env', 'sum'),
         sample_size = ('sample_size', 'sum'),
         deaths = ('deaths', 'sum'),
         year_start = ('year_id', 'min'),
         year_end = ('year_id', 'max'),
         year_id = ('year_id', _mean_year))
    .reset_index()
)


In [39]:
# Uncertainty of the cause fraction.
# A random binomial would be the obvious way to draw the cause fraction, but deaths are fractional
# (due to redistribution), so that does not work.  Instead the proportion is treated as logit-normal
# and the SE of logit-CF is estimated from the CI of the proportion.  Wilson binomial CIs are
# symmetric about the point in logit space, which fits the logit-normal assumption.
# This is calculated here with the rest of the cod data management and used further down.
cod_data['cause_fraction'] = cod_data['deaths'] / cod_data['env']

wilson_lower, wilson_upper = proportion_confint(cod_data['cause_fraction'] * cod_data['sample_size'],
                                                cod_data['sample_size'],
                                                method = 'wilson')
cod_data['cause_fraction_lower'] = wilson_lower
cod_data['cause_fraction_upper'] = wilson_upper

# SE in logit space: the wider of the two half-intervals, divided by the 97.5th normal percentile
logit_cf = logit(cod_data['cause_fraction'])
upper_half_width = logit(cod_data['cause_fraction_upper']) - logit_cf
lower_half_width = logit_cf - logit(cod_data['cause_fraction_lower'])
cod_data['cause_fraction_logit_se'] = np.maximum(upper_half_width, lower_half_width)
cod_data['cause_fraction_logit_se'] = cod_data['cause_fraction_logit_se'] / norm.ppf(0.975)


# Reshape wide on cause_id: estimation is done row-wise, so typhoid and paratyphoid
# need to sit on the same row
pivot_index = ['location_id', 'age_group_id', 'sex_id', 'year_start', 'year_end', 'year_id', 'source_id']
pivot_values = ['deaths', 'env', 'pop', 'cause_fraction', 'cause_fraction_lower',
                'cause_fraction_upper', 'cause_fraction_logit_se']
cod_data = cod_data.pivot(index = pivot_index, columns = 'cause_id', values = pivot_values)
cod_data = cod_data.reset_index()

# Flatten the two-level column labels into '<value>_<cause_id>' (index columns keep their plain name)
cod_data.columns = [' '.join([str(level) for level in label]).strip().replace(" ", "_")
                    for label in cod_data.columns]


# Attach location metadata, then age metadata
cod_data = cod_data.merge(loc_meta, on = 'location_id', how = 'left')
cod_data = cod_data.merge(age_meta, on = 'age_group_id', how = 'left')


# Unique values of each demographic variable present in the cod data;
# used to restrict the dimensions of other database pulls
cd_dims = {dim: cod_data[dim].drop_duplicates().tolist() for dim in DEMOG_VARS}

In [None]:
if LEVEL_3 == 'intest':
    # ------------------------------------------------------------------
    # Map every location with CoD data to a country id
    # ------------------------------------------------------------------
    loc_meta = get_location_metadata(35, release_id = RELEASE)
    # .copy() so the country_id assignment below acts on an independent frame, not a filtered slice
    loc_meta = loc_meta[loc_meta['location_id'].isin(cod_data['location_id'].unique())].copy()
    # country_id is the fourth element of the comma-separated path_to_top_parent
    loc_meta['country_id'] =  loc_meta['path_to_top_parent'].str.split(',').str.get(3).astype(int)
    loc_meta = loc_meta[['location_id', 'country_id']]

    # ------------------------------------------------------------------
    # Pull the income category for each country from the shared database
    # ------------------------------------------------------------------
    # The ids are passed as query parameters rather than interpolated into the SQL text:
    # interpolating a Python tuple yields invalid SQL for a single id ("(5,)") and for
    # numpy integer reprs.  They are cast to plain ints so the driver can escape them.
    country_ids = [int(country_id) for country_id in loc_meta['country_id'].unique()]
    id_placeholders = ', '.join(['%s'] * len(country_ids))
    income_query = ('SELECT location_id, location_metadata_value AS income, location_metadata_version_id '
                    'FROM location_metadata_history '
                    f'WHERE location_metadata_type_id = 12 AND location_id IN ({id_placeholders})')

    # Connect to the shared database, open the cursor, and execute the SQL query
    db = pymysql.connect(host = "ADDRESS", user = "USERNAME", password = "PASSWORD", database = "DATABASE") 

    # Execute the SQL query and fetch all matching data
    with db:
        with db.cursor() as cursor:
            cursor.execute(income_query, country_ids)
            covar_data = pd.DataFrame(cursor.fetchall(), columns = ['country_id', 'income', 'version'])

    # The query returns multiple versions -- keep only the row with the highest version per country
    # (.copy() so the edits to the income column below act on an independent frame)
    income = covar_data.loc[covar_data.groupby('country_id')['version'].idxmax()].copy()

    # Need to replace all spaces in income category with underscores (and drop commas) to match file names
    income['income'] = income['income'].replace(" ", "_", regex = True).replace(",", "", regex = True)
    income = income.drop(columns = 'version')

    # Merge income and location metadata to get income for each GBD location with CoD data
    income = loc_meta.merge(income, on = 'country_id', how = 'left')
    # Hard-coded income categories for specific location ids
    income.loc[income['location_id'].isin([8, 369, 374, 413]), 'income'] = "Upper_middle_income"
    income.loc[income['location_id'].isin([320]), 'income'] = "High_income_nonOECD"

    # ------------------------------------------------------------------
    # Read the case-fatality draws for every income level present
    # ------------------------------------------------------------------
    cf_draws = []
    for income_level in income['income'].unique():
        tmp = pd.read_csv(os.path.join(INPUT_DIR, f'cfDrawsByIncomeAndAge_{income_level}.csv'))
        tmp['income'] = income_level
        cf_draws.append(tmp)

    cf_draws = pd.concat(cf_draws, ignore_index = True)

    # Translate cause names to cause ids (paratyphoid = 320, typhoid = 319)
    # NOTE(review): any other cause label maps to NaN before the integer cast -- confirm the
    # input files only ever contain these two labels.
    cf_draws['cause_id'] = np.where(cf_draws['cause'] == 'paratyphoid', 320,
                               np.where(cf_draws['cause'] == 'typhoid', 319, np.nan)).astype(int)

    cf_draws = income.merge(cf_draws, on = 'income', how = 'outer')
    cf_draws = cf_draws.drop(columns = ['cause', 'country_id', 'income'])
 
    # Sample N_DRAWS from draws in input data (draw number is the part of 'draw' after the underscore)
    # NOTE(review): the sample is unseeded, so the selected draws differ between runs.
    cf_draws['draw_num'] = cf_draws['draw'].str.split('_').str.get(1).astype(int)
    keep_draws = random.sample(range(cf_draws['draw_num'].max() + 1), N_DRAWS)
    cf_draws = cf_draws.loc[cf_draws['draw_num'].isin(keep_draws)]

    # Summarise the sampled draws: mean and standard deviation of case fatality per
    # location / age group / cause, then reshape wide on cause_id
    cfr = cf_draws.groupby(['location_id', 'age_group_id', 'cause_id']).agg({'case_fatality': ['mean', 'std']}).reset_index()
    cfr.columns = ['location_id', 'age_group_id', 'cause_id', 'cfr_mean', 'cfr_se']
    cfr = cfr.pivot(index = ['location_id', 'age_group_id'], columns = 'cause_id',
                    values = ['cfr_mean', 'cfr_se']).reset_index()

    # Flatten the two-level column labels into '<value>_<cause_id>'
    cfr.columns = cfr.columns.map(lambda x: ' '.join([str(level) for level in x]).strip().replace(" ", "_"))

    cod_data = cod_data.merge(cfr, on = ['location_id', 'age_group_id'], how = 'left')

    # ------------------------------------------------------------------
    # Propagate uncertainty from deaths and case fatality to cases
    # ------------------------------------------------------------------
    # We have very small fractional counts (rather than integers), and frequent zeros.
    # This creates problems for common approaches to creating draws (e.g. random binomial functions require integers,
    # log- and logit-normal distributions break with zeros, etc.).
    # With that, we're going to use closed-form solutions to propagate uncertainty, instead of draws.  
    # This has the additional advantage of running far more quickly.  That said, the math makes for some
    # clunky code, and to deal with asymmetric UIs, we're treating the SE of upper bounds and lower bounds
    # as distinct.  The equations are:
    # se_product = np.sqrt((y_mean**2 * x_se**2) + (x_mean**2 * y_se**2))
    # se_quotient = np.sqrt((x_se / x_mean)**2 + (y_se / y_mean)**2) * abs(x_mean / y_mean)
    # se_sum = np.sqrt(x_se**2 + y_se**2)

    for cid in [319, 320]:
        # SE of deaths: envelope times the distance from the point cause fraction to each CI bound
        cod_data[f'deaths_lower_se_{cid}'] = cod_data[f'env_{cid}'] * (cod_data[f'cause_fraction_{cid}'] - cod_data[f'cause_fraction_lower_{cid}']) / norm.ppf(0.975)
        cod_data[f'deaths_upper_se_{cid}'] = cod_data[f'env_{cid}'] * (cod_data[f'cause_fraction_upper_{cid}'] - cod_data[f'cause_fraction_{cid}']) / norm.ppf(0.975)

        # Cases = deaths / case fatality; SEs follow the se_quotient formula above
        cod_data[f'cases_point_{cid}'] = cod_data[f'deaths_{cid}'] / cod_data[f'cfr_mean_{cid}']
        cod_data[f'cases_lower_se_{cid}'] = np.sqrt((cod_data[f'deaths_lower_se_{cid}'] / cod_data[f'deaths_{cid}'])**2 \
                                                    + (cod_data[f'cfr_se_{cid}'] / cod_data[f'cfr_mean_{cid}'])**2) \
                                                    * abs(cod_data[f'deaths_{cid}'] / cod_data[f'cfr_mean_{cid}']) 

        cod_data[f'cases_upper_se_{cid}'] = np.sqrt((cod_data[f'deaths_upper_se_{cid}'] / cod_data[f'deaths_{cid}'])**2 \
                                                  + (cod_data[f'cfr_se_{cid}'] / cod_data[f'cfr_mean_{cid}'])**2) \
                                                  * abs(cod_data[f'deaths_{cid}'] / cod_data[f'cfr_mean_{cid}']) 


    # Estimate total (typhoid + paratyphoid) cases and UIs
    cod_data['cases_point'] = cod_data['cases_point_319'] + cod_data['cases_point_320'] 

    cod_data['cases_lower_se'] = np.sqrt(cod_data['cases_lower_se_319']**2 + cod_data['cases_lower_se_320']**2)
    cod_data['cases_upper_se'] = np.sqrt(cod_data['cases_upper_se_319']**2 + cod_data['cases_upper_se_320']**2)
    
    # If either typhoid or paratyphoid are zero pull the UIs from the non-missing cause 
    # (ie. if there are no paratyphoid cases then total cases are just equal to typhoid) 
    cod_data.loc[cod_data['cases_point_319']==0, 'cases_lower_se'] = cod_data['cases_lower_se_320']
    cod_data.loc[cod_data['cases_point_320']==0, 'cases_lower_se'] = cod_data['cases_lower_se_319']
    cod_data.loc[cod_data['cases_point_319']==0, 'cases_upper_se'] = cod_data['cases_upper_se_320']
    cod_data.loc[cod_data['cases_point_320']==0, 'cases_upper_se'] = cod_data['cases_upper_se_319']

    cod_data['cases_lower'] = cod_data['cases_point'] - cod_data['cases_lower_se'] * norm.ppf(0.975)
    cod_data['cases_upper'] = cod_data['cases_point'] + cod_data['cases_upper_se'] * norm.ppf(0.975)
    
    # Keep the case bounds within [0, pop_319]
    cod_data.loc[cod_data['cases_lower']<0, 'cases_lower'] = 0
    cod_data.loc[cod_data['cases_upper'] > cod_data['pop_319'], 'cases_upper'] = cod_data['pop_319']
    
    # Incidence = cases / population, using the row mean of the two cause-specific pop columns
    cod_data['incidence_point'] = cod_data['cases_point'] / cod_data[['pop_319', 'pop_320']].mean(axis = 1)
    cod_data['incidence_lower'] = cod_data['cases_lower'] / cod_data[['pop_319', 'pop_320']].mean(axis = 1)
    cod_data.loc[cod_data['incidence_point']==0, 'incidence_lower'] = 0
    cod_data['incidence_upper'] = cod_data['cases_upper'] / cod_data[['pop_319', 'pop_320']].mean(axis = 1)
    cod_data.loc[cod_data['incidence_point']==1, 'incidence_upper'] = 1
    
    # Estimate proportion of all cases that are paratyphoid and UIs
    cod_data['pr_para_point'] = cod_data['cases_point_320'] / cod_data['cases_point']

    # Closed-form (se_quotient) SEs of the proportion; the bounds written to the bundle
    # below come from the Wilson interval instead
    cod_data['pr_para_lower_se'] = np.sqrt((cod_data['cases_lower_se_320'] / cod_data['cases_point_320'])**2 + \
                                           (cod_data['cases_lower_se'] / cod_data['cases_point'])**2) * \
                                            abs(cod_data['cases_point_320'] / cod_data['cases_point'])
    
    cod_data['pr_para_upper_se'] = np.sqrt((cod_data['cases_upper_se_320'] / cod_data['cases_point_320'])**2 + \
                                           (cod_data['cases_upper_se'] / cod_data['cases_point'])**2) * \
                                            abs(cod_data['cases_point_320'] / cod_data['cases_point'])

    pr_para_bounds = proportion_confint(cod_data['cases_point_320'], cod_data['cases_point'], method = 'wilson')
    cod_data['pr_para_lower'] = pr_para_bounds[0]
    cod_data['pr_para_upper'] = pr_para_bounds[1]

    cod_data.loc[cod_data['pr_para_lower'] < 0, 'pr_para_lower'] = 0
    cod_data.loc[cod_data['pr_para_upper'] > 1, 'pr_para_upper'] = 1 

    # ------------------------------------------------------------------
    # Reshape to long, clean up variable names and add a bundle_id column to prep for bundle upload
    # ------------------------------------------------------------------
    # NOTE(review): bundle ids 556 and 18 are hard-coded here and BUNDLE is not used in this
    # branch -- confirm that is intended.
    inc_tmp = cod_data[['location_id', 'level', 'age_start', 'age_end', 'sex_id', 'year_start', 'year_end', 'incidence_point', 'incidence_lower', 'incidence_upper', 'cases_point_319', 'cases_point_320']].copy()
    inc_tmp.columns = ['location_id', 'level', 'age_start', 'age_end', 'sex_id', 'year_start', 'year_end', 'mean', 'lower', 'upper', 'typhoid_cases', 'paratyphoid_cases']
    # Incidence rows need both cause-specific case counts to be present
    inc_tmp = inc_tmp.query('typhoid_cases.notnull() & paratyphoid_cases.notnull()').copy()
    inc_tmp['bundle_id'] = 556
    inc_tmp['measure'] = 'incidence'


    prp_tmp = cod_data[['location_id', 'level', 'age_start', 'age_end', 'sex_id', 'year_start', 'year_end', 'pr_para_point', 'pr_para_lower', 'pr_para_upper', 'cases_point_319', 'cases_point_320']].copy()
    prp_tmp.columns = ['location_id', 'level', 'age_start', 'age_end', 'sex_id', 'year_start', 'year_end', 'mean', 'lower', 'upper', 'typhoid_cases', 'paratyphoid_cases']
    # Proportion rows additionally need at least one non-zero case count
    prp_tmp = prp_tmp.query('(typhoid_cases > 0 | paratyphoid_cases > 0) & typhoid_cases.notnull() & paratyphoid_cases.notnull()').copy()
    prp_tmp['bundle_id'] = 18
    prp_tmp['measure'] = 'proportion'


    for_bundle = pd.concat([inc_tmp, prp_tmp])

    # A mean at the edge of [0, 1] pins the matching bound to the same value
    for_bundle.loc[for_bundle['mean']==0, 'lower'] = 0
    for_bundle.loc[for_bundle['mean']==1, 'upper'] = 1

    # NOTE(review): everything from here to the end of this branch repeats the opening of the
    # 'ints' cell (case-fatality merge and HIV prevalence pull) and runs after for_bundle has
    # already been built -- confirm whether it is needed for 'intest'.

    # Read in the case-fatality estimates & retain only rows for non-HIV attributable 
    # (since HIV-attributable would have been coded as HIV deaths)
    case_fatality = pd.read_csv(CFR_FILE).query('est_pr_hiv == 0').drop(columns = ['est_pr_hiv'])

    # Merge case-fatality estimates into cod data
    cod_data = cod_data.merge(case_fatality, on = ['location_id', 'year_id', 'age_group_id'], how = 'left')

    # Read in the estimates of HIV prevalence, keep needed columns and clean up variable names
    hiv_prev = get_covariate_estimates(covariate_id = 49, 
                                       location_id = cd_dims['location_id'],
                                       year_id = cd_dims['year_id'],
                                       age_group_id = cd_dims['age_group_id'],
                                       release_id = RELEASE)
    
    hiv_prev = hiv_prev[DEMOG_VARS + ['mean_value', 'lower_value', 'upper_value']]

    hiv_prev = hiv_prev.rename(columns = {'mean_value':'hiv_prev_mean', 
                                          'lower_value':'hiv_prev_lower', 
                                          'upper_value':'hiv_prev_upper'})

In [None]:
# Only the 'intest' cell above defines hiv_prev before this point.  The 'ints' cell below pulls
# and merges HIV prevalence itself, so running this merge in 'ints' mode would either fail on a
# fresh kernel (hiv_prev undefined) or duplicate the hiv_prev_* columns.
if LEVEL_3 == 'intest':
    cod_data = cod_data.merge(hiv_prev, on = DEMOG_VARS, how = 'left')


In [15]:

if LEVEL_3 == 'ints':
    # Read in the case-fatality estimates & retain only rows for non-HIV attributable 
    # (since HIV-attributable would have been coded as HIV deaths)
    case_fatality = pd.read_csv(CFR_FILE).query('est_pr_hiv == 0').drop(columns = ['est_pr_hiv'])

    # Merge case-fatality estimates into cod data
    cod_data = cod_data.merge(case_fatality, on = ['location_id', 'year_id', 'age_group_id'], how = 'left')

    # Read in the estimates of HIV prevalence, keep needed columns and clean up variable names
    hiv_prev = get_covariate_estimates(covariate_id = 49, 
                                       location_id = cd_dims['location_id'],
                                       year_id = cd_dims['year_id'],
                                       age_group_id = cd_dims['age_group_id'],
                                       release_id = RELEASE)
    
    hiv_prev = hiv_prev[DEMOG_VARS + ['mean_value', 'lower_value', 'upper_value']]

    hiv_prev = hiv_prev.rename(columns = {'mean_value':'hiv_prev_mean', 
                                          'lower_value':'hiv_prev_lower', 
                                          'upper_value':'hiv_prev_upper'})
    
    # Merge HIV prevalence estimates into cod_data data frame
    cod_data = cod_data.merge(hiv_prev, on = DEMOG_VARS, how = 'left')
    
    # Use method of moments to turn the mean and uncertainty of the HIV prevalence estimates into
    # distribution parameters alpha and beta.  The formulas below are the moment estimators of a
    # beta distribution (mean m, sd s: alpha = m * (m - m^2 - s^2) / s^2, beta = alpha * (1 - m) / m).
    # NOTE(review): the draws themselves are pulled in calc_ui, which is not shown in this cell --
    # confirm it draws from a beta distribution with these parameters.
    # sigma is the width of the 95% UI divided by 2 * 1.96
    cod_data['sigma'] = (cod_data['hiv_prev_upper'] - cod_data['hiv_prev_lower']) / ( 2 * norm.ppf(0.975))
    # A sigma of exactly zero is replaced with the smallest positive sigma among rows with non-zero prevalence
    cod_data.loc[cod_data['sigma']==0, 'sigma'] = cod_data.query('hiv_prev_mean != 0 & sigma>0')['sigma'].min()
    
    cod_data['alpha'] = cod_data['hiv_prev_mean'] * (cod_data['hiv_prev_mean'] - cod_data['hiv_prev_mean']**2 - cod_data['sigma']**2) / cod_data['sigma']**2
    # Negative alpha (sigma too wide for the mean) is floored at zero
    cod_data.loc[cod_data['alpha']<0, 'alpha'] = 0

    cod_data['beta'] = cod_data['alpha'] * (1 - cod_data['hiv_prev_mean']) / cod_data['hiv_prev_mean']
    
    # Read in the estimates of HIV-iNTS relative risks and merge into the cod_data
    hiv_rr = pd.read_csv(HIV_RR_FILE)[DEMOG_VARS + ['log_hiv_excess_risk', 'log_hiv_excess_risk_se']]
    cod_data = cod_data.merge(hiv_rr, on = DEMOG_VARS, how = 'left')
    
    # Calculate point estimate of cases (no draws, no uncertainty) 
    # NOTE(review): 'logit_pred' is presumably supplied by the case-fatality file -- confirm.
    # NOTE(review): 'deaths' and 'pop' are referenced without a cause-id suffix, whereas the
    # reshape to wide on cause_id earlier in the notebook names these columns
    # '<value>_<cause_id>' -- confirm these columns exist at this point.
    cod_data['cfr_point'] = expit(cod_data['logit_pred'])
    cod_data['hiv_paf_point'] = (cod_data['hiv_prev_mean'] * np.exp(cod_data['log_hiv_excess_risk'])) / (cod_data['hiv_prev_mean'] * np.exp(cod_data['log_hiv_excess_risk']) + 1)  
    cod_data['incidence_point'] = ((cod_data['deaths'] / expit(cod_data['logit_pred'])) / (1 - cod_data['hiv_paf_point'])) / cod_data['pop']
    
    # Apply calc_ui to all rows to estimate the mean and 95% UI bounds for the incidence estimate
    # NOTE(review): calc_ui is not defined in the cells shown here -- confirm it is defined
    # before this cell runs on a fresh kernel.
    incidence_ui = cod_data.apply(calc_ui, axis = 1)
    incidence_ui = pd.DataFrame(incidence_ui.tolist(), columns = ['incidence_mean', 'incidence_lower', 'incidence_upper'])

    # Concat the incidence UI columns onto the cod_data data frame
    # (both indexes are reset so the rows line up by position)
    cod_data = pd.concat([cod_data.reset_index(), incidence_ui.reset_index(drop = True)], axis = 1)
    
    # Given very small counts we frequently get wild draws that make for impossible UIs (e.g. upper < mean)
    # We'll fix this by extracting the width of the UI segment and applying it to the directly calculated
    # point estimate (rather than the mean of the draws) all in logit space
    cod_data['incidence_lower'] = expit(logit(cod_data['incidence_point']) - (logit(cod_data['incidence_mean']) - logit(cod_data['incidence_lower'])))
    cod_data['incidence_upper'] = expit(logit(cod_data['incidence_point']) + (logit(cod_data['incidence_upper']) - logit(cod_data['incidence_mean'])))
    
    # Build the bundle frame; .copy() so the column assignments below (and in the upload-prep cell)
    # act on an independent frame rather than a slice of cod_data
    for_bundle = cod_data[['location_id', 'level', 'age_start', 'age_end', 'sex_id', 'year_start', 'year_end', 'incidence_point', 'incidence_lower', 'incidence_upper']].copy()
    for_bundle.columns = ['location_id', 'level', 'age_start', 'age_end', 'sex_id', 'year_start', 'year_end', 'mean', 'lower', 'upper']
    for_bundle['bundle_id'] = BUNDLE
    for_bundle['measure'] = 'incidence'



In [17]:
# Add the descriptive columns needed for the bundle upload

# Sex label from sex_id (1 = Male, 2 = Female)
for sex_code, sex_label in [(1, 'Male'), (2, 'Female')]:
    for_bundle.loc[for_bundle['sex_id'] == sex_code, 'sex'] = sex_label

# Columns that take the same value on every row
for column, value in [('source_type', 'Vital registration - national'),
                      ('nid', 292827),
                      ('unit_type', 'Person'),
                      ('unit_value_as_published', 1),
                      ('measure_adjustment', 0),
                      ('uncertainty_type_value', 95),
                      ('urbanicity_type', 'Mixed/both')]:
    for_bundle[column] = value

# Representativeness depends on the location level: level 3 vs. anything deeper
is_level_3 = for_bundle['level'] == 3
is_below_level_3 = for_bundle['level'] > 3
for_bundle.loc[is_level_3, 'representative_name'] = 'Nationally representative only'
for_bundle.loc[is_below_level_3, 'representative_name'] = 'Representative for subnational location only'

# Remaining constant-valued columns
for column, value in [('recall_type', 'Point'),
                      ('extractor', 'stanaway'),
                      ('is_outlier', 0),
                      ('cv_diag_mixed', 0),
                      ('cv_passive', 0),
                      ('smaller_site_unit', 0),
                      ('sex_issue', 0),
                      ('year_issue', 0),
                      ('age_issue', 0),
                      ('age_demographer', 0),
                      ('measure_issue', 0),
                      ('response_rate', np.nan)]:
    for_bundle[column] = value


In [18]:
# Write the bundle-ready data to INPUT_DIR, without the row index
bundle_csv_path = os.path.join(INPUT_DIR, 'vr_data_for_dismod_bundle.csv')
for_bundle.to_csv(bundle_csv_path, index = False)