In [31]:
import numpy as np
import pandas as pd
pd.options.display.float_format = "{:,.2f}".format
from tabulate import tabulate

In [32]:

# Synthetic data released by Obermeyer et al.; download from:
# https://gitlab.com/labsysmed/dissecting-bias/-/blob/master/data/data_new.csv
# Only the three columns used by the analysis below are kept.
syn_df = pd.read_csv('data/data_new.csv')[['cost_t', 'gagne_sum_t', 'program_enrolled_t']]

# Results of re-applying synpop to the synthetic data provided by Obermeyer et al.
# NOTE(review): the file name has a doubled '.csv.csv' extension -- confirm it matches the file on disk.
syn_syn_df = pd.read_csv('mysyn.csv.csv')

## Compute treatment-conditional error rates on the data provided by Obermeyer et al. and on the re-synthesized data

In [35]:
# Print LaTeX tables of FPR/FNR (full population, unenrolled, enrolled) for the
# original synthetic data and then for the re-synthesized data.
# NOTE: get_enrollment_conditional_rates is defined in a cell further down this
# notebook; on a fresh kernel that cell must be executed before this one.
get_enrollment_conditional_rates(syn_df, cuttoff=.55)
get_enrollment_conditional_rates(syn_syn_df, cuttoff=.55)

\begin{tabular}{rlrrr}
\hline
    & Population      &   FPR &   FNR &     N \\
\hline
  0 & Full population &  0.37 &  0.38 & 48784 \\
  1 & Unenrolled      &  0.37 &  0.39 & 48332 \\
  2 & Enrolled        &  0.64 &  0.13 &   452 \\
\hline
\end{tabular}
\begin{tabular}{rlrrr}
\hline
    & Population      &   FPR &   FNR &     N \\
\hline
  0 & Full population &  0.36 &  0.39 & 48784 \\
  1 & Unenrolled      &  0.36 &  0.39 & 48360 \\
  2 & Enrolled        &  0.65 &  0.14 &   424 \\
\hline
\end{tabular}


In [34]:
def FNR(a, b):
    """Return the false-negative rate of ``a`` measured against ``b``.

    Counts the positions where ``a`` is False while ``b`` is True and divides
    by the number of positions where ``b`` is True. Both arguments are
    compared element-wise, so they are expected to be aligned array-likes
    (e.g. pandas Series) of boolean values.
    """
    reference_positive = (b == True)
    missed = (a == False) & reference_positive
    return missed.sum() / reference_positive.sum()

def FPR(a, b):
    """Return the false-positive rate of ``a`` measured against ``b``.

    Counts the positions where ``a`` is True while ``b`` is False and divides
    by the number of positions where ``b`` is False. Both arguments are
    compared element-wise, so they are expected to be aligned array-likes
    (e.g. pandas Series) of boolean values.
    """
    reference_negative = (b == False)
    false_alarms = (a == True) & reference_negative
    return false_alarms.sum() / reference_negative.sum()

def get_enrollment_conditional_rates(healthdf, cuttoff=.55):
    """Print FPR/FNR of the cost proxy against the health label, by enrollment.

    Both ``cost_t`` and ``gagne_sum_t`` are binarized by flagging rows that lie
    strictly above the ``cuttoff`` quantile of their own column. The binarized
    cost is then treated as the prediction and the binarized ``gagne_sum_t`` as
    the reference label, and FPR/FNR are computed for three populations: all
    rows, rows with ``program_enrolled_t == 0`` and rows with
    ``program_enrolled_t == 1``.

    Parameters
    ----------
    healthdf : pandas.DataFrame
        Must contain the columns ``cost_t``, ``gagne_sum_t`` and
        ``program_enrolled_t``.
    cuttoff : float, default .55
        Quantile (between 0 and 1) used as the screening threshold for both
        columns. Earlier versions of this function ignored this argument and
        always used .55; it is now honoured. The spelling of the keyword is
        kept because existing callers pass it by name.

    Returns
    -------
    None
        The result table is printed in LaTeX format via ``tabulate``.
    """
    proxy = 'total_cost'

    # Screen-in flags: strictly above the requested quantile of each column.
    cost_screen_in = healthdf['cost_t'] > np.quantile(healthdf['cost_t'], cuttoff)
    health_screen_in = healthdf['gagne_sum_t'] > np.quantile(healthdf['gagne_sum_t'], cuttoff)

    # Chained rename instead of inplace=True; the result is a fresh frame.
    outcome_df = pd.concat(
        [cost_screen_in, health_screen_in, healthdf['program_enrolled_t']],
        axis=1,
    ).rename(columns={
        'cost_t': 'total_cost',
        'gagne_sum_t': 'health',
        'program_enrolled_t': 'enrolled',
    })

    # Row order here fixes the row order of the printed table.
    populations = [
        ('Full population', outcome_df),
        ('Unenrolled', outcome_df[outcome_df['enrolled'] == 0]),
        ('Enrolled', outcome_df[outcome_df['enrolled'] == 1]),
    ]

    proxy_metrics = pd.DataFrame({
        'Population': [label for label, _ in populations],
        'FPR': [FPR(subset[proxy], subset['health']) for _, subset in populations],
        'FNR': [FNR(subset[proxy], subset['health']) for _, subset in populations],
        # outcome_df shares healthdf's rows, so subset sizes equal the enrollment counts.
        'N': [len(subset) for _, subset in populations],
    })

    print(tabulate(proxy_metrics, headers = 'keys', tablefmt = 'latex', floatfmt=".2f"))
