### Code creating a semi-synthetic data-generating process (DGP) from congestive heart failure labels and predictions from a zero-shot classifier and regex matching.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit
import pickle

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *
from cohens_kappa import *

In [2]:
# Read the master table and the zero-shot classifier's document-level predictions.
master_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl-congestiveheartfailure-document.csv')

# Regex-based predictions computed from the `notes_half2` column.
# Keywords noted alongside: lung, leg, feet.
# 'family' has the most accurate and least-variance predictions so far.
regex_preds = regular_expression_predict(master_data['notes_half2'], ['family'])


def _report_predictor(truth, preds):
    """Print agreement rate, mean prediction, confusion matrix and Cohen's kappa."""
    print(np.mean(truth == preds))
    print(np.mean(preds))
    print(create_confusion_matrix(truth, preds))
    print(cohens_kappa(truth, preds))


# Zero-shot classifier first, a blank separator line, then the regex predictor.
_report_predictor(master_data['heart_fail'], zero_shot_preds['prediction'])
print()
_report_predictor(master_data['heart_fail'], regex_preds)

0.7817391599606125
0.16943397507724695
{'tn': 19853, 'fp': 1820, 'fn': 4608, 'tp': 3170, 'sensitivity': 0.40755978400617127, 'specificity': 0.9160245466709731, 'precision': 0.6352705410821643, 'recall': 0.40755978400617127}
0.3655922679927455

0.7174289497809921
0.019490000339547044
{'tn': 21114, 'fp': 559, 'fn': 7763, 'tp': 15, 'sensitivity': 0.0019285163281049112, 'specificity': 0.974207539334656, 'precision': 0.02613240418118467, 'recall': 0.0019285163281049112}
-0.03394120679088907


In [3]:
# Assemble the analysis frame: U is the recorded heart-failure label, W the
# zero-shot prediction, Z the regex prediction; age and gender are copied over.
semi_synthetic_data = pd.DataFrame(dict(
    U=master_data['heart_fail'],
    W=zero_shot_preds['prediction'],
    Z=regex_preds,
    age=master_data['age'],
    gender=master_data['gender'],
))

In [4]:
# Simulate the treatment A and the outcome Y as functions of U, gender and age,
# plus an extra normally distributed column C that depends on nothing else.
np.random.seed(1)

size = len(semi_synthetic_data)

# Empirical mean and standard deviation of age, printed for reference.
print(np.mean(semi_synthetic_data['age']))
print(np.std(semi_synthetic_data['age']))

# C uses hard-coded parameters (76.4, 56.8); per the recorded output these are
# the rounded age mean/std printed just above.
C = np.random.normal(76.4, 56.8, size)

# Treatment assignment: Bernoulli draw whose log-odds are linear in U, gender and
# age (age shifted by 67).
treatment_log_odds = (
    0.8*semi_synthetic_data['U']
    + 0.8*semi_synthetic_data['gender']
    + 0.8*(semi_synthetic_data['age'] - 67)
)
A = np.random.binomial(1, expit(treatment_log_odds), size)

print(np.mean(A))

# Outcome: standard-normal noise plus a linear function of A, U, gender and age.
# The coefficient on A is 1.3.
outcome_noise = np.random.normal(0, 1, size)
Y = (
    outcome_noise
    + 1.3*A
    + 1.4*semi_synthetic_data['U']
    + 0.8*semi_synthetic_data['gender']
    + 0.5*semi_synthetic_data['age']
)

semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

76.41380598281891
56.78193469093504
0.5110183015856847


In [14]:
# Unconditional odds ratio between U and each of the two proxies.
for proxy in ('W', 'Z'):
    print(odds_ratio('U', proxy, [], semi_synthetic_data))

# Raw agreement and Cohen's kappa between the two proxies.
print()
proxies_match = semi_synthetic_data['W'] == semi_synthetic_data['Z']
print(np.mean(proxies_match))
print(cohens_kappa(semi_synthetic_data['W'], semi_synthetic_data['Z']))
print()

# Odds ratio between W and Z given U alone, then given U, age and gender.
for conditioning_set in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', conditioning_set, semi_synthetic_data))

7.504150688720684
0.07298277666244009

0.8135207633017555
-0.022815985881190593

0.6429828582370919
0.6608841410161134


In [43]:
# Backdoor estimate of the effect of A on Y, adjusting for U itself together
# with gender and age.
oracle_ace = backdoor_adjustment('Y', 'A', ['U', 'gender', 'age'], semi_synthetic_data)
print(oracle_ace)

# Confidence-interval output for the same adjustment set.
oracle_conf_int = compute_confidence_intervals_backdoor(
    'Y', 'A', ['U', 'gender', 'age'], semi_synthetic_data, 'backdoor'
)
print(oracle_conf_int)

1.2918325843955856
(1.2655233635217236, 1.3164800118871192, [1.296887582784258, 1.2948778462856652, 1.3018044775218272, 1.291597786135675, 1.3002897957689328, 1.280222881487333, 1.289776851792503, 1.2704407579928798, 1.320639459125175, 1.2733072626532973, 1.279903331631857, 1.2986387724331792, 1.2827534387564299, 1.2697685158736292, 1.2953879848231367, 1.2838603051082629, 1.2998189350467442, 1.2794707036873518, 1.3122577626291516, 1.3146542219300485, 1.2817106815110506, 1.2727406837399116, 1.2927837505643254, 1.3037737650107601, 1.3206825837711733, 1.2909268706564632, 1.3057128771215574, 1.2986136707801634, 1.2660589128754154, 1.289819480935563, 1.2934052035203223, 1.2940403394370392, 1.2961135616307686, 1.286342584170228, 1.3118044387010954, 1.2985905999843936, 1.292427996328648, 1.2800369477252005, 1.2953727745612653, 1.2956586747705217, 1.294715095937022, 1.2832911712335147, 1.286472057481319, 1.3022493700280364, 1.2655398281949317, 1.289451524426866, 1.2934272208966604, 1.313060011

### Baseline: use the proximal estimator with two proxies that are pure random coin flips.

In [10]:
# Baseline run: both proxies are independent fair coin flips.
np.random.seed(2)

n_rows = len(semi_synthetic_data)

# Draw order matters for reproducibility: R1 is drawn first, then R2.
random_classifier1 = np.random.binomial(1, 0.5, n_rows)
random_classifier2 = np.random.binomial(1, 0.5, n_rows)

semi_synthetic_data['R1'] = random_classifier1
semi_synthetic_data['R2'] = random_classifier2

# Proximal estimate and its confidence-interval output, with age and gender as covariates.
ace = proximal_find_ace('A', 'Y', 'R1', 'R2', ['age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals('A', 'Y', 'R1', 'R2', ['age', 'gender'], semi_synthetic_data)

# Persist the point estimate with the first two entries of the interval output.
baseline_result = (ace, conf_int[0], conf_int[1])
with open('pickle_files/heart_baseline.pkl', 'wb') as pkl_out:
    pickle.dump(baseline_result, pkl_out)

### Use backdoor adjustment with the Flan-T5 predictions in the adjustment set.

In [8]:
# Backdoor estimate of the effect of A on Y, adjusting for W (the zero-shot
# prediction) in place of U, together with age and gender.
ace = backdoor_adjustment('Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals_backdoor(
    'Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data, 'backdoor'
)

# Persist the point estimate with the first two entries of the interval output.
backdoor_result = (ace, conf_int[0], conf_int[1])
with open('pickle_files/heart_backdoor.pkl', 'wb') as pkl_out:
    pickle.dump(backdoor_result, pkl_out)

### Use the proximal estimator with the Flan-T5 (W) and lexicon/regex (Z) predictions as the two proxies.

In [9]:
# Proximal estimate using W and Z as the two proxies, with age and gender as covariates.
ace = proximal_find_ace('A', 'Y', 'W', 'Z', ['age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals(
    "A", "Y", "W", "Z", ['age', 'gender'], semi_synthetic_data
)

# Persist the point estimate with the first two entries of the interval output.
proximal_result = (ace, conf_int[0], conf_int[1])
with open('pickle_files/heart_proximal.pkl', 'wb') as pkl_out:
    pickle.dump(proximal_result, pkl_out)