In [None]:
### Code creating a synthetic DGP using congestive heart failure and prediction data from zero-shot classifier and regex matching.

In [2]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit
import pickle

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *
from cohens_kappa import *

In [3]:
master_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl-congestiveheartfailure-document.csv')
# ** lung, leg, feet
# family has most accurate and least variance predictions so far
regex_preds = regular_expression_predict(master_data['notes_half2'], ['family'])
# note: family seems to predict when heart failure does NOT occur, so I flip the predictions
for i in range(len(regex_preds)):
    if regex_preds[i] == 0:
        regex_preds[i] = 1
    elif regex_preds[i] == 1:
        regex_preds[i] = 0

print(np.mean(master_data['heart_fail'] == zero_shot_preds['prediction']))
print(np.mean(zero_shot_preds['prediction']))
print(create_confusion_matrix(master_data['heart_fail'], zero_shot_preds['prediction']))
print(cohens_kappa(master_data['heart_fail'], zero_shot_preds['prediction']))
print()

print(np.mean(master_data['heart_fail'] == regex_preds))
print(np.mean(regex_preds))
print(create_confusion_matrix(master_data['heart_fail'], regex_preds))
print(cohens_kappa(master_data['heart_fail'], regex_preds))

0.7817391599606125
0.16943397507724695
{'tn': 19853, 'fp': 1820, 'fn': 4608, 'tp': 3170, 'sensitivity': 0.40755978400617127, 'specificity': 0.9160245466709731, 'precision': 0.6352705410821643, 'recall': 0.40755978400617127}
0.3655922679927455

0.28257105021900786
0.9805099996604529
{'tn': 559, 'fp': 21114, 'fn': 15, 'tp': 7763, 'sensitivity': 0.9980714836718951, 'specificity': 0.025792460665343978, 'precision': 0.2688298645981231, 'recall': 0.9980714836718951}
0.012764417597259823


In [5]:
semi_synthetic_data = pd.DataFrame({'U': master_data['heart_fail'], 'W': zero_shot_preds['prediction'], 'Z': regex_preds,
                                    'age': master_data['age'], 'gender': master_data['gender'], 'notes_half2': master_data['notes_half2']})

### Find 10 examples where the text data had the word family in it.

In [8]:
# find 10 examples
examples = []

# false positive is defined by element in W being 1 but element in U is 0
# false negative is defined by element in W being 0 but element in U is 1
for index, row in semi_synthetic_data.iterrows():
    if row['Z'] == 0 and len(examples) < 10:
        examples.append(row['notes_half2'])

    if len(examples) == 10:
        break

In [9]:
for sentence in examples:
    print(sentence)
    print()

pleted Pulses all present. No signs of hematoma at angio site. RESP: on room air with sat 94-100%. Lungs clear to ascutation RENAL: maintenance fluids @ 70. Urine output 40-50cc/hr GI: No further vomiting or nausea.  HO attempted unsuccessfully to place ng tube early in shift but great resistance so attempt aborted ENDO: BS within acceptable limits ID: afebrile, on clindamycin SKIN: Large forehead lac sutured.  Cleaned and bacitracin applied. FAMILY:  wife spent the night at the bedside. A: s/p head trauma with fx skull, intact neuro status P: clear cervical neck, back ASAP ?? to OR today for repair of sinus fractures ?? change to pca pump for improved pain control. 

ESTING DONE, BRAIN DEATH CRITERIA MET. FAMILY MEETING WITH DR. [**Last Name (STitle) 396**]. FAMILY WANTED TO EXTUBATE PT AND ALLOW TO PASS AWAY. PT EXTUBATED AT 1157 BY DR. [**Last Name (STitle) 396**], PRONOUNCED AT 1229. FAMILY PRESENT. POST-MORTEM CARE GIVEN. 

f 12mg IV haldol from both EW and TSICU). Baseline Qtc fr

In [19]:
# generate A and Z as a function of U
np.random.seed(1)

size = len(semi_synthetic_data)

print(np.mean(semi_synthetic_data['age']))
print(np.std(semi_synthetic_data['age']))

C = np.random.normal(76.4, 56.8, size)

A = np.random.binomial(1, expit(0.8*semi_synthetic_data['U'] + 0.8*semi_synthetic_data['gender'] + 0.8*(semi_synthetic_data['age'] - 67)), size)

print(np.mean(A))

Y = np.random.normal(0, 1, size) + 1.3*A + 1.4*semi_synthetic_data['U'] + 0.8*semi_synthetic_data['gender'] + 0.5*semi_synthetic_data['age']

semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

76.41380598281891
56.78193469093504
0.5110183015856847


In [20]:
print(odds_ratio('U', 'W', [], semi_synthetic_data))
print(odds_ratio('U', 'Z', [], semi_synthetic_data))

print()
print(np.mean(semi_synthetic_data['W'] == semi_synthetic_data['Z']))
print(cohens_kappa(semi_synthetic_data['W'], semi_synthetic_data['Z']))
print()

print(odds_ratio('W', 'Z', ['age', 'gender'], semi_synthetic_data))
print(odds_ratio('W', 'Z', [], semi_synthetic_data))
print()

print(odds_ratio('W', 'Z', ['U'], semi_synthetic_data))
print(odds_ratio('W', 'Z', ['U', 'age', 'gender'], semi_synthetic_data))

7.504150688720684
13.701862901506212

0.18647923669824454
0.0050873139746626305

2.836124844592412
3.09471127245867

1.5552514148538126
1.5131245220417733


In [21]:
print(backdoor_adjustment('Y', 'A', ['U', 'gender', 'age'], semi_synthetic_data))
print(compute_confidence_intervals_backdoor('Y', 'A', ['U', 'gender', 'age'], semi_synthetic_data, 'backdoor'))

1.2962545868000959
(1.27076719689324, 1.3177239594871473, [1.303260734219407, 1.309604069540768, 1.2780711438352768, 1.2979583473266558, 1.2882510409711827, 1.271225255790334, 1.2708251516803912, 1.306180963542019, 1.2845434812739924, 1.2708154051172613, 1.291173478671439, 1.2793151604946615, 1.2910306460994434, 1.2955214749227153, 1.3302933762108182, 1.2817550426949964, 1.302633819338567, 1.2968664259579796, 1.3175650222166055, 1.2711806695446484, 1.3035664501730153, 1.278564976721448, 1.3056258275075692, 1.311380359967444, 1.2803190025909004, 1.3079182325531988, 1.3022752911452287, 1.2942208834049467, 1.3107954410106402, 1.285699171552558, 1.3075447860222411, 1.3024209116453846, 1.2878409692154875, 1.2828061437816203, 1.279372065541132, 1.3009433830609254, 1.290113372104166, 1.2938417058032243, 1.3059122174119508, 1.2987317290245244, 1.2891639529494299, 1.2796577958454378, 1.2986979198233826, 1.2984777707064126, 1.2846291791685474, 1.3175799179786623, 1.2750347183065571, 1.2962963390

### Use proximal with two proxies that are just random chance.

In [10]:
np.random.seed(2)

random_classifier1 = np.random.binomial(1, 0.5, len(semi_synthetic_data))

random_classifier2 = np.random.binomial(1, 0.5, len(semi_synthetic_data))

semi_synthetic_data['R1'] = random_classifier1
semi_synthetic_data['R2'] = random_classifier2

ace = proximal_find_ace('A', 'Y', 'R1', 'R2', ['age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals('A', 'Y', 'R1', 'R2', ['age', 'gender'], semi_synthetic_data)

with open('pickle_files/heart_baseline.pkl', 'wb') as file:
    pickle.dump((ace, conf_int[0], conf_int[1]), file)

### Use backdoor with Flan-T5 as the predictions.

In [8]:
ace = backdoor_adjustment('Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals_backdoor('Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data, 'backdoor')

with open('pickle_files/heart_backdoor.pkl', 'wb') as file:
    pickle.dump((ace, conf_int[0], conf_int[1]), file)

### Use proximal with Flan-T5 and Lexicon predictions.

In [22]:
ace = proximal_find_ace('A', 'Y', 'W', 'Z', ['age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals("A", "Y", "W", "Z", ['age', 'gender'], semi_synthetic_data)
print(ace)
print(conf_int)

1.267184362667072
(1.164310122504327, 1.3919867494676235)


In [None]:
with open('pickle_files/heart_proximal.pkl', 'wb') as file:
    pickle.dump((ace, conf_int[0], conf_int[1]), file)