### Code creating a semi-synthetic data-generating process (DGP) from congestive heart failure labels and predictions from a zero-shot classifier and regex matching.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *

In [124]:
# Load the master table and the saved zero-shot classifier predictions.
master_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl-congestiveheartfailure-sentence.csv')

# Regex-based predictions from the second half of the notes.
# Author's notes: "** lung, leg, feet" (presumably other keywords that were
# tried -- TODO confirm); 'family' has the most accurate and least-variance
# predictions so far.
regex_preds = regular_expression_predict(master_data['notes_half2'], ['family'])


def _report_predictor(truth, predicted):
    """Print three diagnostics comparing `predicted` against `truth`.

    In order: the fraction of rows where the two agree, the mean of the
    predictions, and the summary returned by `create_confusion_matrix`.
    """
    print(np.mean(truth == predicted))
    print(np.mean(predicted))
    print(create_confusion_matrix(truth, predicted))


# Zero-shot classifier first, then the regex predictor, separated by a blank line.
_report_predictor(master_data['heart_fail'], zero_shot_preds['prediction'])
print()

_report_predictor(master_data['heart_fail'], regex_preds)

0.5999117177684968
0.5498285287426573
{'tn': 11574, 'fp': 10099, 'fn': 1684, 'tp': 6094, 'sensitivity': 0.783491900231422, 'specificity': 0.5340285147418447, 'precision': 0.37633545359105786, 'recall': 0.783491900231422}

0.7174289497809921
0.019490000339547044
{'tn': 21114, 'fp': 559, 'fn': 7763, 'tp': 15, 'sensitivity': 0.0019285163281049112, 'specificity': 0.974207539334656, 'precision': 0.02613240418118467, 'recall': 0.0019285163281049112}


In [125]:
# Assemble the analysis frame, one column per variable.
semi_synthetic_data = pd.DataFrame({
    'U': master_data['heart_fail'],        # heart-failure label from the master table
    'W': zero_shot_preds['prediction'],    # zero-shot classifier prediction
    'Z': regex_preds,                      # regex-matching prediction
    'age': master_data['age'],
    'gender': master_data['gender'],
})

In [126]:
# Simulate a covariate C, a binary treatment A and a continuous outcome Y.
# Both A and Y depend on U (the heart-failure label) and on C.
# The order of the random draws below is fixed by the seed; do not reorder.
np.random.seed(3)

size = len(semi_synthetic_data)
_heart_fail = semi_synthetic_data['U']

# C ~ Normal(0, 1)
C = np.random.normal(0, 1, size)

# A ~ Bernoulli(expit(0.8*U + C))
_treatment_prob = expit(0.8*_heart_fail + C)
A = np.random.binomial(1, _treatment_prob, size)

# Fraction of treated rows.
print(np.mean(A))

# Y = noise + 1.3*A + 1.4*U + C, with Normal(0, 1) noise.
# The coefficient on A (1.3) is the treatment effect built into the simulation.
_outcome_noise = np.random.normal(0, 1, size)
Y = _outcome_noise + 1.3*A + 1.4*_heart_fail + C

# Store the simulated variables on the frame (column order: A, Y, C).
semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

0.5373331975145156


In [127]:
# Association of U with each prediction column (W, then Z), using the
# project-local `odds_ratio` helper with an empty third argument.
for _prediction_col in ('W', 'Z'):
    print(odds_ratio('U', _prediction_col, [], semi_synthetic_data))

# Fraction of rows on which the two predictions agree.
print()
print(np.mean(semi_synthetic_data['W'] == semi_synthetic_data['Z']))
print()

# Association between W and Z, passing first ['U'] and then
# ['U', 'age', 'gender'] as the third argument (presumably the conditioning
# set -- confirm against `odds_ratio`).
for _third_arg in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', _third_arg, semi_synthetic_data))

4.147300160712981
0.07298277666244009

0.4485416454449764

0.9351932275373761
0.9575241819628402


In [119]:
# Backdoor estimate for outcome Y and treatment A using U and C, followed by
# its confidence interval. Both helpers are project-local.
# NOTE(review): the saved execution count of this cell (In [119]) is lower than
# that of the surrounding cells (In [124]-[128]); re-run after the
# data-generation cells to confirm the output matches the current data.
_backdoor_estimate = backdoor_adjustment('Y', 'A', ['U', 'C'], semi_synthetic_data)
print(_backdoor_estimate)

_backdoor_interval = compute_confidence_intervals_backdoor(
    'Y', 'A', ['U', 'C'], semi_synthetic_data, 'backdoor'
)
print(_backdoor_interval)

1.3286920600514924
(1.305431040873405, 1.3542336469444622)


In [128]:
# Proximal estimate with treatment A, outcome Y, the two prediction columns
# W and Z, and C as the extra list argument, followed by its confidence
# interval. Both helpers are project-local.
_proximal_estimate = proximal_find_ace('A', 'Y', 'W', 'Z', ['C'], semi_synthetic_data)
print(_proximal_estimate)

_proximal_interval = compute_confidence_intervals(
    'A', 'Y', 'W', 'Z', ['C'], semi_synthetic_data
)
print(_proximal_interval)

1.3445608163970555
(1.192421821500404, 1.425253643270937)
