### Code creating a semi-synthetic data-generating process (DGP) from acute kidney failure labels and the predictions of a zero-shot classifier and a regex matcher.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *

In [49]:
# Load the master table and the zero-shot classifier's predictions.
master_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl-acutekidneyfailure-sentence.csv')

# Regex-based predictions over the 'notes_half2' column, using two spellings
# of the abscess stem. Earlier keyword candidates ('rhythm', 'flat') were
# dropped; 'rhythm' was noted as not correlated enough with kidney failure.
abscess_patterns = ['absces', 'abces']
regex_preds = regular_expression_predict(master_data['notes_half2'], abscess_patterns)

# For each predictor (zero-shot first, then regex) report: agreement with the
# kidney-failure label, the mean of the predictions, and the confusion matrix.
# A blank line separates the two reports.
kidney_fail_labels = master_data['kidney_fail']
for report_idx, predictions in enumerate((zero_shot_preds['prediction'], regex_preds)):
    if report_idx > 0:
        print()
    print(np.mean(kidney_fail_labels == predictions))
    print(np.mean(predictions))
    print(create_confusion_matrix(kidney_fail_labels, predictions))

0.7777664595429696
0.048623136735594714
{'tn': 22456, 'fp': 982, 'fn': 5563, 'tp': 450, 'sensitivity': 0.07483785132213537, 'specificity': 0.9581022271524874, 'precision': 0.31424581005586594, 'recall': 0.07483785132213537}

0.7940647176666327
0.00909986078571186
{'tn': 23278, 'fp': 160, 'fn': 5905, 'tp': 108, 'sensitivity': 0.01796108431731249, 'specificity': 0.993173478965782, 'precision': 0.40298507462686567, 'recall': 0.01796108431731249}


In [50]:
# Assemble the analysis frame:
#   U           - kidney-failure label from the master table
#   W           - zero-shot classifier prediction
#   Z           - regex-based prediction
#   age, gender - covariates copied from the master table
frame_columns = {
    'U': master_data['kidney_fail'],
    'W': zero_shot_preds['prediction'],
    'Z': regex_preds,
    'age': master_data['age'],
    'gender': master_data['gender'],
}
semi_synthetic_data = pd.DataFrame(frame_columns)

In [51]:
# Odds ratio between U and each of the two predictions, with an empty third
# argument (presumably the conditioning set -- confirm against odds_ratio).
for prediction_col in ('W', 'Z'):
    print(odds_ratio('U', prediction_col, [], semi_synthetic_data))

# Fraction of rows on which the two predictions agree, framed by blank lines.
print()
predictions_agree = semi_synthetic_data['W'] == semi_synthetic_data['Z']
print(np.mean(predictions_agree))
print()

# Odds ratio between W and Z, first given U alone and then given U plus the
# age and gender covariates.
for given_cols in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', given_cols, semi_synthetic_data))

1.849798256081056
2.6609060118536405

0.943363553020271

1.0890150753080436
1.1796050454689975


In [74]:
# Simulate the synthetic part of the DGP: a covariate C, a binary treatment A
# and a continuous outcome Y. Both A and Y are functions of U and C.
# The seed is fixed at 9 for reproducibility; the draws depend on the order of
# the np.random calls below, so do not reorder these statements.
np.random.seed(9)

size = len(semi_synthetic_data)

# C: standard-normal covariate that enters both the treatment and the outcome.
C = np.random.normal(0, 1, size)

# A: Bernoulli draw with success probability expit(0.8*U + C).
A = np.random.binomial(1, expit(0.8*semi_synthetic_data['U']+C), size)

# Fraction of rows with A == 1.
print(np.mean(A))

# Y: standard-normal noise + 1.3*A + 1.4*U + C, so the additive effect of A on
# Y is 1.3 by construction.
Y = np.random.normal(0, 1, size) + 1.3*A + 1.4*semi_synthetic_data['U'] + C

# Store the simulated variables alongside U, W and Z.
semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

0.5341414552986317


In [75]:
# Point estimate from proximal_find_ace for treatment A and outcome Y, using
# W and Z together with the covariate list ['C'].
ace_estimate = proximal_find_ace('A', 'Y', 'W', 'Z', ['C'], semi_synthetic_data)
print(ace_estimate)

# Interval returned by compute_confidence_intervals for the same arguments.
ace_interval = compute_confidence_intervals('A', 'Y', 'W', 'Z', ['C'], semi_synthetic_data)
print(ace_interval)

1.319135521760253
(0.6837850403613688, 2.3814183032698906)
