### Build a semi-synthetic data-generating process (DGP) from acute kidney failure labels and predictions from a zero-shot classifier and regex matching.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit
import pickle

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *
from cohens_kappa import *

In [2]:
# Load the master table and the zero-shot classifier's document-level predictions.
master_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl-acutekidneyfailure-document.csv')

# Regex-based predictions computed from the 'notes_half2' column with the single keyword 'liver'.
# Other keywords noted while exploring: rhythm (not correlated enough with kidney failure),
# flat, abscess / abcess.
regex_preds = regular_expression_predict(master_data['notes_half2'], ['liver'])

# For each set of predictions, print in order: agreement with the kidney_fail label,
# the mean prediction, and the confusion-matrix summary. A blank line separates the two reports.
truth = master_data['kidney_fail']
proxies = [zero_shot_preds['prediction'], regex_preds]
for idx, preds in enumerate(proxies):
    if idx > 0:
        print()
    print(np.mean(truth == preds))
    print(np.mean(preds))
    print(create_confusion_matrix(truth, preds))

0.7952870870259074
0.0014940069946691114
{'tn': 23408, 'fp': 30, 'fn': 5999, 'tp': 14, 'sensitivity': 0.002328288707799767, 'specificity': 0.9987200273060841, 'precision': 0.3181818181818182, 'recall': 0.002328288707799767}

0.7949135852772402
0.0013242334725476214
{'tn': 23405, 'fp': 33, 'fn': 6007, 'tp': 6, 'sensitivity': 0.0009978380176284716, 'specificity': 0.9985920300366925, 'precision': 0.15384615384615385, 'recall': 0.0009978380176284716}


In [3]:
# Assemble the working frame:
#   U      - the kidney_fail label from the master table
#   W      - the zero-shot classifier's prediction
#   Z      - the regex-based prediction
#   age, gender - copied from the master table
# NOTE(review): later cells pass W and Z to the proximal estimator as the two proxies,
# so U is presumably the confounder they stand in for - confirm.
proxy_columns = {
    'U': master_data['kidney_fail'],
    'W': zero_shot_preds['prediction'],
    'Z': regex_preds,
    'age': master_data['age'],
    'gender': master_data['gender'],
}
semi_synthetic_data = pd.DataFrame(proxy_columns)

In [4]:
# Odds ratio of U with each of the two proxies, with an empty third argument
# (presumably the conditioning set - confirm against odds_ratio).
for proxy in ('W', 'Z'):
    print(odds_ratio('U', proxy, [], semi_synthetic_data))

# Raw agreement rate and Cohen's kappa between the two proxies.
print()
proxy_w, proxy_z = semi_synthetic_data['W'], semi_synthetic_data['Z']
print(np.mean(proxy_w == proxy_z))
print(cohens_kappa(proxy_w, proxy_z))
print()

# Odds ratio between W and Z given U alone, then given U together with age and gender.
for conditioning_set in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', conditioning_set, semi_synthetic_data))

1.820925709840825
0.7084159389800483

0.9971817595327833
-0.0014059805792845281

4.057584218944586e-08
3.022969494491038e-08


In [5]:
# Generate the synthetic treatment A and outcome Y as functions of U (plus age and gender),
# together with an extra normally distributed column C.
# The seed is fixed so the draws are reproducible. The order of the np.random calls below
# determines which random values each variable receives, so do not reorder them.
np.random.seed(1)

size = len(semi_synthetic_data)

# Print the empirical mean and standard deviation of age.
print(np.mean(semi_synthetic_data['age']))
print(np.std(semi_synthetic_data['age']))

# C ~ Normal(76.4, 56.8), drawn without reference to any other column.
# NOTE(review): 76.4 / 56.8 look like rounded copies of the age mean/std printed above - confirm.
C = np.random.normal(76.4, 56.8, size)

# A ~ Bernoulli(expit(0.8*U + 0.8*gender + 0.8*(age - 67))).
# NOTE(review): age is centred at 67 here rather than at the printed mean - confirm this is intended.
A = np.random.binomial(1, expit(0.8*semi_synthetic_data['U'] + 0.8*semi_synthetic_data['gender'] + 0.8*(semi_synthetic_data['age'] - 67)), size)

# Share of rows with A == 1.
print(np.mean(A))

# Y = N(0, 1) noise + 1.3*A + 1.4*U + 0.8*gender + 0.5*age.
# The coefficient on A (1.3) is the treatment effect built into this DGP.
Y = np.random.normal(0, 1, size) + 1.3*A + 1.4*semi_synthetic_data['U'] + 0.8*semi_synthetic_data['gender'] + 0.5*semi_synthetic_data['age']

# Attach the generated columns to the frame (this modifies semi_synthetic_data in place).
semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

76.41380598281891
56.78193469093504
0.5104071169060473


### Baseline: use proximal inference with two proxies that are pure random chance.

In [6]:
# Re-seed so the two coin-flip proxies below are reproducible.
np.random.seed(2)

n_rows = len(semi_synthetic_data)

# Two Bernoulli(0.5) draws per row, generated without reference to any column of the data.
random_classifier1 = np.random.binomial(1, 0.5, n_rows)
random_classifier2 = np.random.binomial(1, 0.5, n_rows)

for column, draws in (('R1', random_classifier1), ('R2', random_classifier2)):
    semi_synthetic_data[column] = draws

# Run the proximal estimator with R1 and R2 in the two proxy slots,
# then compute its confidence interval with the same arguments.
ace = proximal_find_ace(
    'A', 'Y', 'R1', 'R2', ['age', 'gender'], semi_synthetic_data
)
conf_int = compute_confidence_intervals(
    'A', 'Y', 'R1', 'R2', ['age', 'gender'], semi_synthetic_data
)

# Persist (estimate, conf_int[0], conf_int[1]) as a single pickled tuple.
with open('pickle_files/kidney_baseline.pkl', 'wb') as pkl_out:
    baseline_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(baseline_result, pkl_out)

### Use backdoor adjustment with the Flan-T5 predictions.

In [7]:
# Backdoor estimate with W, age and gender passed as the covariate list,
# followed by the matching confidence-interval call.
ace = backdoor_adjustment(
    'Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data
)
conf_int = compute_confidence_intervals_backdoor(
    'Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data, 'backdoor'
)

# Persist (estimate, conf_int[0], conf_int[1]) as a single pickled tuple.
with open('pickle_files/kidney_backdoor.pkl', 'wb') as pkl_out:
    backdoor_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(backdoor_result, pkl_out)

### Use proximal with Flan-T5 and Lexicon predictions.

In [8]:
# Proximal estimate with W and Z in the two proxy slots and age/gender as covariates,
# followed by the confidence-interval call with the same arguments.
ace = proximal_find_ace(
    'A', 'Y', 'W', 'Z', ['age', 'gender'], semi_synthetic_data
)
conf_int = compute_confidence_intervals(
    'A', 'Y', 'W', 'Z', ['age', 'gender'], semi_synthetic_data
)

# Persist (estimate, conf_int[0], conf_int[1]) as a single pickled tuple.
with open('pickle_files/kidney_proximal.pkl', 'wb') as pkl_out:
    proximal_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(proximal_result, pkl_out)