### Code creating a semi-synthetic data-generating process (DGP) from acute kidney failure labels and predictions produced by a zero-shot classifier and by regex matching.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit
import pickle

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *

In [None]:
# Load the master table and the zero-shot classifier's document-level predictions.
master_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl-acutekidneyfailure-document.csv')

# Regex-based predictions computed from the 'notes_half2' column with the
# single keyword 'liver' (presumably one 0/1 label per note -- confirm against
# regular_expression_predict).
regex_preds = regular_expression_predict(master_data['notes_half2'], ['liver'])

# Swap the two labels (0 <-> 1) so that the resulting odds ratio is greater
# than 1. Relabelling a binary variable does not affect the downstream
# estimation tasks. Entries that are neither 0 nor 1 are left untouched.
flipped_label = {0: 1, 1: 0}
for idx in range(len(regex_preds)):
    current = regex_preds[idx]
    if current in flipped_label:
        regex_preds[idx] = flipped_label[current]

# For each set of predictions (zero-shot first, then regex) report:
#   1. the fraction of rows where the prediction equals the kidney_fail label,
#   2. the mean of the predictions (share of 1s for 0/1 labels),
#   3. the confusion matrix against the kidney_fail label.
# A blank line separates the two reports.
kidney_fail_labels = master_data['kidney_fail']
for report_number, proxy_preds in enumerate((zero_shot_preds['prediction'], regex_preds)):
    if report_number > 0:
        print()
    print(np.mean(kidney_fail_labels == proxy_preds))
    print(np.mean(proxy_preds))
    print(create_confusion_matrix(kidney_fail_labels, proxy_preds))

Experiment 1: Flan-T5 and Regex Predictor

In [6]:
# Experiment 1 dataset: U is the kidney-failure label, W the zero-shot
# prediction, Z the regex prediction; age and gender are copied from the
# master table.
experiment1_columns = {
    'U': master_data['kidney_fail'],
    'W': zero_shot_preds['prediction'],
    'Z': regex_preds,
    'age': master_data['age'],
    'gender': master_data['gender'],
}
semi_synthetic_data = pd.DataFrame(experiment1_columns)

Experiment 2: Flan-T5 for both portions of text data.

In [None]:
# Zero-shot predictions loaded from the "texthalf2" predictions file.
zero_shot_preds_texthalf2 = pd.read_csv('csv_files/predictions-xxl-acutekidneyfailure-document-texthalf2.csv')
texthalf2_prediction = zero_shot_preds_texthalf2['prediction']

# Agreement with the kidney_fail label, mean prediction, and confusion matrix.
print(np.mean(master_data['kidney_fail'] == texthalf2_prediction))
print(np.mean(texthalf2_prediction))
print(create_confusion_matrix(master_data['kidney_fail'], texthalf2_prediction))
print()

# Experiment 2 dataset: both W and Z are zero-shot predictions.
# This rebinds semi_synthetic_data, replacing any frame built earlier.
experiment2_columns = {
    'U': master_data['kidney_fail'],
    'W': zero_shot_preds['prediction'],
    'Z': texthalf2_prediction,
    'age': master_data['age'],
    'gender': master_data['gender'],
}
semi_synthetic_data = pd.DataFrame(experiment2_columns)
semi_synthetic_data.shape[0]

In [None]:
# Odds ratio between U and each proxy, passing ['age', 'gender'] as the third
# argument (presumably the conditioning set -- confirm against odds_ratio).
for proxy in ('W', 'Z'):
    print(odds_ratio('U', proxy, ['age', 'gender'], semi_synthetic_data))

# Raw agreement rate between the two proxies.
print()
print(np.mean(semi_synthetic_data['W'] == semi_synthetic_data['Z']))
print()

# W-Z odds ratio with the demographics and with an empty third argument.
for third_argument in (['age', 'gender'], []):
    print(odds_ratio('W', 'Z', third_argument, semi_synthetic_data))
print()

# W-Z odds ratio when U is included, alone and together with the demographics.
for third_argument in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', third_argument, semi_synthetic_data))

### Synthetic DGP starts here.

In [None]:
# Simulate the treatment A and the outcome Y from the real columns U, gender
# and age, plus an independent normal draw C. The order of the np.random calls
# below (C, then A, then the noise for Y) determines the values drawn after
# the seed, so it must not be rearranged.
np.random.seed(1)

size = len(semi_synthetic_data)
u_column = semi_synthetic_data['U']
gender_column = semi_synthetic_data['gender']
age_column = semi_synthetic_data['age']

print(np.mean(age_column))
print(np.std(age_column))

# NOTE(review): 76.4 and 56.8 are hard-coded; presumably the age mean/std
# printed just above -- confirm.
C = np.random.normal(76.4, 56.8, size)

# Bernoulli treatment whose log-odds are linear in U, gender and (age - 67),
# each with coefficient 0.8.
treatment_probability = expit(0.8*u_column + 0.8*gender_column + 0.8*(age_column - 67))
A = np.random.binomial(1, treatment_probability, size)

print(np.mean(A))

# Outcome: standard normal noise plus a linear function of A, U, gender and
# age. A enters with coefficient 1.3.
outcome_noise = np.random.normal(0, 1, size)
Y = outcome_noise + 1.3*A + 1.4*u_column + 0.8*gender_column + 0.5*age_column

semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

### Use proximal with two proxies that are just random chance.

In [6]:
# Baseline: two fair-coin (p = 0.5) binary columns are used in place of the
# text-based proxies. R1 is drawn before R2 after seeding.
np.random.seed(2)

n_rows = len(semi_synthetic_data)
random_classifier1 = np.random.binomial(1, 0.5, n_rows)
random_classifier2 = np.random.binomial(1, 0.5, n_rows)

semi_synthetic_data['R1'] = random_classifier1
semi_synthetic_data['R2'] = random_classifier2

# Same leading arguments for the point estimate and the confidence interval.
baseline_variables = ('A', 'Y', 'R1', 'R2')
ace = proximal_find_ace(*baseline_variables, ['age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals(*baseline_variables, ['age', 'gender'], semi_synthetic_data)

# Persist (estimate, conf_int[0], conf_int[1]) for later use.
with open('pickle_files/kidney_baseline.pkl', 'wb') as handle:
    baseline_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(baseline_result, handle)

### Use backdoor with Flan-T5 as predictions

In [7]:
# Backdoor estimate passing the zero-shot prediction W together with age and
# gender as the third argument (presumably the adjustment set -- confirm
# against backdoor_adjustment).
ace = backdoor_adjustment('Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals_backdoor(
    'Y', 'A', ['W', 'age', 'gender'], semi_synthetic_data, 'backdoor'
)

# Persist (estimate, conf_int[0], conf_int[1]) for later use.
with open('pickle_files/kidney_backdoor.pkl', 'wb') as handle:
    backdoor_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(backdoor_result, handle)

### Use proximal with Flan-T5 and Lexicon (regex) predictions (Experiment 1).

In [None]:
# Proximal estimate with the zero-shot prediction as W and the regex (lexicon)
# prediction as Z.
#
# Z is set explicitly from regex_preds instead of relying on whichever
# experiment cell last rebound semi_synthetic_data. On a top-to-bottom run the
# Experiment 2 cell replaces Z with the second zero-shot prediction, which
# would make this cell silently repeat the "both proxies Flan-T5" analysis.
# A and Y are simulated from U, gender and age only, so swapping Z here gives
# the same data as initialising with Experiment 1. assign() returns a new
# frame, so semi_synthetic_data itself is left unchanged for later cells.
lexicon_data = semi_synthetic_data.assign(Z=regex_preds)

ace = proximal_find_ace('A', 'Y', 'W', 'Z', ['age', 'gender'], lexicon_data)
conf_int = compute_confidence_intervals("A", "Y", "W", "Z", ['age', 'gender'], lexicon_data)
print(ace)
print(conf_int)

In [None]:
# Persist (estimate, conf_int[0], conf_int[1]) from the most recently computed
# ace / conf_int.
with open('pickle_files/kidney_proximal.pkl', 'wb') as handle:
    proximal_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(proximal_result, handle)

### Use proximal with Flan-T5 predictions for both W and Z.

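Note: Run the appropriate dataset initialization (Experiment 2) above, then re-run the synthetic DGP cell, because re-initializing creates a fresh `semi_synthetic_data` without the `A`, `Y`, and `C` columns.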

In [None]:
# Proximal estimate using the W and Z columns currently stored in
# semi_synthetic_data; the same leading arguments are used for the point
# estimate and the confidence interval.
proxy_variables = ('A', 'Y', 'W', 'Z')
ace = proximal_find_ace(*proxy_variables, ['age', 'gender'], semi_synthetic_data)
conf_int = compute_confidence_intervals(*proxy_variables, ['age', 'gender'], semi_synthetic_data)

for reported_value in (ace, conf_int):
    print(reported_value)

In [13]:
# Persist (estimate, conf_int[0], conf_int[1]) from the most recently computed
# ace / conf_int.
with open('pickle_files/kidney_proximal_both_proxies_flant5.pkl', 'wb') as handle:
    both_flant5_result = (ace, conf_int[0], conf_int[1])
    pickle.dump(both_flant5_result, handle)