### Build a semi-synthetic DGP from afib labels and predictions from a zero-shot classifier and regex matching.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit
from matplotlib import pyplot as plt
import pickle

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *
from bag_of_words import *
from cohens_kappa import *

In [2]:
# Load the master table and the zero-shot classifier's predictions.
afib_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl.csv')

# Regex-based predictions computed on the second half of each note.
# Other keywords noted by the author (presumably alternatives): rhythm, aorta, ventricle.
regex_preds1 = regular_expression_predict(afib_data['notes_half2'], ['atrial'])

# For each predictor, report: accuracy against the afib label, mean of the
# predictions, the confusion matrix, and Cohen's kappa, followed by a blank line.
true_afib = afib_data['afib']
for candidate_preds in (zero_shot_preds['prediction'], regex_preds1):
    print(np.mean(true_afib == candidate_preds))
    print(np.mean(candidate_preds))
    print(create_confusion_matrix(true_afib, candidate_preds))
    print(cohens_kappa(true_afib, candidate_preds))
    print()

# Agreement between the two predictors themselves.
print(cohens_kappa(regex_preds1, zero_shot_preds['prediction']))

0.7661199959254354
0.06183151675664663
{'tn': 20995, 'fp': 253, 'fn': 6635, 'tp': 1568, 'sensitivity': 0.19114957942216262, 'specificity': 0.9880929969879518, 'precision': 0.8610653487095002, 'recall': 0.19114957942216262}
0.23548148507098576

0.6832365624257241
0.26056840175206275
{'tn': 16848, 'fp': 4400, 'fn': 4929, 'tp': 3274, 'sensitivity': 0.399122272339388, 'specificity': 0.7929216867469879, 'precision': 0.42663539223351576, 'recall': 0.399122272339388}
0.19592245378299625

0.0466896114541759


In [3]:
# Assemble the analysis frame:
#   U = afib label, W = zero-shot prediction, Z = regex prediction,
#   plus the age and gender columns.
# Note: the note text ('notes_half1') is not carried into this frame.
analysis_columns = {
    'U': afib_data['afib'],
    'W': zero_shot_preds['prediction'],
    'Z': regex_preds1,
    'age': afib_data['age'],
    'gender': afib_data['gender'],
}
semi_synthetic_data = pd.DataFrame(analysis_columns)

# Number of rows, shown as the cell output.
semi_synthetic_data.shape[0]

29451

### Find 10 false positives and 10 false negatives from the zero-shot predictions.

In [4]:
# Collect up to 10 false positives and up to 10 false negatives of the
# zero-shot predictions (W) against the afib label (U), keeping the note text.
#   false positive: W == 1 while U == 0
#   false negative: W == 0 while U == 1
MAX_EXAMPLES = 10

# `semi_synthetic_data` is built without a 'notes_half1' column, so indexing
# its rows by that name raises KeyError. Read the note text from `afib_data`
# instead and line it up with the frame by index label.
# NOTE(review): assumes afib_data and semi_synthetic_data share index labels
# (both come from the same CSV rows) -- confirm.
note_text = afib_data['notes_half1'].reindex(semi_synthetic_data.index)

is_false_positive = (semi_synthetic_data['U'] == 0) & (semi_synthetic_data['W'] == 1)
is_false_negative = (semi_synthetic_data['U'] == 1) & (semi_synthetic_data['W'] == 0)

# head() keeps the first matches in row order, the same examples a
# row-by-row scan would pick.
false_positive = note_text[is_false_positive].head(MAX_EXAMPLES).tolist()
false_negative = note_text[is_false_negative].head(MAX_EXAMPLES).tolist()

# Wrapping each list in a Series lets pandas pad the shorter column with NaN
# when fewer than MAX_EXAMPLES of one kind exist; plain lists of unequal
# length would make the DataFrame constructor raise ValueError.
analysis = pd.DataFrame({
    'false_positive': pd.Series(false_positive, dtype=object),
    'false_negative': pd.Series(false_negative, dtype=object),
})
analysis.to_csv('csv_files/predictions_analysis.csv', index=False)

In [5]:
def _show_notes(heading, notes):
    """Print a heading, then every note followed by one empty line."""
    print(heading)
    for note in notes:
        print(note, end='\n\n')


_show_notes('false positives:', false_positive)
print()
_show_notes('false negatives:', false_negative)

false positives:
PATIENT/TEST INFORMATION: Indication: Murmur. Height: (in) 64 Weight (lb): 278 BSA (m2): 2.25 m2 BP (mm Hg): 109/53 HR (bpm): 68 Status: Inpatient Date/Time: [**2133-11-20**] at 14:16 Test: Portable TTE (Complete) Doppler: Full doppler and color doppler Contrast: None Technical Quality: Adequate   INTERPRETATION:  Findings:  LEFT ATRIUM: Mild LA enlargement.  RIGHT ATRIUM/INTERATRIAL SEPTUM: Normal RA size. A catheter or pacing wire is seen in the RA and/or RV.  LEFT VENTRICLE: Normal LV wall thicknesses and cavity size. Mild regional LV systolic dysfunction.  LV WALL MOTION: Regional LV wall motion abnormalities include: basal inferoseptal - hypo; basal inferior - hypo; mid inferior - hypo;  RIGHT VENTRICLE: Normal RV chamber size and free wall motion.  AORTA: Normal aortic root diameter.  AORTIC VALVE: Mildly thickened aortic valve leaflets.  MITRAL VALVE: Mildly thickened mitral valve leaflets. Mild thickening of mitral valve chordae. Mild to moderate ([**1-30**]+) 

### Synthetic DGP starts here.

In [4]:
# Generate the treatment A and the outcome Y as functions of U, gender and age
# (no Z is generated in this cell), plus an extra column C.
# The global NumPy RNG is seeded once, so the order of the draws below
# (C, then A, then Y) determines the simulated values.
np.random.seed(1)

size = len(semi_synthetic_data)

# Empirical mean and standard deviation of the age column.
print(np.mean(semi_synthetic_data['age']))
print(np.std(semi_synthetic_data['age']))

# C is drawn from a normal with hard-coded mean 76.4 and sd 56.8 and does not
# enter A or Y below.
# NOTE(review): the constants presumably mirror the age mean/std printed above -- confirm.
C = np.random.normal(76.4, 56.8, size)

# Binary treatment: Bernoulli draw whose logit is
# 0.8*U + 0.8*gender + 0.8*(age - 67).
A = np.random.binomial(1, expit(0.8*semi_synthetic_data['U'] + 0.8*semi_synthetic_data['gender'] + 0.8*(semi_synthetic_data['age'] - 67)), size)

# Fraction of rows with A == 1.
print(np.mean(A))

# Outcome: standard-normal noise plus a linear combination of A, U, gender and
# age; A enters with coefficient 1.3.
Y = np.random.normal(0, 1, size) + 1.3*A + 1.4*semi_synthetic_data['U'] + 0.8*semi_synthetic_data['gender'] + 0.5*semi_synthetic_data['age']

# Attach the simulated columns to the frame (mutates semi_synthetic_data in place).
semi_synthetic_data['A'] = A
semi_synthetic_data['Y'] = Y
semi_synthetic_data['C'] = C

76.41380598281891
56.78193469093504
0.5121388068316866


In [7]:
# Odds ratio between the afib label U and each proxy, with no conditioning set.
for proxy_name in ('W', 'Z'):
    print(odds_ratio('U', proxy_name, [], semi_synthetic_data))

print()

# Raw agreement and Cohen's kappa between the two proxies.
w_values = semi_synthetic_data['W']
z_values = semi_synthetic_data['Z']
print(np.mean(w_values == z_values))
print(cohens_kappa(w_values, z_values))
print()

# W-Z odds ratio without U in the conditioning set ...
for conditioning_set in (['age', 'gender'], []):
    print(odds_ratio('W', 'Z', conditioning_set, semi_synthetic_data))
print()

# ... and with U in the conditioning set.
for conditioning_set in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', conditioning_set, semi_synthetic_data))

19.611033833634597
2.5434050794001957

0.7233710230552443
0.0466896114541759

1.7189096474305383
1.731799725993281

0.9878524260838915
1.0166407944379683


In [8]:
# Backdoor estimate and its confidence interval when adjusting for the afib
# label U itself together with gender and age.
oracle_covariates = ('U', 'gender', 'age')

oracle_ace = backdoor_adjustment('Y', 'A', list(oracle_covariates), semi_synthetic_data)
print(oracle_ace)

oracle_interval = compute_confidence_intervals_backdoor(
    'Y', 'A', list(oracle_covariates), semi_synthetic_data, 'backdoor'
)
print(oracle_interval)

1.2908561122132767
(1.2619049658272838, 1.3140094593137908, [1.2898160610155713, 1.3070971509143234, 1.2760356722822053, 1.2969014058025081, 1.2898870000599914, 1.2619185236445176, 1.261376210955163, 1.2935437380669015, 1.277046570485581, 1.2728197185804504, 1.2897100432461812, 1.283228819355024, 1.2881935315055273, 1.2887013776327052, 1.3254043809661766, 1.2766472984074184, 1.3012550030210406, 1.2916895708891545, 1.3138176246275464, 1.2668791204091718, 1.3086104156959948, 1.2793800986685042, 1.3060046770328384, 1.307098454287356, 1.2733187095636893, 1.2971598961115234, 1.294419415077975, 1.282164430690429, 1.3024996811306053, 1.2848092848895973, 1.3071186743765892, 1.2845245567085826, 1.2767986723660698, 1.2781926384816842, 1.2818735726354475, 1.2950361961503987, 1.2830638271876964, 1.2914764850694738, 1.3084875886073064, 1.2830698530266105, 1.2844177750808115, 1.2744743599245325, 1.2894065021392294, 1.2878264894928364, 1.2793739888314235, 1.3124229783031112, 1.2631214875358125, 1.282

### Use proximal with two proxies that are just random chance.

In [8]:
# Baseline: run the proximal estimator with two coin-flip "proxies".
np.random.seed(2)

n_rows = len(semi_synthetic_data)

# Two successive Bernoulli(0.5) draws; the first becomes R1, the second R2.
random_classifier1, random_classifier2 = (
    np.random.binomial(1, 0.5, n_rows) for _ in range(2)
)

semi_synthetic_data['R1'] = random_classifier1
semi_synthetic_data['R2'] = random_classifier2

baseline_covariates = ('age', 'gender')
ace = proximal_find_ace('A', 'Y', 'R1', 'R2', list(baseline_covariates), semi_synthetic_data)
conf_int = compute_confidence_intervals('A', 'Y', 'R1', 'R2', list(baseline_covariates), semi_synthetic_data)

# Persist (estimate, first CI entry, second CI entry).
baseline_result = (ace, conf_int[0], conf_int[1])
with open('pickle_files/afib_baseline.pkl', 'wb') as out_file:
    pickle.dump(baseline_result, out_file)

### Use backdoor with Flan-T5 as the predictions.

In [9]:
# Backdoor adjustment that uses the zero-shot prediction W as a covariate,
# together with age and gender.
proxy_covariates = ('W', 'age', 'gender')

ace = backdoor_adjustment('Y', 'A', list(proxy_covariates), semi_synthetic_data)
conf_int = compute_confidence_intervals_backdoor(
    'Y', 'A', list(proxy_covariates), semi_synthetic_data, 'backdoor'
)

# Persist (estimate, first CI entry, second CI entry).
backdoor_result = (ace, conf_int[0], conf_int[1])
with open('pickle_files/afib_backdoor.pkl', 'wb') as out_file:
    pickle.dump(backdoor_result, out_file)

### Use proximal with Flan-T5 and Lexicon predictions.

In [11]:
# Proximal estimate with W and Z as the two proxies, adjusting for age and gender.
proximal_covariates = ('age', 'gender')

ace = proximal_find_ace('A', 'Y', 'W', 'Z', list(proximal_covariates), semi_synthetic_data)
conf_int = compute_confidence_intervals('A', 'Y', 'W', 'Z', list(proximal_covariates), semi_synthetic_data)

for reported_value in (ace, conf_int):
    print(reported_value)

1.3261855439763253
(1.214280675397115, 1.416181220823031)


In [None]:
# Persist whatever `ace` / `conf_int` currently hold as
# (estimate, first CI entry, second CI entry).
proximal_result = (ace, conf_int[0], conf_int[1])
with open('pickle_files/afib_proximal.pkl', 'wb') as out_file:
    pickle.dump(proximal_result, out_file)

In [24]:
# Backdoor estimate and confidence interval with an empty adjustment set.
unadjusted_ace = backdoor_adjustment('Y', 'A', [], semi_synthetic_data)
print(unadjusted_ace)

unadjusted_interval = compute_confidence_intervals_backdoor('Y', 'A', [], semi_synthetic_data, 'backdoor')
print(unadjusted_interval)

1.540988774116704


(1.5152403034202575, 1.5640290262683836)


In [10]:
# Fraction of rows on which the two proxies W and Z agree.
proxies_agree = semi_synthetic_data['W'] == semi_synthetic_data['Z']
print(np.mean(proxies_agree))

0.5309836677871719


In [32]:
# Toy check: L and P are each drawn as a function of N only, then the L-P odds
# ratio is computed with N in the conditioning set.
np.random.seed(0)

size = 50000

N = np.random.binomial(1, 0.5, size)

# Drawn in order: L with logit 2*N first, then P with logit 1.5*N.
L, P = (np.random.binomial(1, expit(slope * N), size) for slope in (2, 1.5))

test_df = pd.DataFrame(dict(N=N, L=L, P=P))

conditional_or = odds_ratio('L', 'P', ['N'], test_df)
print(conditional_or)

0.9859719562743827
0.0
0.5318568415892917
