### Semi-synthetic data-generating process (DGP) built from atrial fibrillation (afib) labels and proxy predictions from a zero-shot classifier and regex matching.

In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import statsmodels.api as sm
import scipy.stats as stats
from adjustment import *
from fcit import fcit

from proximal import *
from regex_predictor import *
from odds_ratio import *
from backdoor import *

In [5]:
# Load the master table and the stored zero-shot predictions.
afib_data = pd.read_csv('csv_files/master_data.csv')
zero_shot_preds = pd.read_csv('csv_files/predictions-xxl.csv')

# Regex-based predictions are computed on the 'notes_half2' column.
# Original author note: "rhythm, aorta, ventricle" -- presumably other
# keywords that were tried for the single-keyword predictor; TODO confirm.
second_half_notes = afib_data['notes_half2']
regex_preds1 = regular_expression_predict(second_half_notes, ['atrial'])

# Stored regex predictions plus a second keyword set (neither is used further
# down in this part of the notebook).
regex_preds = pd.read_csv('csv_files/predictions-regex.csv')
regex_preds2 = regular_expression_predict(second_half_notes, ['atrial fib', 'a-*fib'])

# For each predictor: agreement rate with the 'afib' column, then the mean
# of the predictions themselves.
afib_labels = afib_data['afib']
zero_shot_labels = zero_shot_preds['prediction']

print((afib_labels == zero_shot_labels).mean())
print(zero_shot_labels.mean())
print()

print((afib_labels == regex_preds1).mean())
print(np.mean(regex_preds1))

FileNotFoundError: [Errno 2] No such file or directory: 'csv_files/predictions-xxl.csv'

In [8]:
# Assemble the semi-synthetic frame column by column:
#   U      <- afib_data['afib']
#   W      <- zero-shot predictions
#   Z      <- regex predictions computed with the single keyword 'atrial'
#   age, gender <- copied from afib_data
# An earlier variant of this cell used regex_preds['prediction'] for Z and
# also carried the 'notes_half1' text column; that column is NOT included here.
semi_synthetic_columns = {
    'U': afib_data['afib'],
    'W': zero_shot_preds['prediction'],
    'Z': regex_preds1,
    'age': afib_data['age'],
    'gender': afib_data['gender'],
}
semi_synthetic_data = pd.DataFrame(semi_synthetic_columns)

### Find 10 false positives and 10 false negatives from the zero-shot predictions.

In [4]:
# Collect up to N_ERROR_EXAMPLES false positives and false negatives of the
# zero-shot predictions and save the corresponding note texts.
#
# false positive: W is 1 but U is 0
# false negative: W is 0 but U is 1
N_ERROR_EXAMPLES = 10

# The note text lives in 'notes_half1'. The current construction of
# semi_synthetic_data does not carry that column, so fall back to afib_data
# (aligned on the frame's index) instead of raising a KeyError.
if 'notes_half1' in semi_synthetic_data.columns:
    first_half_notes = semi_synthetic_data['notes_half1']
else:
    first_half_notes = afib_data['notes_half1'].reindex(semi_synthetic_data.index)

is_false_positive = (semi_synthetic_data['U'] == 0) & (semi_synthetic_data['W'] == 1)
is_false_negative = (semi_synthetic_data['U'] == 1) & (semi_synthetic_data['W'] == 0)

# Boolean masks keep row order, so head() yields the same "first N" rows the
# row-by-row scan would have found.
false_positive = first_half_notes[is_false_positive].head(N_ERROR_EXAMPLES).tolist()
false_negative = first_half_notes[is_false_negative].head(N_ERROR_EXAMPLES).tolist()

# Wrapping each list in a Series lets the two columns differ in length (the
# shorter one is padded with NaN) when fewer than N_ERROR_EXAMPLES errors of
# one kind exist; a dict of plain lists would raise ValueError in that case.
analysis = pd.DataFrame({
    'false_positive': pd.Series(false_positive, dtype=object),
    'false_negative': pd.Series(false_negative, dtype=object),
})
analysis.to_csv('csv_files/predictions_analysis.csv', index=False)

In [5]:
# Print both error listings: a heading, then each note followed by a blank
# line, with one extra blank line separating the two listings.
error_listings = (
    ('false positives:', false_positive),
    ('false negatives:', false_negative),
)

for listing_number, (heading, notes_to_show) in enumerate(error_listings):
    if listing_number > 0:
        print()
    print(heading)
    for note_text in notes_to_show:
        print(note_text, end='\n\n')

false positives:
PATIENT/TEST INFORMATION: Indication: Murmur. Height: (in) 64 Weight (lb): 278 BSA (m2): 2.25 m2 BP (mm Hg): 109/53 HR (bpm): 68 Status: Inpatient Date/Time: [**2133-11-20**] at 14:16 Test: Portable TTE (Complete) Doppler: Full doppler and color doppler Contrast: None Technical Quality: Adequate   INTERPRETATION:  Findings:  LEFT ATRIUM: Mild LA enlargement.  RIGHT ATRIUM/INTERATRIAL SEPTUM: Normal RA size. A catheter or pacing wire is seen in the RA and/or RV.  LEFT VENTRICLE: Normal LV wall thicknesses and cavity size. Mild regional LV systolic dysfunction.  LV WALL MOTION: Regional LV wall motion abnormalities include: basal inferoseptal - hypo; basal inferior - hypo; mid inferior - hypo;  RIGHT VENTRICLE: Normal RV chamber size and free wall motion.  AORTA: Normal aortic root diameter.  AORTIC VALVE: Mildly thickened aortic valve leaflets.  MITRAL VALVE: Mildly thickened mitral valve leaflets. Mild thickening of mitral valve chordae. Mild to moderate ([**1-30**]+) 

### Synthetic DGP starts here.

In [12]:
# Simulate C, A and Y on top of the real U column.
#   C ~ Normal(0, 1)
#   A ~ Bernoulli(expit(0.8*U + C))
#   Y = Normal(0, 1) noise + 1.3*A + 1.4*U + C
# The draws below must stay in this order (C, then A, then the Y noise) so the
# seeded global random stream produces the same values.
np.random.seed(1)

size = semi_synthetic_data.shape[0]
u_column = semi_synthetic_data['U']

C = np.random.normal(loc=0, scale=1, size=size)

treatment_probability = expit(0.8*u_column + C)
A = np.random.binomial(n=1, p=treatment_probability, size=size)

# Share of rows with A == 1.
print(A.mean())

outcome_noise = np.random.normal(loc=0, scale=1, size=size)
Y = outcome_noise + 1.3*A + 1.4*u_column + C

# Attach the simulated columns to the frame (in place).
for column_name, column_values in (('A', A), ('Y', Y), ('C', C)):
    semi_synthetic_data[column_name] = column_values

0.546263284778106


In [13]:
# Odds ratio between U and each of the two proxy columns, with an empty
# conditioning set.
for proxy_column in ('W', 'Z'):
    print(odds_ratio('U', proxy_column, [], semi_synthetic_data))

# Fraction of rows on which the two proxies agree, framed by blank lines.
print()
proxy_agreement = (semi_synthetic_data['W'] == semi_synthetic_data['Z']).mean()
print(proxy_agreement)
print()

# Odds ratio between W and Z given U, then given U plus age and gender.
for conditioning_columns in (['U'], ['U', 'age', 'gender']):
    print(odds_ratio('W', 'Z', conditioning_columns, semi_synthetic_data))

19.611033833634597
2.5434050794001957

0.7233710230552443

0.9878524260838915
1.0166407944379683


In [14]:
# Backdoor estimate for A -> Y adjusting for both U and C, followed by its
# confidence interval. In the simulated outcome equation the coefficient on A
# is 1.3, which is the value to compare against.
oracle_backdoor_estimate = backdoor_adjustment('Y', 'A', ['U', 'C'], semi_synthetic_data)
print(oracle_backdoor_estimate)

oracle_backdoor_interval = compute_confidence_intervals_backdoor(
    'Y', 'A', ['U', 'C'], semi_synthetic_data, 'backdoor'
)
print(oracle_backdoor_interval)

1.2876868354257465
(1.2658983503816448, 1.3084108578605342)


In [15]:
# Proximal estimate for A -> Y with W and Z passed as the two proxy columns
# and C as the only covariate, followed by its confidence interval.
# NOTE(review): presumably U is left out on purpose because the proxies are
# meant to stand in for it -- confirm against proximal.py.
proximal_estimate_given_c = proximal_find_ace('A', 'Y', 'W', 'Z', ['C'], semi_synthetic_data)
print(proximal_estimate_given_c)

proximal_interval_given_c = compute_confidence_intervals(
    "A", "Y", "W", "Z", ['C'], semi_synthetic_data
)
print(proximal_interval_given_c)

1.3059454195741502
(1.2300821583243922, 1.3658271766111074)


In [36]:
# Same proximal call, but with age and gender as covariates instead of the
# simulated column C; the confidence interval follows.
proximal_estimate_given_demographics = proximal_find_ace(
    'A', 'Y', 'W', 'Z', ['age', 'gender'], semi_synthetic_data
)
print(proximal_estimate_given_demographics)

proximal_interval_given_demographics = compute_confidence_intervals(
    "A", "Y", "W", "Z", ['age', 'gender'], semi_synthetic_data
)
print(proximal_interval_given_demographics)

1.179830478620565
(1.038854003479516, 1.2786023058854925)


In [24]:
# Backdoor call with an empty adjustment set, followed by its confidence
# interval. Neither U nor C is adjusted for here, although both enter the
# simulated equations for A and Y.
unadjusted_estimate = backdoor_adjustment('Y', 'A', [], semi_synthetic_data)
print(unadjusted_estimate)

unadjusted_interval = compute_confidence_intervals_backdoor(
    'Y', 'A', [], semi_synthetic_data, 'backdoor'
)
print(unadjusted_interval)

1.540988774116704


(1.5152403034202575, 1.5640290262683836)


In [10]:
# Fraction of rows on which the proxy columns W and Z take the same value.
w_equals_z = semi_synthetic_data['W'] == semi_synthetic_data['Z']
print(w_equals_z.mean())

0.5309836677871719


In [32]:
# Small simulation to sanity-check odds_ratio.
# N is a fair coin; L and P are each drawn from N alone, in separate draws,
# so they are independent of each other once N is fixed.
# The draw order (N, L, P) is kept so the seeded stream yields the same data.
np.random.seed(0)

size = 50000

N = np.random.binomial(n=1, p=0.5, size=size)
L = np.random.binomial(n=1, p=expit(2*N), size=size)
P = np.random.binomial(n=1, p=expit(1.5*N), size=size)

test_df = pd.DataFrame(dict(N=N, L=L, P=P))

# Odds ratio between L and P with N as the conditioning column.
odds_ratio_given_n = odds_ratio('L', 'P', ['N'], test_df)
print(odds_ratio_given_n)

0.9859719562743827
0.0
0.5318568415892917
