In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import pandas as pd
import numpy as np
import re

from bag_of_words import *
from regex_predictor import *
from odds_ratio import *
from proximal import *

In [2]:
# load the afib diagnosis and patient notes data
afib_data = pd.read_csv('csv_files/processed_afib_data.csv')
print(afib_data)
print(np.mean(afib_data['afib']))

       patient_id  afib                                              notes  \
0             109     0  PATIENT/TEST INFORMATION: Indication: Code. As...   
1             113     0  Sinus rhythm, rate 93. Non-specific ST-T wave ...   
2             114     0  Normal sinus rhythm, rate 96 Right bundle bran...   
3             115     0  PATIENT/TEST INFORMATION: Indication: Left ven...   
4             117     0  PATIENT/TEST INFORMATION: Indication: Murmur. ...   
...           ...   ...                                                ...   
29446       97164     0  PATIENT/TEST INFORMATION: Indication: Aortic v...   
29447       97484     0  Sinus bradycardia with non-diagnostic repolari...   
29448       97488     1  PATIENT/TEST INFORMATION: Indication: Stroke  ...   
29449       97492     0  PATIENT/TEST INFORMATION: Indication: Cerebrov...   
29450       97497     1  PATIENT/TEST INFORMATION: Indication: Left ven...   

       age                                        notes_half1  

### Bag of Words Linear Logistic Regression

Predicting atrial fibrillation using only first half of the notes.

In [4]:
# use a simple bag of words linear logistic regression to predict if a patient suffered from afib
text_clf = create_bow_model()

text_clf.fit(afib_data['notes'], afib_data['afib'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half1'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.804692540151438


In [6]:
create_confusion_matrix(afib_data['afib'], predicted)

<function bag_of_words.create_confusion_matrix(truth, predictions)>

Predicting atrial fibrillation using only the second half of the notes.

In [19]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half2'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.749889647210621


In [20]:
create_confusion_matrix(afib_data['afib'], predicted)

19743 1505 5861 2342
sensitivity:  0.2855053029379495
specificity:  0.9291698042168675
precision:  0.6087860670652456
recall:  0.2855053029379495


### Now train a BoW model that only uses the first 5,000 most important words in the vocabulary.

Predicting atrial fibrillation using only the first half of the note.

In [3]:
# use a simple bag of words linear logistic regression to predict if a patient suffered from afib, but only considering the 5000 most
# frequently occurring words and also considering both unigrams and bigrams
text_clf = create_bow_model(max_words=5000, ngram_range=(1,2))

text_clf.fit(afib_data['notes'], afib_data['afib'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.8383756069403416


In [11]:
create_confusion_matrix(afib_data['afib'], predicted)

{'tn': 19722,
 'fp': 1526,
 'fn': 3234,
 'tp': 4969,
 'sensitivity': 0.6057539924417896,
 'specificity': 0.9281814759036144,
 'precision': 0.765050038491147,
 'recall': 0.6057539924417896}

In [6]:
# make predictions from the original dataset
predicted_llr = text_clf.predict(afib_data['notes_half1'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted_llr))

0.804454857220468


In [26]:
create_confusion_matrix(afib_data['data'], predicted_llr)

19794 1454 4305 3898
sensitivity:  0.47519200292575886
specificity:  0.9315700301204819
precision:  0.7283258594917787
recall:  0.47519200292575886


In [13]:
coef = text_clf['lr'].coef_[0]
print(coef)

words = text_clf['vect'].get_feature_names_out()
print(words)

sorted({w:c for w, c in zip(words, coef)}.items(), key=lambda kv: -kv[1])

[ 2.40863331  2.19556888  0.07300466 ...  1.03502702 -1.26516494
  1.78047892]
['00' '00 m2' '00 test' ... 'yellow urine' 'yet' 'yo']


[('indication atrial', 34.06025064396608),
 ('fibrillation left', 17.715982607199074),
 ('fibrillation height', 16.909335784429054),
 ('fibrillation conclusions', 13.474307777710788),
 ('iabp', 11.297833562004442),
 ('fibrillation', 11.114851991408095),
 ('atrial fibrillation', 10.351406483101856),
 ('fibrillation flutter', 9.950527523063501),
 ('time 2171', 9.572353563332705),
 ('septum markedly', 9.34009003013923),
 ('function weight', 9.116713872956641),
 ('on vent', 8.872017300454564),
 ('40 test', 8.277070786563131),
 ('aneurysmal', 7.709572565843064),
 ('pacing', 7.583627536702008),
 ('be atrial', 7.539055759291496),
 ('21 test', 7.4777534881245815),
 ('distal septal', 7.397728957473057),
 ('output no', 7.235255670615974),
 ('severe tricuspid', 7.0506561321454875),
 ('hypertension no', 7.029467125259486),
 ('flutter', 6.888479276315469),
 ('prolapse there', 6.64223832046163),
 ('left and', 6.620432702131418),
 ('comments the', 6.609494485108033),
 ('septum moderately', 6.58102352

Predicting atrial fibrillation using only the second half of the note.

In [27]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half2'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7762384978438762


In [28]:
create_confusion_matrix(afib_data['data'], predicted)

20192 1056 5534 2669
sensitivity:  0.32536876752407656
specificity:  0.9503012048192772
precision:  0.7165100671140939
recall:  0.32536876752407656


### Predictions from Regular Expressions

Make predictions based on the entire note.

In [15]:
# make predictions using the regular expression method
predicted = regular_expression_predict(afib_data['notes'], ['atrial fib', 'a-*fib'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7927065294896608


In [16]:
create_confusion_matrix(afib_data['afib'], predicted)

{'tn': 20968,
 'fp': 280,
 'fn': 5825,
 'tp': 2378,
 'sensitivity': 0.28989394124100937,
 'specificity': 0.9868222891566265,
 'precision': 0.8946576373212942,
 'recall': 0.28989394124100937}

Make predictions based on only the first half of the note.

In [17]:
# make predictions using the regular expression method
predicted = regular_expression_predict(afib_data['notes_half1'], ['atrial fib', 'a-*fib'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7805167906013378


In [18]:
create_confusion_matrix(afib_data['afib'], predicted)

{'tn': 21021,
 'fp': 227,
 'fn': 6237,
 'tp': 1966,
 'sensitivity': 0.23966841399487993,
 'specificity': 0.9893166415662651,
 'precision': 0.8964888280893752,
 'recall': 0.23966841399487993}

Make predictions based on only the second half of the note.

In [8]:
# make predictions using the regular expression method
predicted_regex = regular_expression_predict(afib_data['notes_half2'], ['atrial fib', 'a-*fib'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted_regex))

# predictions = pd.DataFrame({'prediction': predicted})
# predictions.to_csv('csv_files/predictions-regex.csv', index=False)

0.7449662150690978


In [20]:
create_confusion_matrix(predicted_regex, afib_data['afib'])

{'tn': 21146,
 'fp': 7409,
 'fn': 102,
 'tp': 794,
 'sensitivity': 0.8861607142857143,
 'specificity': 0.7405358080896516,
 'precision': 0.09679385590637572,
 'recall': 0.8861607142857143}

In [9]:
# make predictions using the regular expression method
predicted_regex_2 = regular_expression_predict(afib_data['notes_half2'], ['aorta'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted_regex_2))

0.6705375029710366


In [10]:
afib_data['predicted_llr'] = predicted_llr
afib_data['predicted_regex'] = predicted_regex
afib_data['predicted_regex_2'] = predicted_regex_2
print(np.mean(afib_data['predicted_regex'] == afib_data['predicted_regex_2']))

print(odds_ratio('predicted_regex_2', 'predicted_regex', ['afib', 'age', 'gender'], afib_data))

0.7524022953380191
1.010147405978035
