In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import re

In [10]:
# load the afib diagnosis and patient notes data
afib_data = pd.read_csv('processed_afib_data.csv')
print(afib_data)
print(np.mean(afib_data['afib']))

       index  patient_id  afib  \
0          0         109     0   
1          2         113     0   
2          3         114     0   
3          4         115     0   
4          6         117     0   
...      ...         ...   ...   
29446  46515       97164     0   
29447  46516       97484     0   
29448  46517       97488     1   
29449  46518       97492     0   
29450  46519       97497     1   

                                                   notes  age  \
0      PATIENT/TEST INFORMATION: Indication: Code. As...   25   
1      Sinus rhythm, rate 93. Non-specific ST-T wave ...   35   
2      Normal sinus rhythm, rate 96 Right bundle bran...   48   
3      PATIENT/TEST INFORMATION: Indication: Left ven...   75   
4      PATIENT/TEST INFORMATION: Indication: Murmur. ...   50   
...                                                  ...  ...   
29446  PATIENT/TEST INFORMATION: Indication: Aortic v...   83   
29447  Sinus bradycardia with non-diagnostic repolari...   79   
29448 

### Bag of Words Linear Logistic Regression

Predicting atrial fibrillation using only first half of the notes.

In [11]:
# use a simple bag of words linear logistic regression to predict if a patient suffered from afib
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(C=1e20)),
])

text_clf.fit(afib_data['notes'], afib_data['afib'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half1'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7676479576245289


In [17]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

20123 1125 5718 2485
sensitivity:  0.3029379495306595
specificity:  0.9470538403614458
precision:  0.6883656509695291
recall:  0.3029379495306595


Predicting atrial fibrillation using only the second half of the notes.

In [18]:
# use a simple bag of words linear logistic regression to predict if a patient suffered from afib
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(C=1e20)),
])

text_clf.fit(afib_data['notes'], afib_data['afib'])

In [19]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half2'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.749889647210621


In [20]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

19743 1505 5861 2342
sensitivity:  0.2855053029379495
specificity:  0.9291698042168675
precision:  0.6087860670652456
recall:  0.2855053029379495


### Now train a BoW model that only uses the first 5,000 most important words in the vocabulary.

Predicting atrial fibrillation using only the first half of the note.

In [None]:
count_vect = CountVectorizer(max_features=5000, ngram_range=(1, 2))

word_count = count_vect.fit_transform(afib_data['notes'])

vocab = str(count_vect.vocabulary_)
vocab = vocab.replace(',', '\n')

print(vocab)

In [22]:
# use a simple bag of words linear logistic regression to predict if a patient suffered from afib, but only considering the 5000 most
# frequently occurring words and also considering both unigrams and bigrams
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(C=1e20)),
])

text_clf.fit(afib_data['notes'], afib_data['afib'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.8383756069403416


In [24]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

19722 1526 3234 4969
sensitivity:  0.6057539924417896
specificity:  0.9281814759036144
precision:  0.765050038491147
recall:  0.6057539924417896


In [25]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half1'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.804454857220468


In [26]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

19794 1454 4305 3898
sensitivity:  0.47519200292575886
specificity:  0.9315700301204819
precision:  0.7283258594917787
recall:  0.47519200292575886


Predicting atrial fibrillation using only the second half of the note.

In [27]:
# make predictions from the original dataset
predicted = text_clf.predict(afib_data['notes_half2'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7762384978438762


In [28]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

20192 1056 5534 2669
sensitivity:  0.32536876752407656
specificity:  0.9503012048192772
precision:  0.7165100671140939
recall:  0.32536876752407656


### Predictions from Regular Expressions

In [29]:
def regular_expression_predict(notes):
    prediction = []

    for note in notes:
        note = note.lower()

        # check if the phrases 'atrial fib' or 'afib' show up, if neither phrase shows up then
        # we make a prediction that the patient did not experience atrial fibrillation, otherwise
        # we predict that the patient did experience atrial fibrillation
        if re.search('atrial fib', note, flags=0) == None and re.search('a-*fib', note, flags=0) == None:
            prediction.append(0)
        else:
            prediction.append(1)

    return prediction

Make predictions based on the entire note.

In [35]:
# make predictions using the regular expression method
predicted = regular_expression_predict(afib_data['notes'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7927065294896608


In [36]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

20968 280 5825 2378
sensitivity:  0.28989394124100937
specificity:  0.9868222891566265
precision:  0.8946576373212942
recall:  0.28989394124100937


Make predictions based on only the first half of the note.

In [31]:
# make predictions using the regular expression method
predicted = regular_expression_predict(afib_data['notes_half1'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7805167906013378


In [32]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

21021 227 6237 1966
sensitivity:  0.23966841399487993
specificity:  0.9893166415662651
precision:  0.8964888280893752
recall:  0.23966841399487993


Make predictions based on only the second half of the note.

In [33]:
# make predictions using the regular expression method
predicted = regular_expression_predict(afib_data['notes_half2'])

# measure the accuracy of the predictions
print(np.mean(afib_data['afib'] == predicted))

0.7449662150690978


In [34]:
# calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(afib_data['afib'], predicted).ravel()
# print the confusion matrix
print(tn, fp, fn, tp)

# calculate the sensitivity and specificty of the data
sensitivity = tp / (tp+fn)
specificity = tn / (fp+tn)
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

# calculate precision and recall
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print('precision: ', precision)
print('recall: ', recall)

21146 102 7409 794
sensitivity:  0.09679385590637572
specificity:  0.9951995481927711
precision:  0.8861607142857143
recall:  0.09679385590637572
