In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import pandas as pd
import numpy as np
import re

from bag_of_words import *
from regex_predictor import *
from odds_ratio import *
from proximal import *

In [24]:
master_data = pd.read_csv('csv_files/master_data.csv')

### Bag of words linear regression to see if text data carries information about kidney failure.

In [54]:
text_clf = create_bow_model(max_words=25000, ngram_range=(1, 2))

text_clf.fit(master_data['notes'], master_data['kidney_fail'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
print('majority class', 1-np.mean(master_data['kidney_fail']))

majority class 0.7958303622966962


In [53]:
predicted = text_clf.predict(master_data['notes'])

print(np.mean(predicted == master_data['kidney_fail']))

0.8756578723982208


In [49]:
create_confusion_matrix(master_data['kidney_fail'], predicted)

{'tn': 22154,
 'fp': 1284,
 'fn': 2443,
 'tp': 3570,
 'sensitivity': 0.5937136204889406,
 'specificity': 0.9452171687004011,
 'precision': 0.7354758961681088,
 'recall': 0.5937136204889406}

In [50]:
coef = text_clf['lr'].coef_[0]
print(coef)

words = text_clf['vect'].get_feature_names_out()
print(words)

sorted({w:c for w, c in zip(words, coef)}.items(), key=lambda kv: -kv[1])

[ 0.25481643 -1.38274298 -1.4613475  ... -0.13268737 -0.63930574
  1.36125577]
['00' '00 m2' '00 test' ... 'zofran' 'zofran with' 'zosyn']


[('severe resting', 14.18079834019926),
 ('but probably', 13.274504486175031),
 ('renal', 12.460282368507972),
 ('reversed', 12.411476759708977),
 ('size moderate', 12.361445011356592),
 ('27 left', 11.972686130921241),
 ('effusion is', 11.860669058854816),
 ('function hypotension', 11.821448020696677),
 ('tachycardia probable', 11.8142198554747),
 ('2169 10', 11.771615699454372),
 ('sepsis', 11.379148696850216),
 ('moderate 10', 11.265835221416005),
 ('consider myocardial', 11.210142012130051),
 ('delay', 11.159863898010174),
 ('more leftward', 11.11108706340334),
 ('conduction defect', 10.900519519582613),
 ('10 13', 10.83201386490612),
 ('hypoxia', 10.776756716279472),
 ('interval low', 10.730059185941956),
 ('valve abscess', 10.715770664360225),
 ('renal failure', 10.631722765237772),
 ('impression severe', 10.62080854711739),
 ('voltage throughout', 10.614731884337546),
 ('failure', 10.594155751503619),
 ('24 there', 10.55065867384705),
 ('leads with', 10.517374186702327),
 ('2167

### Try out regex predictors.

In [125]:
# doppler, *pressure (high blood pressure causes kidney failure), **valve (but may not be correlated enough, has 50% accuracy)
regex_predicted = regular_expression_predict(master_data['notes_half2'], ['pressure'])

print(np.mean(regex_predicted))

print(np.mean(regex_predicted == master_data['kidney_fail']))

create_confusion_matrix(master_data['kidney_fail'], regex_predicted)

0.2072595158059149
0.7135241587721979


{'tn': 19174,
 'fp': 4264,
 'fn': 4173,
 'tp': 1840,
 'sensitivity': 0.30600365873939794,
 'specificity': 0.818073214438092,
 'precision': 0.30144167758846657,
 'recall': 0.30600365873939794}