In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DIR = "D:/Workspace/MIMIC DATA/mimic-iii-clinical-database-1.4/"

In [3]:
adm_notes = pd.read_csv(DIR + "readmission.csv", low_memory=False)

# Natural Language

In [4]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [6]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [7]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [8]:
porter = PorterStemmer()

In [9]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

# Model
## Words, Train and Test

In [10]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from xgboost import XGBClassifier

Repartition data

In [11]:
df_train, df_test = train_test_split(adm_notes, test_size=0.3)

Subsample non-readmitted patients to match size of readmitted ones

In [12]:
rows_pos = df_train['READM_WITHIN_30'] == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))], axis = 0)
df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

In [13]:
rows_pos = df_test['READM_WITHIN_30'] == 1
df_test_pos = df_test.loc[rows_pos]
df_test_neg = df_test.loc[~rows_pos]

df_test = pd.concat([df_test_pos, df_test_neg.sample(n = len(df_test_pos))], axis = 0)
df_test = df_test.sample(n = len(df_test)).reset_index(drop = True)

In [14]:
df_test, df_valid = train_test_split(df_test, test_size=0.5)

In [15]:
print(df_train.shape)
print(df_test.shape)
print(df_valid.shape)

(4410, 10)
(916, 10)
(916, 10)


Sparse Matrix with word count

In [16]:
vect = CountVectorizer(lowercase = True, max_features = 3000, tokenizer = tokenize_stem)
vect.fit(df_train['TEXT'].values)

X_train_tf = vect.transform(df_train['TEXT'].values)
X_test_tf = vect.transform(df_test['TEXT'].values)
X_valid_tf = vect.transform(df_valid['TEXT'].values)

y_train = df_train['READM_WITHIN_30']
y_test = df_test['READM_WITHIN_30']
y_valid = df_valid['READM_WITHIN_30']

## Support Vector Machine

In [17]:
grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 2]
}

model_svm = GridSearchCV(SVC(), param_grid=grid)
model_svm.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 2], 'kernel': ['linear', 'poly', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
model_svm.best_params_

{'C': 1, 'kernel': 'rbf'}

In [19]:
model_svm = SVC(kernel='rbf', C=1).fit(X_train_tf, y_train)

In [20]:
y_train_preds = model_svm.predict(X_train_tf)
y_valid_preds = model_svm.predict(X_valid_tf)

In [21]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [22]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.7879818594104309 0.6264399603295698
Accuracy	 0.7879818594104309 0.6255458515283843
Precission	 0.8535634743875279 0.6523929471032746
Recall		 0.6952380952380952 0.5581896551724138


## Multi-layer Perceptron

In [23]:
grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate_init': [0.01,0.001,0.0001]
}

model_mlp = GridSearchCV(MLPClassifier(hidden_layer_sizes=(10, 2)), param_grid=grid)
model_mlp.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(10, 2),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_g

In [24]:
model_mlp.best_params_

{'alpha': 0.1, 'learning_rate_init': 0.001, 'solver': 'lbfgs'}

In [25]:
model_mlp = MLPClassifier(hidden_layer_sizes=(10,2), alpha=0.1, learning_rate_init=0.001, solver='lbfgs').fit(X_train_tf, y_train)

In [26]:
y_train_preds = model_mlp.predict(X_train_tf)
y_valid_preds = model_mlp.predict(X_valid_tf)

In [27]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [28]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.9607709750566893 0.5913850320415014
Accuracy	 0.9607709750566893 0.5906113537117904
Precission	 1.0 0.6098765432098765
Recall		 0.9215419501133787 0.5323275862068966


## Random Forest

In [29]:
grid = {
    'max_depth': [10, 20, 30, 40],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 5],
    'n_estimators': [100, 200, 300]
}
model_rf = GridSearchCV(RandomForestClassifier(bootstrap=False), param_grid=grid)
model_rf.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              r

In [30]:
model_rf.best_params_

{'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [31]:
model_rf = RandomForestClassifier(
    bootstrap=False, max_depth=10, max_features='auto', min_samples_leaf=2, min_samples_split=2, n_estimators=100
).fit(X_train_tf, y_train)

In [32]:
y_train_preds = model_rf.predict(X_train_tf)
y_valid_preds = model_rf.predict(X_valid_tf)

In [33]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [34]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.8827664399092969 0.6196025328043943
Accuracy	 0.882766439909297 0.618995633187773
Precission	 0.9884259259259259 0.6378896882494005
Recall		 0.7746031746031746 0.5732758620689655


## Naive Bayes

In [35]:
grid = {'alpha': [10, 1, 0, 0.1, 0.01, 0.001]}

model_nb = GridSearchCV(MultinomialNB(), param_grid=grid)
model_nb.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [10, 1, 0, 0.1, 0.01, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [36]:
model_nb.best_params_

{'alpha': 1}

In [37]:
model_nb = MultinomialNB(alpha=1).fit(X_train_tf, y_train)

In [38]:
y_train_preds = model_nb.predict(X_train_tf)
y_valid_preds = model_nb.predict(X_valid_tf)

In [39]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [40]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.6410430839002267 0.6230641592920354
Accuracy	 0.6410430839002268 0.6222707423580786
Precission	 0.6613070539419087 0.6460396039603961
Recall		 0.5782312925170068 0.5625


There is no improvement when using the whole Dataset with ComplementNB, same scores on test

## XGBoost

In [41]:
grid = {
    'min_child_weight': [1, 5],
    'gamma': [0.5, 1, 1.5],
    'max_depth': [3, 5],
    'eta': [0.1,0.01,0.001]
}

model_xgb = GridSearchCV(XGBClassifier(), param_grid=grid)
model_xgb.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,

In [42]:
model_xgb.best_params_

{'eta': 0.01, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1}

In [43]:
model_xgb = XGBClassifier(
    min_child_weight=1, gamma=0.5, max_depth=3, eta=0.01
).fit(X_train_tf, y_train)

In [44]:
y_train_preds = model_xgb.predict(X_train_tf)
y_valid_preds = model_xgb.predict(X_valid_tf)

In [45]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [46]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.661904761904762 0.609990082392432
Accuracy	 0.6619047619047619 0.6091703056768559
Precission	 0.683641975308642 0.6318407960199005
Recall		 0.6027210884353742 0.5474137931034483
