In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DIR = "D:/Workspace/MIMIC DATA/mimic-iii-clinical-database-1.4/"

In [3]:
adm_notes = pd.read_csv(DIR + "readmission.csv", low_memory=False)

# Natural Language

In [4]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [6]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [7]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [8]:
porter = PorterStemmer()

In [9]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

# Model
## Words, Train and Test

In [10]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from xgboost import XGBClassifier

Repartition data

In [11]:
df_train, df_test = train_test_split(adm_notes, test_size=0.2)

Subsample non-readmitted patients to match size of readmitted ones

In [12]:
rows_pos = df_train['READM_WITHIN_30'] == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))], axis = 0)
df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

In [13]:
rows_pos = df_test['READM_WITHIN_30'] == 1
df_test_pos = df_test.loc[rows_pos]
df_test_neg = df_test.loc[~rows_pos]

df_test = pd.concat([df_test_pos, df_test_neg.sample(n = len(df_test_pos))], axis = 0)
df_test = df_test.sample(n = len(df_test)).reset_index(drop = True)

In [14]:
print(df_train.shape)
print(df_test.shape)

(5068, 10)
(1174, 10)


Sparse Matrix with word count

In [15]:
vect = CountVectorizer(lowercase = True, max_features = 3000, tokenizer = tokenize_stem)
vect.fit(df_train['TEXT'].values)

X_train_tf = vect.transform(df_train['TEXT'].values)
X_test_tf = vect.transform(df_test['TEXT'].values)

y_train = df_train['READM_WITHIN_30']
y_test = df_test['READM_WITHIN_30']

In [16]:
metrics = ['roc_auc', 'accuracy', 'precision', 'recall']

## Support Vector Machine

In [17]:
grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 2]
}

model_svm = GridSearchCV(SVC(), param_grid=grid)
model_svm.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 2], 'kernel': ['linear', 'poly', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
model_svm.best_params_

{'C': 2, 'kernel': 'rbf'}

In [19]:
model_svm = SVC(C=1, kernel='rbf')
scores_svm = cross_validate(model_svm, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [20]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_svm['train_roc_auc']), '\t', np.std(scores_svm['train_roc_auc']))
print('Accuracy\t', np.mean(scores_svm['train_accuracy']), '\t', np.std(scores_svm['train_accuracy']))
print('Precission\t', np.mean(scores_svm['train_precision']), '\t', np.std(scores_svm['train_precision']))
print('Recall\t\t', np.mean(scores_svm['train_precision']), '\t', np.std(scores_svm['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.8632660420404807 	 0.004986481194719112
Accuracy	 0.7913378514590634 	 0.0017314191907619578
Precission	 0.852648441838733 	 0.0018856600132221612
Recall		 0.852648441838733 	 0.0018856600132221612


In [21]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_svm['test_roc_auc']), '\t', np.std(scores_svm['test_roc_auc']))
print('Accuracy\t', np.mean(scores_svm['test_accuracy']), '\t', np.std(scores_svm['test_accuracy']))
print('Precission\t', np.mean(scores_svm['test_precision']), '\t', np.std(scores_svm['test_precision']))
print('Recall\t\t', np.mean(scores_svm['test_recall']), '\t', np.std(scores_svm['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6799194189410096 	 0.006507489176104674
Accuracy	 0.629243697806231 	 0.006008071663082114
Precission	 0.6471547401149698 	 0.0060891957785869175
Recall		 0.5690717309446407 	 0.029542624178309763


## Multi-layer Perceptron

In [22]:
grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate_init': [0.01,0.001,0.0001]
}

model_mlp = GridSearchCV(MLPClassifier(hidden_layer_sizes=(10, 2)), param_grid=grid)
model_mlp.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(10, 2),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_g

In [23]:
model_mlp.best_params_

{'alpha': 1e-05, 'learning_rate_init': 0.001, 'solver': 'adam'}

In [24]:
model_mlp = MLPClassifier(hidden_layer_sizes=(10,2), alpha=1e-05, learning_rate_init=0.001, solver='adam')
scores_mlp = cross_validate(model_mlp, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [25]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_mlp['train_roc_auc']), '\t', np.std(scores_mlp['train_roc_auc']))
print('Accuracy\t', np.mean(scores_mlp['train_accuracy']), '\t', np.std(scores_mlp['train_accuracy']))
print('Precission\t', np.mean(scores_mlp['train_precision']), '\t', np.std(scores_mlp['train_precision']))
print('Recall\t\t', np.mean(scores_mlp['train_precision']), '\t', np.std(scores_mlp['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.9508956053213081 	 0.05794534834095564
Accuracy	 0.9431752597638416 	 0.05203754365711709
Precission	 0.970551114850652 	 0.058897770298695834
Recall		 0.970551114850652 	 0.058897770298695834


In [26]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_mlp['test_roc_auc']), '\t', np.std(scores_mlp['test_roc_auc']))
print('Accuracy\t', np.mean(scores_mlp['test_accuracy']), '\t', np.std(scores_mlp['test_accuracy']))
print('Precission\t', np.mean(scores_mlp['test_precision']), '\t', np.std(scores_mlp['test_precision']))
print('Recall\t\t', np.mean(scores_mlp['test_recall']), '\t', np.std(scores_mlp['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6210032900480349 	 0.01415120926816873
Accuracy	 0.5913555728196171 	 0.01733559743474923
Precission	 0.5922297767228827 	 0.020006706166102454
Recall		 0.5939518675304629 	 0.07523969916347847


## Random Forest

In [27]:
grid = {
    'max_depth': [10, 20, 30, 40],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 5],
    'n_estimators': [100, 200, 300]
}
model_rf = GridSearchCV(RandomForestClassifier(bootstrap=False), param_grid=grid)
model_rf.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              r

In [28]:
model_rf.best_params_

{'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [29]:
model_rf = RandomForestClassifier(
    bootstrap=False, max_depth=20, max_features='auto', min_samples_leaf=2, min_samples_split=2, n_estimators=100
)
scores_rf = cross_validate(model_rf, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [30]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_rf['train_roc_auc']), '\t', np.std(scores_rf['train_roc_auc']))
print('Accuracy\t', np.mean(scores_rf['train_accuracy']), '\t', np.std(scores_rf['train_accuracy']))
print('Precission\t', np.mean(scores_rf['train_precision']), '\t', np.std(scores_rf['train_precision']))
print('Recall\t\t', np.mean(scores_rf['train_precision']), '\t', np.std(scores_rf['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.998641436612884 	 0.0001963476486064476
Accuracy	 0.9761244408865032 	 0.0027372607445500037
Precission	 0.9963324220509492 	 0.0037667293652973123
Recall		 0.9963324220509492 	 0.0037667293652973123


In [31]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_rf['test_roc_auc']), '\t', np.std(scores_rf['test_roc_auc']))
print('Accuracy\t', np.mean(scores_rf['test_accuracy']), '\t', np.std(scores_rf['test_accuracy']))
print('Precission\t', np.mean(scores_rf['test_precision']), '\t', np.std(scores_rf['test_precision']))
print('Recall\t\t', np.mean(scores_rf['test_recall']), '\t', np.std(scores_rf['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.680742707473823 	 0.00975732833577272
Accuracy	 0.6274683551697752 	 0.005915017189688171
Precission	 0.6256722980406854 	 0.008305525186672153
Recall		 0.6357571079979107 	 0.02535155229279958


## Naive Bayes

In [32]:
grid = {'alpha': [10, 1, 0, 0.1, 0.01, 0.001]}

model_nb = GridSearchCV(MultinomialNB(), param_grid=grid)
model_nb.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [10, 1, 0, 0.1, 0.01, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [33]:
model_nb.best_params_

{'alpha': 0}

In [34]:
model_nb = MultinomialNB(alpha=0)
scores_nb = cross_validate(model_nb, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [35]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_nb['train_roc_auc']), '\t', np.std(scores_nb['train_roc_auc']))
print('Accuracy\t', np.mean(scores_nb['train_accuracy']), '\t', np.std(scores_nb['train_accuracy']))
print('Precission\t', np.mean(scores_nb['train_precision']), '\t', np.std(scores_nb['train_precision']))
print('Recall\t\t', np.mean(scores_nb['train_precision']), '\t', np.std(scores_nb['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.69218452127824 	 0.0022113893959022747
Accuracy	 0.6444357766940386 	 0.004042200310114369
Precission	 0.6632772624655777 	 0.007331143228882922
Recall		 0.6632772624655777 	 0.007331143228882922


In [36]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_nb['test_roc_auc']), '\t', np.std(scores_nb['test_roc_auc']))
print('Accuracy\t', np.mean(scores_nb['test_accuracy']), '\t', np.std(scores_nb['test_accuracy']))
print('Precission\t', np.mean(scores_nb['test_precision']), '\t', np.std(scores_nb['test_precision']))
print('Recall\t\t', np.mean(scores_nb['test_recall']), '\t', np.std(scores_nb['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6572858257734869 	 0.009643894332157312
Accuracy	 0.6219423626971656 	 0.002980712106059144
Precission	 0.6366197187924656 	 0.004389079888898405
Recall		 0.5686733556298773 	 0.019751579133129315


There is no improvement when using the whole Dataset with ComplementNB, same scores on test

## XGBoost

In [37]:
grid = {
    'min_child_weight': [1, 5],
    'gamma': [0.5, 1, 1.5],
    'max_depth': [3, 5],
    'eta': [0.1,0.01,0.001]
}

model_xgb = GridSearchCV(XGBClassifier(), param_grid=grid)
model_xgb.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,

In [38]:
model_xgb.best_params_

{'eta': 0.01, 'gamma': 1.5, 'max_depth': 5, 'min_child_weight': 1}

In [42]:
model_xgb = XGBClassifier(
    min_child_weight=1, gamma=1.5, max_depth=5, eta=0.01
)
scores_xgb = cross_validate(model_xgb, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [43]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_xgb['train_roc_auc']), '\t', np.std(scores_xgb['train_roc_auc']))
print('Accuracy\t', np.mean(scores_xgb['train_accuracy']), '\t', np.std(scores_xgb['train_accuracy']))
print('Precission\t', np.mean(scores_xgb['train_precision']), '\t', np.std(scores_xgb['train_precision']))
print('Recall\t\t', np.mean(scores_xgb['train_precision']), '\t', np.std(scores_xgb['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.8534756050665276 	 0.004636668297109635
Accuracy	 0.7695833984732621 	 0.0031830893233050373
Precission	 0.8083870675352307 	 0.015385041592389124
Recall		 0.8083870675352307 	 0.015385041592389124


In [44]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_xgb['test_roc_auc']), '\t', np.std(scores_xgb['test_roc_auc']))
print('Accuracy\t', np.mean(scores_xgb['test_accuracy']), '\t', np.std(scores_xgb['test_accuracy']))
print('Precission\t', np.mean(scores_xgb['test_precision']), '\t', np.std(scores_xgb['test_precision']))
print('Recall\t\t', np.mean(scores_xgb['test_recall']), '\t', np.std(scores_xgb['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6637300262817273 	 0.009254643796937403
Accuracy	 0.6223356717699493 	 0.005005507261220585
Precission	 0.6324251428083065 	 0.009856906144324181
Recall		 0.5864185981242838 	 0.03645122148202842
