In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DIR = "D:/Workspace/MIMIC DATA/mimic-iii-clinical-database-1.4/"

In [3]:
adm_notes = pd.read_csv(DIR + "readmission.csv", low_memory=False)

# Natural Language

In [4]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [6]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [7]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [8]:
porter = PorterStemmer()

In [9]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

# Model
## Words, Train and Test

In [10]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from xgboost import XGBClassifier

Repartition data

In [11]:
df_train, df_test = train_test_split(adm_notes, test_size=0.2)

Subsample non-readmitted patients to match size of readmitted ones

In [12]:
rows_pos = df_train['READM_WITHIN_30'] == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))], axis = 0)
df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

In [13]:
rows_pos = df_test['READM_WITHIN_30'] == 1
df_test_pos = df_test.loc[rows_pos]
df_test_neg = df_test.loc[~rows_pos]

df_test = pd.concat([df_test_pos, df_test_neg.sample(n = len(df_test_pos))], axis = 0)
df_test = df_test.sample(n = len(df_test)).reset_index(drop = True)

In [14]:
print(df_train.shape)
print(df_test.shape)

(5072, 10)
(1170, 10)


Sparse Matrix with word count

In [15]:
vect = CountVectorizer(lowercase = True, max_features = 3000, tokenizer = tokenize_stem)
vect.fit(df_train['TEXT'].values)

X_train_tf = vect.transform(df_train['TEXT'].values)
X_test_tf = vect.transform(df_test['TEXT'].values)

y_train = df_train['READM_WITHIN_30']
y_test = df_test['READM_WITHIN_30']

In [16]:
metrics = ['roc_auc', 'accuracy', 'precision', 'recall']

## Support Vector Machine

In [17]:
grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    #'C': np.logspace(-3, 3, 7),
    'gamma': np.logspace(-3, 3, 7)
}

model_svm = GridSearchCV(SVC(), param_grid=grid)
model_svm.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ['linear', 'poly', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
model_svm.best_params_

{'C': 1.0, 'gamma': 0.001, 'kernel': 'rbf'}

In [28]:
model_svm = SVC(C=0.3, gamma=0.001, kernel='rbf')
scores_svm = cross_validate(model_svm, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [29]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_svm['train_roc_auc']), '\t', np.std(scores_svm['train_roc_auc']))
print('Accuracy\t', np.mean(scores_svm['train_accuracy']), '\t', np.std(scores_svm['train_accuracy']))
print('Precission\t', np.mean(scores_svm['train_precision']), '\t', np.std(scores_svm['train_precision']))
print('Recall\t\t', np.mean(scores_svm['train_precision']), '\t', np.std(scores_svm['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.8145104008788049 	 0.004275818494146444
Accuracy	 0.6781350963166208 	 0.004081143515638702
Precission	 0.6542063669206837 	 0.004340858030969219
Recall		 0.6542063669206837 	 0.004340858030969219


In [30]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_svm['test_roc_auc']), '\t', np.std(scores_svm['test_roc_auc']))
print('Accuracy\t', np.mean(scores_svm['test_accuracy']), '\t', np.std(scores_svm['test_accuracy']))
print('Precission\t', np.mean(scores_svm['test_precision']), '\t', np.std(scores_svm['test_precision']))
print('Recall\t\t', np.mean(scores_svm['test_recall']), '\t', np.std(scores_svm['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6469943328308707 	 0.018284489757006056
Accuracy	 0.6013480242127457 	 0.01931114633112478
Precission	 0.5851073291679674 	 0.015726014728589543
Recall		 0.6959837860504123 	 0.023928659714693476


## Multi-layer Perceptron

In [25]:
grid = {
    'alpha': [20, 10, 1, 1e-1, 1e-2, 1e-3, 1e-4],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate_init': [0.1, 0.01, 0.001, 0.0001, 0.00001]
}

model_mlp = GridSearchCV(MLPClassifier(hidden_layer_sizes=(10, 2)), param_grid=grid)
model_mlp.fit(X_test_tf, y_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(10, 2),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_g

In [26]:
model_mlp.best_params_

{'alpha': 20, 'learning_rate_init': 0.001, 'solver': 'sgd'}

In [27]:
model_mlp = MLPClassifier(
    hidden_layer_sizes=(10,2), alpha=20, learning_rate_init=0.001, solver='sgd', early_stopping=True
)
scores_mlp = cross_validate(model_mlp, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [28]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_mlp['train_roc_auc']), '\t', np.std(scores_mlp['train_roc_auc']))
print('Accuracy\t', np.mean(scores_mlp['train_accuracy']), '\t', np.std(scores_mlp['train_accuracy']))
print('Precission\t', np.mean(scores_mlp['train_precision']), '\t', np.std(scores_mlp['train_precision']))
print('Recall\t\t', np.mean(scores_mlp['train_precision']), '\t', np.std(scores_mlp['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.7061227538338111 	 0.10796223620358308
Accuracy	 0.6532350540471243 	 0.07987986195692212
Precission	 0.6630475489130941 	 0.08513710024380562
Recall		 0.6630475489130941 	 0.08513710024380562


In [29]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_mlp['test_roc_auc']), '\t', np.std(scores_mlp['test_roc_auc']))
print('Accuracy\t', np.mean(scores_mlp['test_accuracy']), '\t', np.std(scores_mlp['test_accuracy']))
print('Precission\t', np.mean(scores_mlp['test_precision']), '\t', np.std(scores_mlp['test_precision']))
print('Recall\t\t', np.mean(scores_mlp['test_recall']), '\t', np.std(scores_mlp['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6477122002083795 	 0.07011412374768287
Accuracy	 0.6057009049184596 	 0.05255762733636115
Precission	 0.6139944104278925 	 0.059283591625727626
Recall		 0.6800513196480938 	 0.16582012574228114
