In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("adm_notes_exclude_dead_readm_updated.csv",
                     header=0)
df.columns
df.shape

(1316, 27)

In [3]:
data = pd.DataFrame()
data['label'] = (df['redays']<=30).astype('int')
data['summary'] = df['summary']
data.label.value_counts()

0    1161
1     155
Name: label, dtype: int64

In [4]:
data_dropna=data.dropna()
data_dropna['label'].value_counts()

0    1146
1     154
Name: label, dtype: int64

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
import string

In [6]:
def preprocess_text(df):
    # This function preprocesses the text by filling not a number and replacing new lines ('\n') and carriage returns ('\r')
    df.TEXT = df.TEXT.fillna(' ')
    df.TEXT =df.TEXT.str.replace('\n',' ')
    df.TEXT =df.TEXT.str.replace('\r',' ')
    return df

def tokenizer_better(text):
    # tokenize the text by replacing punctuation and numbers with spaces and lowercase all words
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    return tokens

my_stop_words = ['the','and','to','of','was','with','a','on','in','for','name',
                 'is','patient','s','he','at','as','or','one','she','his','her','am',
                 'were','you','pt','pm','by','be','had','your','this','date',
                'from','there','an','that','p','are','have','has','h','but','o',
                'namepattern','which','every','also','t','that']


# vect = CountVectorizer(max_features = 3000, tokenizer = tokenizer_better, stop_words = my_stop_words)
# # this could take a while
# data_dropna['x_input'] = vect.fit_transform(data_dropna['summary'])

In [12]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [13]:
# stratify the dataset into training and test dataset
X_train, X_test, y_train, y_test = train_test_split(data_dropna['summary'], data_dropna['label'], 
                                                    test_size=0.15, random_state=42,
                                                    stratify = data_dropna['label'])

In [14]:
#from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.pipeline import Pipeline
#from sklearn.linear_model import SGDClassifier
#from sklearn import svm

#stratify X_train to training and validation dataset
skf = StratifiedKFold(n_splits=5,random_state = 42)
skf.get_n_splits(X_train, y_train)

parameter_candidates = {
  'svm__C': [0.001,0.01, 0.1, 1],
  'svm__class_weight':['balanced',None, {0:0.86,1:0.14},{0:0.1,1:0.9}]}


grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

scores = ['roc_auc', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)

    text_svm4 = Pipeline([
         ('vect', CountVectorizer(lowercase = True, 
                                  max_features = 4000, 
                                  tokenizer = tokenizer_better,
                                  stop_words =my_stop_words)),
#          ('tfidf', TfidfTransformer()), #lower performs
        # ('lg', LogisticRegression(max_iter=10000, tol=0.1)),#0.83
         ('svm', LinearSVC(random_state=0))
        ])
    
    search2= GridSearchCV(estimator=text_svm4, 
                          param_grid = parameter_candidates,
                          cv = skf,
                          scoring= score)
    search2.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(search2.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = search2.cv_results_['mean_test_score']
    stds = search2.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search2.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, search2.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for roc_auc




Best parameters set found on development set:

{'svm__C': 0.001, 'svm__class_weight': {0: 0.1, 1: 0.9}}

Grid scores on development set:

0.478 (+/-0.080) for {'svm__C': 0.001, 'svm__class_weight': 'balanced'}
0.493 (+/-0.102) for {'svm__C': 0.001, 'svm__class_weight': None}
0.500 (+/-0.116) for {'svm__C': 0.001, 'svm__class_weight': {0: 0.86, 1: 0.14}}
0.504 (+/-0.086) for {'svm__C': 0.001, 'svm__class_weight': {0: 0.1, 1: 0.9}}
0.474 (+/-0.048) for {'svm__C': 0.01, 'svm__class_weight': 'balanced'}
0.474 (+/-0.046) for {'svm__C': 0.01, 'svm__class_weight': None}
0.488 (+/-0.086) for {'svm__C': 0.01, 'svm__class_weight': {0: 0.86, 1: 0.14}}
0.474 (+/-0.069) for {'svm__C': 0.01, 'svm__class_weight': {0: 0.1, 1: 0.9}}
0.472 (+/-0.050) for {'svm__C': 0.1, 'svm__class_weight': 'balanced'}
0.473 (+/-0.049) for {'svm__C': 0.1, 'svm__class_weight': None}
0.472 (+/-0.040) for {'svm__C': 0.1, 'svm__class_weight': {0: 0.86, 1: 0.14}}
0.474 (+/-0.046) for {'svm__C': 0.1, 'svm__class_weight': {0: 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'svm__C': 0.001, 'svm__class_weight': {0: 0.1, 1: 0.9}}

Grid scores on development set:

0.112 (+/-0.083) for {'svm__C': 0.001, 'svm__class_weight': 'balanced'}
0.071 (+/-0.086) for {'svm__C': 0.001, 'svm__class_weight': None}
0.000 (+/-0.000) for {'svm__C': 0.001, 'svm__class_weight': {0: 0.86, 1: 0.14}}
0.148 (+/-0.087) for {'svm__C': 0.001, 'svm__class_weight': {0: 0.1, 1: 0.9}}
0.105 (+/-0.077) for {'svm__C': 0.01, 'svm__class_weight': 'balanced'}
0.083 (+/-0.047) for {'svm__C': 0.01, 'svm__class_weight': None}
0.071 (+/-0.054) for {'svm__C': 0.01, 'svm__class_weight': {0: 0.86, 1: 0.14}}
0.112 (+/-0.083) for {'svm__C': 0.01, 'svm__class_weight': {0: 0.1, 1: 0.9}}
0.104 (+/-0.086) for {'svm__C': 0.1, 'svm__class_weight': 'balanced'}
0.105 (+/-0.085) for {'svm__C': 0.1, 'svm__class_weight': None}
0.108 (+/-0.032) for {'svm__C': 0.1, 'svm__class_weight': {0: 0.86, 1: 0.14}}
0.110 (+/-0.087) for {'svm__C': 0.1, 'svm__class_weight': {0: 