In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import nltk
from nltk.tokenize import word_tokenize

In [4]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


# Step 1. Data Preparation

In [5]:
df = pd.read_csv("adm_notes_exclude_dead_readm.csv",
                     header=0)
df.columns
df.shape

(1394, 27)

In [6]:
data = pd.DataFrame()
data['label'] = (df['redays']<=30).astype('int')
data['summary'] = df['summary']
data.label.value_counts()

0    1161
1     233
Name: label, dtype: int64

In [7]:
data_dropna=data.dropna()
data_dropna

Unnamed: 0,label,summary
0,0,Admission Date: [**2183-3-23**] ...
1,0,Admission Date: [**2139-9-8**] Discharge ...
2,0,Admission Date: [**2146-11-17**] ...
3,0,Admission Date: [**2146-12-22**] ...
4,0,Admission Date: [**2199-9-1**] D...
...,...,...
1389,0,[** **] Date: [**2116-7-1**] Dis...
1390,1,Admission Date: [**2109-6-16**] ...
1391,0,Admission Date: [**2126-2-20**] ...
1392,1,Admission Date: [**2117-4-13**] ...


In [8]:
data_dropna['label'].value_counts()

0    1146
1     231
Name: label, dtype: int64

In [9]:
data_dropna

Unnamed: 0,label,summary
0,0,Admission Date: [**2183-3-23**] ...
1,0,Admission Date: [**2139-9-8**] Discharge ...
2,0,Admission Date: [**2146-11-17**] ...
3,0,Admission Date: [**2146-12-22**] ...
4,0,Admission Date: [**2199-9-1**] D...
...,...,...
1389,0,[** **] Date: [**2116-7-1**] Dis...
1390,1,Admission Date: [**2109-6-16**] ...
1391,0,Admission Date: [**2126-2-20**] ...
1392,1,Admission Date: [**2117-4-13**] ...


Before upsampling the positive samples, we need to hold out the testing set from the orginal dataset.

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train_pre,x_test,y_train_pre,y_test=train_test_split(data_dropna['summary'],data_dropna['label'],test_size=0.15)

In [14]:
y_test.value_counts()

0    235
1     41
Name: label, dtype: int64

#### Unbalanced dataset, so that we up-sample the positive samples in training dataset

In [15]:
# concatenate our training data back together
X = pd.concat([x_train_pre, y_train_pre], axis=1)
X['label'].value_counts()

0    911
1    190
Name: label, dtype: int64

In [16]:
# Divide by class
df_class_0 = X[X['label'] == 0]
df_class_1 = X[X['label'] == 1]

In [17]:
df_class_1_upsample = df_class_1.sample(len(X[X['label']==0]), replace=True)

Try sklearn to upsample the data

In [216]:
from sklearn.utils import resample
df_class_1_upsample = resample(df_class_1,
                          replace=True, # sample with replacement
                          n_samples=len(df_class_0), # match number in majority class
                          random_state=27)

In [18]:
df_over = pd.concat([df_class_1_upsample, df_class_0], axis=0)
df_over.shape

(1822, 2)

In [19]:
# shuffle the order of training samples 
df_all = df_over.sample(len(df_over['label']), random_state = 42).reset_index(drop = True)
df_all

Unnamed: 0,summary,label
0,Admission Date: [**2126-5-24**] Dischar...,1
1,Admission Date: [**2189-12-2**] ...,0
2,Admission Date: [**2149-9-26**] ...,1
3,Admission Date: [**2179-10-13**] ...,1
4,Admission Date: [**2132-9-19**] Dischar...,1
...,...,...
1817,Admission Date: [**2130-7-15**] ...,0
1818,Admission Date: [**2135-9-8**] D...,0
1819,Admission Date: [**2121-11-20**] ...,1
1820,Admission Date: [**2119-5-9**] D...,0


#### Setup training and test data set

In [20]:
x_train = df_all['summary']
y_train = df_all['label']

 # Step2: Process text data

In [21]:
def preprocess_text(df):
    # This function preprocesses the text by filling not a number and replacing new lines ('\n') and carriage returns ('\r')
    df.TEXT = df.TEXT.fillna(' ')
    df.TEXT =df.TEXT.str.replace('\n',' ')
    df.TEXT =df.TEXT.str.replace('\r',' ')
    return df

In [22]:
#df_train = preprocess_text(df_all['summary'])
df_all['summary'].astype(str)

0       Admission Date: [**2126-5-24**]        Dischar...
1       Admission Date:  [**2189-12-2**]              ...
2       Admission Date:  [**2149-9-26**]              ...
3       Admission Date:  [**2179-10-13**]             ...
4       Admission Date:  [**2132-9-19**]       Dischar...
                              ...                        
1817    Admission Date:  [**2130-7-15**]              ...
1818    Admission Date:  [**2135-9-8**]              D...
1819    Admission Date:  [**2121-11-20**]             ...
1820    Admission Date:  [**2119-5-9**]              D...
1821    Admission Date:  [**2164-8-26**]     Discharge...
Name: summary, Length: 1822, dtype: object

In [23]:
#function to split text into word
word_tokenize('This should be tokenized. 02/02/2018 sentence has stars**')

['This',
 'should',
 'be',
 'tokenized',
 '.',
 '02/02/2018',
 'sentence',
 'has',
 'stars**']

In [24]:
def tokenizer_better(text):
    # tokenize the text by replacing punctuation and numbers with spaces and lowercase all words
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    return tokens

In [25]:
tokenizer_better('This should be tokenized. 02/02/2018 sentence has stars**')

['this', 'should', 'be', 'tokenized', 'sentence', 'has', 'stars']

### Build a simple vectorizer

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

sample_text = ['Data science is about the data', 'The science is amazing', 'Predictive modeling is part of data science']

vect = CountVectorizer(tokenizer = tokenizer_better)
vect.fit(sample_text)

# matrix is stored as a sparse matrix (since you have a lot of zeros)
X = vect.transform(sample_text)

In [27]:
X.toarray()

array([[1, 0, 2, 1, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 0]])

In [28]:
# get the column names
vect.get_feature_names()

['about',
 'amazing',
 'data',
 'is',
 'modeling',
 'of',
 'part',
 'predictive',
 'science',
 'the']

### Get another example from clinical notes

In [29]:
x_text = tokenizer_better(x_train[0])
vect.fit(x_text)
vect.transform(x_text).toarray().shape
vect.get_feature_names()

['a',
 'abdomen',
 'abutting',
 'actually',
 'admission',
 'afebrile',
 'alert',
 'all',
 'allergies',
 'also',
 'amiodarone',
 'an',
 'and',
 'aneurysm',
 'angio',
 'angiogram',
 'ankle',
 'arousable',
 'aspirin',
 'at',
 'atrial',
 'attempted',
 'awake',
 'b',
 'being',
 'birth',
 'blood',
 'bowel',
 'briskly',
 'bruits',
 'but',
 'by',
 'cad',
 'calculi',
 'cardiac',
 'cardiology',
 'carotid',
 'carotids',
 'cells',
 'change',
 'clip',
 'clipping',
 'clonus',
 'close',
 'coiling',
 'commands',
 'complication',
 'complications',
 'condition',
 'continued',
 'control',
 'convert',
 'course',
 'ct',
 'd',
 'date',
 'day',
 'decreased',
 'deep',
 'degrees',
 'developed',
 'diagnostic',
 'dictated',
 'did',
 'dilantin',
 'dilation',
 'discharge',
 'discharged',
 'discontinued',
 'done',
 'down',
 'downgoing',
 'dr',
 'easily',
 'echo',
 'ef',
 'enzymes',
 'eoms',
 'episodes',
 'equal',
 'essentially',
 'examination',
 'extremities',
 'eyes',
 'famotidine',
 'fibrillation',
 'findings',
 

### build a vectorizer on the clinical notes

In [30]:
my_stop_words = ['the','and','to','of','was','with','a','on','in','for','name',
                 'is','patient','s','he','at','as','or','one','she','his','her','am',
                 'were','you','pt','pm','by','be','had','your','this','date',
                'from','there','an','that','p','are','have','has','h','but','o',
                'namepattern','which','every','also','t','that']
vect = CountVectorizer(max_features = 3000, tokenizer = tokenizer_better, stop_words = my_stop_words)
# this could take a while
X_train_counts = vect.fit_transform(x_train.values)

In [None]:
X_train_counts

# Step 3: Training the model

### Experiments with adding tf-idf

In [159]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1838, 3000)

### Buidling the whole pipeline

In [31]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_svm = Pipeline([
     ('vect', CountVectorizer(lowercase = True, max_features = 4000, tokenizer = tokenizer_better,stop_words =my_stop_words)),
     #('tfidf', TfidfTransformer()), #lower performs
     ('svm', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),#0.83
    ])


text_svm.fit(x_train, y_train)

predicted = text_svm.predict(x_test)
np.mean(predicted == y_test)

0.7644927536231884

In [78]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
#scoring = ['precision_macro', 'recall_macro']
scoring = ['accuracy', 'roc_auc', 'balanced_accuracy','average_precision','precision','f1','recall']
#clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(text_svm, x_train, 
                        y_train, scoring=scoring, cv =5, 
                        return_estimator = True,
                        return_train_score =True)

#### The evaluation result from training/validation dataset

In [93]:
print("""test_accuracy mean value: {:.2f}, std: {:.2f} 
      \ntest_f1 mean value: {:.2f}
      \ntest_precision mean value: {:.2f}
      \ntest_recall mean value: {:.2f}"""
      .format(scores['test_accuracy'].mean(),
              scores['test_accuracy'].std(),
              scores['test_f1'].mean(),
              scores['test_precision'].mean(),
              scores['test_recall'].mean()))

test_accuracy mean value: 0.87, std: 0.03 
      
test_f1 mean value: 0.88
      
test_precision mean value: 0.82
      
test_recall mean value: 0.96


In [96]:
y_train_pred = cross_val_predict(text_svm,x_train,y_train,cv=5)

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.95      0.79      0.86       911
           1       0.82      0.96      0.88       911

    accuracy                           0.87      1822
   macro avg       0.88      0.87      0.87      1822
weighted avg       0.88      0.87      0.87      1822



#### Evaluate the model on the test dataset

In [94]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

y_test_pred = cross_val_predict(text_svm,x_test,y_test,cv=5)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       235
           1       0.21      0.20      0.20        41

    accuracy                           0.77       276
   macro avg       0.53      0.53      0.53       276
weighted avg       0.76      0.77      0.77       276



In [75]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_pred)

array([[204,  31],
       [ 33,   8]])