In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
import nltk
from nltk.tokenize import word_tokenize

In [28]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


# Step 1. Data Preparation

In [4]:
df = pd.read_csv("adm_notes_exclude_dead_readm.csv",
                     header=0)
df.columns
df.shape

(1394, 27)

In [5]:
data = pd.DataFrame()
data['label'] = (df['redays']<=30).astype('int')
data['summary'] = df['summary']
data.label.value_counts()

0    1161
1     233
Name: label, dtype: int64

In [6]:
data_dropna=data.dropna()
data_dropna

Unnamed: 0,label,summary
0,0,Admission Date: [**2183-3-23**] ...
1,0,Admission Date: [**2139-9-8**] Discharge ...
2,0,Admission Date: [**2146-11-17**] ...
3,0,Admission Date: [**2146-12-22**] ...
4,0,Admission Date: [**2199-9-1**] D...
...,...,...
1389,0,[** **] Date: [**2116-7-1**] Dis...
1390,1,Admission Date: [**2109-6-16**] ...
1391,0,Admission Date: [**2126-2-20**] ...
1392,1,Admission Date: [**2117-4-13**] ...


In [7]:
data_dropna['label'].value_counts()

0    1146
1     231
Name: label, dtype: int64

In [8]:
# Divide by class
df_class_0 = data_dropna[data_dropna['label'] == 0]
df_class_1 = data_dropna[data_dropna['label'] == 1]

#### Unbalanced dataset, so that we need up-sampling the positive samples.

In [9]:
df_class_1_upsample = df_class_1.sample(1146, replace=True)

In [10]:
df_over = pd.concat([df_class_1_upsample, df_class_0], axis=0)
df_over.shape

(2292, 2)

In [13]:
# shuffle the order of training samples 
df_all = df_over.sample(n = 2292, random_state = 42).reset_index(drop = True)
df_all

Unnamed: 0,label,summary
0,0,Admission Date: [**2124-10-18**] ...
1,0,Admission Date: [**2184-9-14**] Dischar...
2,0,Admission Date: [**2150-2-9**] D...
3,0,Admission Date: [**2140-8-12**] Discharge ...
4,0,Admission Date: [**2138-4-21**] ...
...,...,...
2287,0,Admission Date: [**2159-4-27**] ...
2288,1,Admission Date: [**2178-6-26**] ...
2289,1,Admission Date: [**2118-3-5**] D...
2290,0,Admission Date: [**2193-8-10**] ...


#### Setup training and test data set

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
x_train,x_test,y_train,y_test=train_test_split(df_all['summary'],df_all['label'],test_size=0.2)

 # Step2: Process text data

In [12]:
def preprocess_text(df):
    # This function preprocesses the text by filling not a number and replacing new lines ('\n') and carriage returns ('\r')
    df.TEXT = df.TEXT.fillna(' ')
    df.TEXT =df.TEXT.str.replace('\n',' ')
    df.TEXT =df.TEXT.str.replace('\r',' ')
    return df

In [21]:
#df_train = preprocess_text(df_all['summary'])
df_all['summary'].astype(str)

0       Admission Date:  [**2124-10-18**]             ...
1       Admission Date:  [**2184-9-14**]       Dischar...
2       Admission Date:  [**2150-2-9**]              D...
3       Admission Date:  [**2140-8-12**]    Discharge ...
4       Admission Date:  [**2138-4-21**]              ...
                              ...                        
2287    Admission Date:  [**2159-4-27**]              ...
2288    Admission Date:  [**2178-6-26**]              ...
2289    Admission Date:  [**2118-3-5**]              D...
2290    Admission Date:  [**2193-8-10**]              ...
2291    Admission Date:  [**2144-8-12**]       Dischar...
Name: summary, Length: 2292, dtype: object

In [25]:
#function to split text into word
word_tokenize('This should be tokenized. 02/02/2018 sentence has stars**')

['This',
 'should',
 'be',
 'tokenized',
 '.',
 '02/02/2018',
 'sentence',
 'has',
 'stars**']

In [99]:
def tokenizer_better(text):
    # tokenize the text by replacing punctuation and numbers with spaces and lowercase all words
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    return tokens

In [100]:
tokenizer_better('This should be tokenized. 02/02/2018 sentence has stars**')

['this', 'should', 'be', 'tokenized', 'sentence', 'has', 'stars']

### Build a simple vectorizer

In [116]:
from sklearn.feature_extraction.text import CountVectorizer

sample_text = ['Data science is about the data', 'The science is amazing', 'Predictive modeling is part of data science']

vect = CountVectorizer(tokenizer = tokenizer_better)
vect.fit(sample_text)

# matrix is stored as a sparse matrix (since you have a lot of zeros)
X = vect.transform(sample_text)

In [117]:
X.toarray()

array([[1, 0, 2, 1, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 0]])

In [118]:
# get the column names
vect.get_feature_names()

['about',
 'amazing',
 'data',
 'is',
 'modeling',
 'of',
 'part',
 'predictive',
 'science',
 'the']

### Get another example from clinical notes

In [115]:
x_text = tokenizer_better(x_train[0])
vect.fit(x_text)
vect.transform(x_text).toarray().shape
vect.get_feature_names()

['aado',
 'abdomen',
 'abg',
 'abnormalities',
 'above',
 'absense',
 'acetaminophen',
 'acute',
 'admission',
 'admitted',
 'afib',
 'after',
 'ago',
 'allergies',
 'alone',
 'along',
 'alternating',
 'ampicillin',
 'anemia',
 'angap',
 'anteriorly',
 'anterolisthesis',
 'anticoagulation',
 'any',
 'aortic',
 'apex',
 'apnea',
 'appear',
 'appointment',
 'appointments',
 'area',
 'areas',
 'arousable',
 'artery',
 'asa',
 'asleep',
 'assessed',
 'atelectasis',
 'atraumatic',
 'atrial',
 'attending',
 'awake',
 'axilla',
 'b',
 'bacteri',
 'base',
 'based',
 'baso',
 'began',
 'bid',
 'bilateral',
 'bilaterally',
 'bilirub',
 'bladder',
 'bleed',
 'bleeds',
 'blood',
 'blowing',
 'both',
 'bowel',
 'bp',
 'bps',
 'brain',
 'breath',
 'breathing',
 'brief',
 'brought',
 'building',
 'bulk',
 'c',
 'calcium',
 'call',
 'caltco',
 'came',
 'can',
 'capsule',
 'cardiac',
 'cardiologist',
 'cardiology',
 'cardiomegaly',
 'care',
 'cat',
 'center',
 'cerebral',
 'chair',
 'change',
 'changes

### build a vectorizer on the clinical notes

In [71]:
my_stop_words = ['the','and','to','of','was','with','a','on','in','for','name',
                 'is','patient','s','he','at','as','or','one','she','his','her','am',
                 'were','you','pt','pm','by','be','had','your','this','date',
                'from','there','an','that','p','are','have','has','h','but','o',
                'namepattern','which','every','also','t','that']
vect = CountVectorizer(max_features = 3000, tokenizer = tokenizer_better, stop_words = my_stop_words)
# this could take a while
X_train_counts = vect.fit_transform(x_train.values)

# Step 3: Training the model

### Experiments with adding tf-idf

In [72]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1833, 3000)

### Buidling the whole pipeline

In [108]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_svm = Pipeline([
     ('vect', CountVectorizer(lowercase = True, max_features = 4000, tokenizer = tokenizer_better,stop_words =my_stop_words)),
     #('tfidf', TfidfTransformer()), #without this 0.856
     ('svm', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),#0.83
    ])


text_svm.fit(x_train, y_train)

predicted = text_svm.predict(x_test)
np.mean(predicted == y_test)

0.8562091503267973