In [25]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Lambda
from keras.layers import Embedding
from keras.models import Model, Sequential
import tensorflow as tf
import pandas as pd

In [28]:
data = pd.read_csv("data.csv")

In [29]:
data.shape

(57280, 4)

In [30]:
data.head()

Unnamed: 0,fileid,sub_categories,categories,converse
0,0,REFILL,PRESCRIPTION,patients aware that he needs rov for refill na...
1,1,MEDICATION RELATED,ASK_A_DOCTOR,mom wants to know if the drugname needs some d...
2,2,MEDICATION RELATED,ASK_A_DOCTOR,patients to discuss drugname she says she has ...
3,3,OTHERS,MISCELLANEOUS,fyi nortryptline medication patient prescripti...
4,4,"SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)",MISCELLANEOUS,letter of patient establishment request name s...


In [31]:
data.dtypes

fileid             int64
sub_categories    object
categories        object
converse          object
dtype: object

In [32]:
# Remove fileid column as this is not required
fileid = data.pop('fileid')

In [33]:
data.head()

Unnamed: 0,sub_categories,categories,converse
0,REFILL,PRESCRIPTION,patients aware that he needs rov for refill na...
1,MEDICATION RELATED,ASK_A_DOCTOR,mom wants to know if the drugname needs some d...
2,MEDICATION RELATED,ASK_A_DOCTOR,patients to discuss drugname she says she has ...
3,OTHERS,MISCELLANEOUS,fyi nortryptline medication patient prescripti...
4,"SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)",MISCELLANEOUS,letter of patient establishment request name s...


In [34]:
len(data.categories.unique())

6

In [35]:
len(data.sub_categories.unique())

21

In [36]:
# Let us fix up the target as categories to start with
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [37]:
# Converting the categories to label
data.categories = le.fit_transform(data.categories)

In [38]:
# Converting the subcategories to labels
data.sub_categories = le.fit_transform(data.sub_categories)

In [39]:
data.head()

Unnamed: 0,sub_categories,categories,converse
0,15,5,patients aware that he needs rov for refill na...
1,7,1,mom wants to know if the drugname needs some d...
2,7,1,patients to discuss drugname she says she has ...
3,9,4,fyi nortryptline medication patient prescripti...
4,18,4,letter of patient establishment request name s...


In [40]:
y1 = data.pop('categories')
y2 = data.pop('sub_categories')
X = data.as_matrix()

In [41]:
X.shape

(57280, 1)

In [42]:
MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 39288
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2

In [43]:
X = np.array(X).flatten()

In [17]:
X=X.astype(str)

In [44]:
X.shape

(57280,)

In [47]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

AttributeError: 'float' object has no attribute 'lower'

In [22]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(y1))
labels2 = to_categorical(np.asarray(y2))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print('Shape of label tensor:', labels2.shape)

Found 39288 unique tokens.


ValueError: invalid literal for int() with base 10: 'PRESCRIPTION'

In [21]:
labels[0]

NameError: name 'labels' is not defined

In [16]:
labels2[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.])

In [27]:
# making the train and validation datasets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
labels2 = labels2[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples] # This is for categories as target
y_train2 = labels2[:-num_validation_samples] # This is for subcategories as target

x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:] # This is for categories as target
y_val2 = labels2[-num_validation_samples:] # This is for subcategories as target

In [28]:
# Single hidden layer model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= x_train.shape[1] ))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(6, activation='softmax'))

In [29]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [30]:
#model.fit(data, y=y, batch_size=200, epochs=1, verbose=0, validation_split=0.2, shuffle=True)

model.fit(x_train, y_train,
          batch_size=64,
          epochs=1,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/1


<keras.callbacks.History at 0x24d86c52080>

In [31]:
# The above model is clearly overfitting. So lets add some dropout and do batch norm
from keras.layers.normalization import BatchNormalization
from keras.layers import Activation

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= data.shape[1] ))
model.add(Flatten())
model.add(Dropout(0.2))

# hidden Layer 1
model.add(Dense(100))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

# hidden Layer 2
model.add(Dense(50))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(6, activation='softmax'))

In [32]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [34]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/1


<keras.callbacks.History at 0x24d8a9274e0>

### USING GLOVE word2vec instead of plain embeddings

In [35]:
# Lets try glove embeddings instead of plain embeddings

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B.50d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

Indexing word vectors.
Found 400000 word vectors.
Processing text dataset


In [36]:
embedding_dimension = 50
word_index = tokenizer.word_index

In [37]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector[:embedding_dimension]

In [38]:
embedding_matrix.shape

(39289, 50)

In [None]:
data.shape

In [39]:
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=data.shape[1])

In [40]:
model = Sequential()
# num_words is the number of unique words
# Embedding dimension is the dimension of the hidden layer that we choose --> embedding vector ( we can choose this to be 50, or 100 or so)
# input length is the fixed length of the sentence that we feed to this embedding layer network ( https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work)
# Here we mentioned trainable = false, because we are directly using the weights from embedding matrix from glove. 
model.add(embedding_layer)
#model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(Flatten())

model.add(Dropout(0.2))

#Hidden layer 1
model.add(Dense(50))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

#Hidden layer 2
model.add(Dense(25))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(6, activation='softmax'))

In [41]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [43]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/1


<keras.callbacks.History at 0x24d9959c400>

## Using LSTM instead of MLP
### Hence forward we will be using plain embeddings instead of glove word2vec as it has not given great performance ( Suspect words like Rx etc., are being omitted from glove vocabulary)

In [44]:
from keras.layers.recurrent import LSTM

model = Sequential()
# num_words is the number of unique words
# Embedding dimension is the dimension of the hidden layer that we choose --> embedding vector ( we can choose this to be 50, or 100 or so)
# input length is the fixed length of the sentence that we feed to this embedding layer network ( https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work)
# Here we mentioned trainable = false, because we are directly using the weights from embedding matrix from glove. 
#model.add(embedding_layer)
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= data.shape[1] ))
#model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
#model.add(Flatten())
model.add(LSTM(256))
model.add(Dense(6, activation='softmax'))

In [45]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [46]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=1,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/1


<keras.callbacks.History at 0x24d9e3fad68>

### Using a Convolution net

In [47]:
from keras.layers import Conv1D, GlobalMaxPooling1D

filters = 256
kernel_size = 3
hidden_dims = 100

In [108]:
model = Sequential()

# model.add(embedding_layer) --> This uses Glove
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= x_train.shape[1] )) # --> This is plain embeddings

#model.add(Flatten()) # Not reqiured here (its giving an error if used)

model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(6, activation='softmax'))

In [109]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [110]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x24df7c06860>

In [None]:
model.predict(x_val)

In [None]:
y_val

### CNN + LSTM

In [105]:
from keras.layers.convolutional import MaxPooling1D

model = Sequential()

# model.add(embedding_layer) --> This uses Glove
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= x_train.shape[1] )) # --> This is plain embeddings

#model.add(Flatten()) # Not reqiured here (its giving an error if used)

model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(64,
                 kernel_size,
                 padding='same',
                 activation='relu',
                 strides=1))

# we use max pooling:
model.add(MaxPooling1D(pool_size=2))

model.add(LSTM(100))

model.add(Dropout(0.2))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(6, activation='softmax'))

In [None]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [106]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x24df50adf28>

## Prediction for the subcategories:
### We will use Conv net model as this gave the best results for the classification of categories

In [51]:
model2 = Sequential()
model2.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length= x_train.shape[1] )) # --> This is plain embeddings

model2.add(Dropout(0.2))

model2.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

# we use max pooling:
model2.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model2.add(Dense(hidden_dims))
model2.add(Dropout(0.2))
model2.add(Activation('relu'))

model2.add(Dense(21, activation='softmax'))

In [52]:
from keras import callbacks
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001, verbose=1, epsilon=0.001)
early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=1, mode='auto')
model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [88]:
model2.fit(x_train, y_train2,
          batch_size=64,
          epochs=2,
          validation_data=(x_val, y_val2),
          callbacks=[reduce_lr, early_stop])

Train on 45824 samples, validate on 11456 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x24da1821668>

In [58]:
preds = model2.predict(x_val)

In [85]:
preds1 = model.predict(x_val)

In [89]:
y2_true = []
for i in range(0,len(y_val2)):
    y2_true.append(np.argmax(y_val2[i]))

In [84]:
y1_true = []
for i in range(0,len(y_val)):
    y1_true.append(np.argmax(y_val[i]))

In [90]:
y2_pred = []
for i in range(0,len(y_val2)):
    y2_pred.append(np.argmax(preds[i]))

In [86]:
y1_pred = []
for i in range(0,len(y_val)):
    y1_pred.append(np.argmax(preds1[i]))

In [77]:
from sklearn.metrics import recall_score

In [91]:
recall_score(y2_true, y2_pred, average = 'macro')

0.45417673811002829

In [87]:
recall_score(y1_true, y1_pred, average = 'macro')

0.63821020088978797