In [1]:
import pandas as pd

In [2]:
data = pd.read_excel("data.xlsx", encoding='latin-1')

In [3]:
text = data.iloc[0,0]
text

"{'mesh_terms': ['Adult [D000328:minor]', 'Liposarcoma [D008080:minor]/drug therapy [Q000188:major]/secondary [Q000556:major]', 'Lung Neoplasms [D008175:minor]/drug therapy [Q000188:major]/secondary [Q000556:minor]', 'Remission Induction [D012074:minor]', 'Retroperitoneal Neoplasms [D012186:minor]/pathology [Q000473:major]', 'Vincristine [D014750:minor]/administration & dosage [Q000008:minor]', 'Antineoplastic Combined Chemotherapy Protocols [D000971:minor]/therapeutic use [Q000627:major]', 'Brain Neoplasms [D001932:minor]/drug therapy [Q000188:major]/secondary [Q000556:minor]', 'Cyclophosphamide [D003520:minor]/administration & dosage [Q000008:minor]', 'Dacarbazine [D003606:minor]/administration & dosage [Q000008:minor]', 'Doxorubicin [D004317:minor]/administration & dosage [Q000008:minor]', 'Drug Administration Schedule [D004334:minor]', 'Female [D005260:minor]', 'Humans [D006801:minor]'], 'text': '[A case of metastatic liposarcoma originating in the retroperitoneum successfully trea

In [4]:
len(data)

22396

In [5]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

Using TensorFlow backend.


## Data processing
After reading the data, I drop all the null values using pandas ‘dropna’ function. Then filter out the rows with non-numeric characters in the class column. Similarly, I also filtered out all the rows with empty comments.

In [6]:
df = data.dropna()
#df = df[df.Class.apply(lambda x: x !="")]
#df = df[df.Abstracts.apply(lambda x: x !="")]

Remove punctuations:

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
    # Clean the text
    text = re.sub(r"\[\D\S{6}:\S{5}\]", "", text)
    text = re.sub(r"\/", "", text)
    text = re.sub(r"\\n*", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r'\,|\.|\:|\&', "", text)
    text = re.sub(r'\{|\}|\[|\]', "", text)
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = [wn.lemmatize(word) for word in text]
    text = " ".join(stemmed_words)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mango\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mango\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
df['Abstracts'] = df['Abstracts'].map(lambda x: clean_text(x))

In [9]:
df.iloc[0,0]

'mesh_term adult liposarcoma drug therapi secondari lung neoplasm drug therapi secondari remiss induct retroperiton neoplasm patholog vincristin administr dosag antineoplast combin chemotherapi protocol therapeut use brain neoplasm drug therapi secondari cyclophosphamid administr dosag dacarbazin administr dosag doxorubicin administr dosag drug administr schedul femal human text a case metastat liposarcoma origin retroperitoneum success treat combin chemotherapyw report 36-year-old woman metastat liposarcoma origin retroperitoneum respond well adjuv chemotherapi primari tumor remov surgeri two month later patient develop metastasi brain lung four month later metastat liposarcoma brain general extrem rare patient treat combin chemotherapi use cyclophosphamid vincristin adriamycin dacarbazin (cyvadic) examin former two drug altern vindesin ifosfamid anoth regimen cisplatin etoposid given three-week interv result metastas total disappear recurr lesion note two year although role chemother

In [10]:
vocabulary_size = 20000
def process(df):
    tokenizer = Tokenizer(num_words= vocabulary_size)
    tokenizer.fit_on_texts(df['Abstracts'])
    sequences = tokenizer.texts_to_sequences(df['Abstracts'])
    data = pad_sequences(sequences)
    labels = np.array(df['Class'])
    df_save = pd.DataFrame(data)
    df_save.head(10)
    df_label = pd.DataFrame(np.array(labels))
    datamat = pd.concat([df_save, df_label], axis = 1)
    return data, labels, datamat, tokenizer

In [11]:
x, y, vecdf, tokenizer = process(df)

In [12]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(vecdf, test_size=0.2)

In [13]:
train.shape

(17916, 1132)

In [14]:
train_x = train.iloc[:,:-1]
train_y = train.iloc[:,-1]
test_x = test.iloc[:,:-1]
test_y = test.iloc[:,-1]

In [15]:
test.shape

(4480, 1132)

In [16]:
train_x.shape

(17916, 1131)

In [17]:
train_y.shape

(17916,)

In [18]:
test_x.shape

(4480, 1131)

In [19]:
test_y.shape

(4480,)

In [20]:
df.shape

(22396, 2)

## Build neural network with LSTM
## Network Architechture
The network starts with an embedding layer. The layer lets the system expand each token to a more massive vector, allowing the network to represent a word in a meaningful way. The layer takes 20000 as the first argument, which is the size of our vocabulary, and 100 as the second input parameter, which is the dimension of the embeddings. The third parameter is the input_length of 50, which is the length of each comment sequence.

model_lstm = Sequential()
model_lstm.add(Embedding(20000, 100, input_length=train_x.shape[1]))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report

def GetMetrics(model,x,y):
    pred = model.predict_classes(x)
    pred_p=model.predict(x)
    fpr, tpr, thresholdTest = roc_curve(y, pred_p)
    aucv = auc(fpr, tpr) 
    #print('auc:',aucv)
    print('auc,acc,mcc',aucv,accuracy_score(y,pred),matthews_corrcoef(y,pred))
    print(classification_report(y,pred))

model_lstm.fit(train_x, np.array(train_y), validation_split=0.4, epochs=3)
model_lstm.save("Weights/model_lstm.h5")
model_lstm_json = model_lstm.to_json()
with open("Models/model_lstm.json", "w") as json_file:
    json_file.write(model_lstm_json)
#model_lstm.load_weights("Weights/model_lstm.h5")

In [22]:
# load json and create model
from keras.models import model_from_json
json_file = open('Models/model_lstm.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Weights/model_lstm.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Loaded model from disk


In [23]:
GetMetrics(loaded_model, test_x,test_y)

auc,acc,mcc 0.8825778244876511 0.8020089285714286 0.5974279725609833
              precision    recall  f1-score   support

           0       0.87      0.79      0.83      2750
           1       0.71      0.82      0.76      1730

   micro avg       0.80      0.80      0.80      4480
   macro avg       0.79      0.80      0.80      4480
weighted avg       0.81      0.80      0.80      4480



## Build neural network with LSTM and CNN

The LSTM model worked well. However, it takes forever to train three epochs. One way to speed up the training time is to improve the network adding “Convolutional” layer. Convolutional Neural Networks (CNN) come from image processing. They pass a “filter” over the data and calculate a higher-level representation. They have been shown to work surprisingly well for text, even though they have none of the sequence processing ability of LSTMs.

def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(vocabulary_size, 100, input_length=train_x.shape[1]))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

In [24]:
'''
model_conv = create_conv_model()
model_conv.fit(train_x, np.array(train_y), validation_split=0.4, epochs = 3)
model_conv.save("Weights/model_conv.h5")
model_conv_json = model_conv.to_json()
with open("Models/model_conv.json", "w") as json_file:
    json_file.write(model_conv_json)
#model_conv.load_weights("Weights/model_conv.h5")
'''

'\nmodel_conv = create_conv_model()\nmodel_conv.fit(train_x, np.array(train_y), validation_split=0.4, epochs = 3)\nmodel_conv.save("Weights/model_conv.h5")\nmodel_conv_json = model_conv.to_json()\nwith open("Models/model_conv.json", "w") as json_file:\n    json_file.write(model_conv_json)\n#model_conv.load_weights("Weights/model_conv.h5")\n'

Code to load the model and weights:

In [25]:
# load json and create model
json_file = open('Models/model_conv.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Weights/model_conv.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Loaded model from disk


In [26]:
GetMetrics(loaded_model, test_x,test_y)

auc,acc,mcc 0.8934513925380976 0.821875 0.6185393923119542
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      2750
           1       0.81      0.70      0.75      1730

   micro avg       0.82      0.82      0.82      4480
   macro avg       0.82      0.80      0.81      4480
weighted avg       0.82      0.82      0.82      4480



## Use pre-trained Glove word embeddings
In this subsection, I want to use word embeddings from pre-trained Glove. It was trained on a dataset of one billion tokens (words) with a vocabulary of 400 thousand words. The glove has embedding vector sizes, including 50, 100, 200 and 300 dimensions. I chose the 100-dimensional version. I also want to see the model behavior in case the learned word weights do not get updated. I, therefore, set the trainable attribute for the model to be False.

In [27]:
embeddings_index = dict()
f = open('Glove.6B/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

model_glove = Sequential()
model_glove.add(Embedding(vocabulary_size, 100, input_length=train_x.shape[1], weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_glove.fit(train_x, np.array(train_y), validation_split=0.4, epochs = 3)
model_glove.save("Weights/model_glove.h5")
model_glove_json = model_glove.to_json()
with open("Models/model_glove.json", "w") as json_file:
    json_file.write(model_glove_json)
#model_glove.load_weights("Weights/model_glove.h5")

In [28]:
# load json and create model
json_file = open('Models/model_glove.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("Weights/model_glove.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Loaded model from disk


In [29]:
GetMetrics(loaded_model, test_x,test_y)

auc,acc,mcc 0.8109889647924331 0.7158482142857143 0.44726337925398973
              precision    recall  f1-score   support

           0       0.84      0.67      0.74      2750
           1       0.60      0.79      0.68      1730

   micro avg       0.72      0.72      0.72      4480
   macro avg       0.72      0.73      0.71      4480
weighted avg       0.74      0.72      0.72      4480



## Word embedding visualization
In this subsection, I want to visualize word embedding weights obtained from trained models. Word embeddings with 100 dimensions are first reduced to 2 dimensions using t-SNE. Tensorflow has an excellent tool to visualize the embeddings in a great way, but here I just want to visualize the word relationship.

In [30]:
def plot_words(data, start, stop, step):
    trace = go.Scatter(
        x = data[start:stop:step,0], 
        y = data[start:stop:step, 1],
        mode = 'markers',
        text= word_list[start:stop:step]
    )
    layout = dict(title= 't-SNE 1 vs t-SNE 2',
                  yaxis = dict(title='t-SNE 2'),
                  xaxis = dict(title='t-SNE 1'),
                  hovermode= 'closest')
    fig = dict(data = [trace], layout= layout)
    py.iplot(fig)

In [31]:
word_list = []
for word, i in tokenizer.word_index.items():
    word_list.append(word)
lstm_embds = model_lstm.layers[0].get_weights()[0]
conv_embds = model_conv.layers[0].get_weights()[0]
glove_emds = model_glove.layers[0].get_weights()[0]

NameError: name 'model_lstm' is not defined

In [None]:
number_of_words = 2000
lstm_tsne_embds = TSNE(n_components=2).fit_transform(lstm_embds)
plot_words(lstm_tsne_embds, 0, number_of_words, 1)

In [None]:
conv_tsne_embds = TSNE(n_components=2).fit_transform(conv_embds)
plot_words(conv_tsne_embds, 0, number_of_words, 1)

In [None]:
glove_tsne_embds = TSNE(n_components=2).fit_transform(glove_emds)
plot_words(glove_tsne_embds, 0, number_of_words, 1)