In [0]:
import numpy as np 
import pandas as pd 
import os

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [0]:
# The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and 
# statistical natural language processing for English written in the Python programming language.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import GridSearchCV


#TQDM is a progress bar library with good support for nested loops and Jupyter/IPython notebooks.
from tqdm import tqdm

In [0]:
from keras.utils import to_categorical
import random
from tensorflow import set_random_seed
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential

Using TensorFlow backend.


In [0]:
train= pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [0]:
train.shape

(5279, 4)

In [0]:
test.head()

Unnamed: 0,unique_hash,text,drug
0,9e9a8166b84114aca147bf409f6f956635034c08,"256 (previously stable on natalizumab), with 5...",fingolimod
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,On fingolimod and have been since December 201...,fingolimod
2,50b6d851bcff4f35afe354937949e9948975adf7,Apparently it's shingles! :-/ I do have a few ...,humira
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,If the Docetaxel doing once a week x3 weeks th...,tagrisso
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,"CC, Stelara worked in a matter of days for me....",stelara


In [0]:
test.shape

(2924, 3)

In [0]:
def clean_sentences(df):
    reviews = []

    for sent in tqdm(df['text']):
        
        #remove html content
        review_text = BeautifulSoup(sent).get_text()
        
        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
        #tokenize the sentences
        words = word_tokenize(review_text.lower())
    
        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
        reviews.append(lemma_words)

    return(reviews)

In [0]:
train_sentences = clean_sentences(train)
test_sentences = clean_sentences(test)
print(len(train_sentences))
print(len(test_sentences))

100%|██████████| 5279/5279 [00:19<00:00, 276.28it/s]
100%|██████████| 2924/2924 [00:11<00:00, 265.11it/s]

5279
2924





In [0]:
train_sentences[:5]

[['autoimmune',
  'disease',
  'tend',
  'to',
  'come',
  'in',
  'cluster',
  'a',
  'for',
  'gilenya',
  'if',
  'you',
  'feel',
  'good',
  'don',
  't',
  'think',
  'about',
  'it',
  'it',
  'won',
  't',
  'change',
  'anything',
  'but',
  'waste',
  'your',
  'time',
  'and',
  'energy',
  'i',
  'm',
  'taking',
  'tysabri',
  'and',
  'feel',
  'amazing',
  'no',
  'symptom',
  'other',
  'than',
  'dodgy',
  'color',
  'vision',
  'but',
  'i',
  've',
  'had',
  'it',
  'since',
  'always',
  'so',
  'don',
  't',
  'know',
  'and',
  'i',
  'don',
  't',
  'know',
  'if',
  'it',
  'will',
  'last',
  'a',
  'month',
  'a',
  'year',
  'a',
  'decade',
  'ive',
  'just',
  'decided',
  'to',
  'enjoy',
  'the',
  'ride',
  'no',
  'point',
  'in',
  'worrying'],
 ['i',
  'can',
  'completely',
  'understand',
  'why',
  'you',
  'd',
  'want',
  'to',
  'try',
  'it',
  'but',
  'result',
  'reported',
  'in',
  'lecture',
  'don',
  't',
  'always',
  'stand',
  'up',

In [0]:
#nltk.download('punkt')

In [0]:
#nltk.download('wordnet')

In [0]:
target=train.sentiment.values
y_target=to_categorical(target)
num_classes=y_target.shape[1]

In [0]:
y_target.shape[1]

3

In [0]:
X_train,X_val,y_train,y_val=train_test_split(train_sentences,y_target,test_size=0.2,stratify=y_target)

In [0]:
len(X_train)

4083

In [0]:
y_train[:10]

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [0]:
#It is needed for initializing tokenizer of keras and subsequent padding

unique_words = set()
len_max = 0

for i,sent in tqdm(enumerate(X_train)):
    
    unique_words.update(sent)
    
    if(len_max<len(sent) and len(sent)<=1713):
        len_max = len(sent)
        print(len_max)
    elif len(sent)>1713:
      del X_train[i]
      y_train = np.delete(y_train, i, axis = 0)
        
#length of the list of unique_words gives the no of unique words
print(len(list(unique_words)))
print(len_max)

4083it [00:00, 40760.26it/s]

87
1061
1280
1385
1475
1491
1571
1583
1649
1698
1713
30378
1713





In [0]:
type(y_train)

numpy.ndarray

In [0]:
tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(X_train))

#texts_to_sequences(texts)

    # Arguments- texts: list of texts to turn to sequences.
    #Return: list of sequences (one per text input).
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test_sentences)

#padding done to equalize the lengths of all input reviews. LSTM networks needs all inputs to be same length.
#Therefore reviews lesser than max length will be made equal using extra zeros at end. This is padding.

X_train = sequence.pad_sequences(X_train, maxlen=len_max)
X_val = sequence.pad_sequences(X_val, maxlen=len_max)
X_test = sequence.pad_sequences(X_test, maxlen=len_max)

print(X_train.shape,X_val.shape,X_test.shape)

(4083, 1713) (1056, 1713) (2924, 1713)


In [0]:
early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max', monitor='val_acc', patience = 2)
callback = [early_stopping]

In [0]:
#Model using Keras LSTM

#Multilayer Perceptron (MLP) for multi-class softmax classification:
#Let’s build what’s probably the most popular type of model in NLP at the moment: Long Short Term Memory network. 
#This architecture is specially designed to work on sequence data.
#It fits perfectly for many NLP tasks like tagging and text classification.
#It treats the text as a sequence rather than a bag of words or as ngrams.

#Here’s a possible model definition:

model=Sequential()
model.add(Embedding(len(list(unique_words)),300,input_length=len_max))
model.add(LSTM(128,dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
model.add(LSTM(64,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model.add(Dense(100,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.005),metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1713, 300)         9113400   
_________________________________________________________________
lstm_5 (LSTM)                (None, 1713, 128)         219648    
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_7 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
__________

In [0]:
#This is done for learning purpose only. One can play around with different hyper parameters combinations
#and try increase the accuracy even more. For example, a different learning rate, an extra dense layer 
# before output layer, etc. Cross validation could be used to evaluate the model and grid search 
# further to find unique combination of parameters that give maximum accuracy. This model has a validation
#accuracy of around 66.5%
history=model.fit(X_train, y_train, validation_data=(X_val, y_val),epochs=15, batch_size=256, verbose=1)

Train on 4083 samples, validate on 1056 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [0]:
y_pred=model.predict_classes(X_test)

In [0]:
type(y_pred)

numpy.ndarray

In [0]:
sub_file = pd.read_csv('sample_submission.csv')
sub_file.sentiment=y_pred
sub_file.to_csv('Deep_Submission.csv',index=False)
sub_file.head()

Unnamed: 0,unique_hash,sentiment
0,9e9a8166b84114aca147bf409f6f956635034c08,2
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,1
2,50b6d851bcff4f35afe354937949e9948975adf7,2
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,2
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,2


## GRID SEARCH CV

In [0]:
from keras.wrappers.scikit_learn import KerasClassifier

In [0]:
def create_model(epochs=2, batch_size=256,optimizer='SGD',learn_rate=0.001,activation='softmax'):
  model=Sequential()
  model.add(Embedding(len(list(unique_words)),50,input_length=len_max))
  model.add(LSTM(128,dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
  model.add(LSTM(64,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
  model.add(Dense(100,activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes,activation='softmax'))
  model.compile(loss='categorical_crossentropy',optimizer=optimizer, metrics=['accuracy'])
  return model


In [0]:
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10)

In [0]:
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
epochs = [6]
batch_size = [256]
param_grid = dict(epochs = epochs, batch_size = batch_size, optimizer = optimizer, learn_rate = learn_rate, activation = activation)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring = 'accuracy')
grid_result = grid.fit(X_train[:1000], y_train[:1000])



Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
