# Load data

In [0]:
import numpy as np
import pandas as pd
import datetime, time, json
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, dot, Flatten, Dense, Reshape, add, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.model_selection import train_test_split

%matplotlib inline

Using TensorFlow backend.


In [0]:
# colab configuration: attach Google Drive to this runtime so the data and
# weight files referenced below under '/content/drive' are reachable.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Load the preprocessed data, embedding matrix and word count.
#
# The file paths are handed directly to np.load so NumPy opens and closes each
# file itself; wrapping them in open(...) without a `with` block left four
# file handles open for the lifetime of the kernel.
data_dir = '/content/drive/My Drive/NLP/Data'

q1_data = np.load(data_dir + '/q1_train.npy')
q2_data = np.load(data_dir + '/q2_train.npy')
labels = np.load(data_dir + '/label_train.npy')
word_embedding_matrix = np.load(data_dir + '/word_embedding_matrix.npy')

# nb_words sizes the Embedding layers below (they use nb_words + 1 rows).
with open(data_dir + '/nb_words.json', 'r') as f:
    nb_words = json.load(f)['nb_words']

In [0]:
# Pair the two question arrays along a new axis 1 so that a single
# train_test_split call keeps each question pair aligned with its label.
X = np.stack([q1_data, q2_data], axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1234)

# Unpack the paired axis back into one array per question.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

# Without Batch Normalization

In [0]:
# Baseline model: two question branches (frozen embedding -> summed BiLSTM),
# a dot-product interaction between them, and a plain dense classifier with
# neither dropout nor batch normalization.
question1 = Input(shape=(25,))
question2 = Input(shape=(25,))

# Question 1 branch: embedding weights come from word_embedding_matrix and
# are kept fixed (trainable=False).
q1 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question1)
q1 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q1)

# Question 2 branch: same configuration, but its own layer instances.
q2 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question2)
q2 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q2)

# Contract the time axis (axis 1) of both encodings, then project the
# flattened result back to (25, 128) so it can be added to q1 and q2.
attention = dot([q1, q2], axes=[1, 1])
attention = Flatten()(attention)
attention = Dense(25 * 128)(attention)
attention = Reshape((25, 128))(attention)

merged = add([q1, attention, q2])
merged = Flatten()(merged)
# Four identical fully connected ReLU layers.
for units in (200, 200, 200, 200):
    merged = Dense(units, activation='relu')(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model_wb = Model(inputs=[question1, question2], outputs=is_duplicate)
model_wb.compile(loss='binary_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# Checkpoint the model to Drive whenever validation accuracy improves.
checkpoint_path = '/content/drive/My Drive/NLP/weights/model_wb.h5'
callbacks = [
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True),
]

# Train for 25 epochs; validation_split holds out 10% of the training arrays.
history_wb = model_wb.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=516,
    epochs=25,
    verbose=2,
    callbacks=callbacks,
    validation_split=0.1,
)



Train on 327474 samples, validate on 36387 samples
Epoch 1/25
 - 134s - loss: 0.5043 - acc: 0.7493 - val_loss: 0.4384 - val_acc: 0.7885
Epoch 2/25
 - 131s - loss: 0.4111 - acc: 0.8010 - val_loss: 0.4162 - val_acc: 0.7967
Epoch 3/25
 - 133s - loss: 0.3601 - acc: 0.8304 - val_loss: 0.4064 - val_acc: 0.8090
Epoch 4/25
 - 132s - loss: 0.3115 - acc: 0.8575 - val_loss: 0.4066 - val_acc: 0.8149
Epoch 5/25
 - 131s - loss: 0.2657 - acc: 0.8817 - val_loss: 0.4416 - val_acc: 0.8185
Epoch 6/25
 - 131s - loss: 0.2210 - acc: 0.9043 - val_loss: 0.4487 - val_acc: 0.8217
Epoch 7/25
 - 131s - loss: 0.1805 - acc: 0.9238 - val_loss: 0.4928 - val_acc: 0.8207
Epoch 8/25
 - 128s - loss: 0.1483 - acc: 0.9395 - val_loss: 0.5713 - val_acc: 0.8249
Epoch 9/25
 - 130s - loss: 0.1215 - acc: 0.9523 - val_loss: 0.6015 - val_acc: 0.8233
Epoch 10/25
 - 128s - loss: 0.0997 - acc: 0.9611 - val_loss: 0.6468 - val_acc: 0.8218
Epoch 11/25
 - 128s - loss: 0.0847 - acc: 0.9674 - val_loss: 0.7843 - val_acc: 0.8232
Epoch 12/2

# With Dropout 0.5

In [0]:
# Variant: same encoder and interaction as the baseline, but every hidden
# dense layer is followed by Dropout(0.5) and then BatchNormalization.
question1 = Input(shape=(25,))
question2 = Input(shape=(25,))

# Question 1 branch: fixed (non-trainable) embedding, then a summed BiLSTM.
q1 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question1)
q1 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q1)

# Question 2 branch: same configuration, separate layer instances.
q2 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question2)
q2 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q2)

# Contract the time axis of both encodings and project the flattened result
# back to (25, 128) so it can be summed with q1 and q2.
attention = dot([q1, q2], axes=[1, 1])
attention = Flatten()(attention)
attention = Dense(25 * 128)(attention)
attention = Reshape((25, 128))(attention)

merged = add([q1, attention, q2])
merged = Flatten()(merged)
# Four Dense -> Dropout -> BatchNormalization stages, created in that order.
for units in (200, 200, 200, 200):
    merged = Dense(units, activation='relu')(merged)
    merged = Dropout(0.5)(merged)
    merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model_drop_5 = Model(inputs=[question1, question2], outputs=is_duplicate)
model_drop_5.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

In [0]:
# Checkpoint the model to Drive whenever validation accuracy improves.
checkpoint_path = '/content/drive/My Drive/NLP/weights/model_drop_5.h5'
callbacks = [
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True),
]

# Train for 25 epochs; validation_split holds out 10% of the training arrays.
history_drop_5 = model_drop_5.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=516,
    epochs=25,
    verbose=2,
    callbacks=callbacks,
    validation_split=0.1,
)

Train on 327474 samples, validate on 36387 samples
Epoch 1/25
 - 142s - loss: 0.6113 - acc: 0.6608 - val_loss: 0.6393 - val_acc: 0.6112
Epoch 2/25
 - 135s - loss: 0.4979 - acc: 0.7530 - val_loss: 0.4842 - val_acc: 0.7601
Epoch 3/25
 - 135s - loss: 0.4400 - acc: 0.7881 - val_loss: 0.4355 - val_acc: 0.7914
Epoch 4/25
 - 135s - loss: 0.3983 - acc: 0.8116 - val_loss: 0.4134 - val_acc: 0.8016
Epoch 5/25
 - 135s - loss: 0.3597 - acc: 0.8346 - val_loss: 0.3972 - val_acc: 0.8119
Epoch 6/25
 - 135s - loss: 0.3216 - acc: 0.8547 - val_loss: 0.3948 - val_acc: 0.8204
Epoch 7/25
 - 135s - loss: 0.2826 - acc: 0.8755 - val_loss: 0.4169 - val_acc: 0.8185
Epoch 8/25
 - 132s - loss: 0.2455 - acc: 0.8954 - val_loss: 0.4307 - val_acc: 0.8193
Epoch 9/25
 - 133s - loss: 0.2104 - acc: 0.9125 - val_loss: 0.4504 - val_acc: 0.8238
Epoch 10/25
 - 137s - loss: 0.1795 - acc: 0.9276 - val_loss: 0.4959 - val_acc: 0.8258
Epoch 11/25
 - 137s - loss: 0.1528 - acc: 0.9395 - val_loss: 0.5418 - val_acc: 0.8268
Epoch 12/25


# With Dense 50

In [0]:
# Variant: hidden dense layers narrowed to 50 units, each followed by
# BatchNormalization; encoder and interaction are as in the other models.
question1 = Input(shape=(25,))
question2 = Input(shape=(25,))

# Question 1 branch: fixed (non-trainable) embedding, then a summed BiLSTM.
q1 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question1)
q1 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q1)

# Question 2 branch: same configuration, separate layer instances.
q2 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question2)
q2 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q2)

# Contract the time axis of both encodings and project the flattened result
# back to (25, 128) so it can be summed with q1 and q2.
attention = dot([q1, q2], axes=[1, 1])
attention = Flatten()(attention)
attention = Dense(25 * 128)(attention)
attention = Reshape((25, 128))(attention)

merged = add([q1, attention, q2])
merged = Flatten()(merged)
# Four Dense(50) -> BatchNormalization stages.
for units in (50, 50, 50, 50):
    merged = Dense(units, activation='relu')(merged)
    merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model_dense_50 = Model(inputs=[question1, question2], outputs=is_duplicate)
model_dense_50.compile(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])

In [0]:
# Checkpoint the model to Drive whenever validation accuracy improves.
checkpoint_path = '/content/drive/My Drive/NLP/weights/model_dense_50.h5'
callbacks = [
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True),
]

# Train for 25 epochs; validation_split holds out 10% of the training arrays.
history_dense_50 = model_dense_50.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=516,
    epochs=25,
    verbose=2,
    callbacks=callbacks,
    validation_split=0.1,
)

Train on 327474 samples, validate on 36387 samples
Epoch 1/25
 - 144s - loss: 0.5200 - acc: 0.7326 - val_loss: 0.4891 - val_acc: 0.7691
Epoch 2/25
 - 139s - loss: 0.4419 - acc: 0.7832 - val_loss: 0.4368 - val_acc: 0.7849
Epoch 3/25
 - 140s - loss: 0.3983 - acc: 0.8084 - val_loss: 0.4158 - val_acc: 0.7982
Epoch 4/25
 - 142s - loss: 0.3592 - acc: 0.8315 - val_loss: 0.4007 - val_acc: 0.8097
Epoch 5/25
 - 139s - loss: 0.3190 - acc: 0.8534 - val_loss: 0.4327 - val_acc: 0.8044
Epoch 6/25
 - 137s - loss: 0.2782 - acc: 0.8757 - val_loss: 0.4063 - val_acc: 0.8194
Epoch 7/25
 - 137s - loss: 0.2368 - acc: 0.8969 - val_loss: 0.4276 - val_acc: 0.8216
Epoch 8/25
 - 136s - loss: 0.1976 - acc: 0.9163 - val_loss: 0.4571 - val_acc: 0.8206
Epoch 9/25
 - 132s - loss: 0.1618 - acc: 0.9330 - val_loss: 0.4846 - val_acc: 0.8246
Epoch 10/25
 - 134s - loss: 0.1333 - acc: 0.9461 - val_loss: 0.5372 - val_acc: 0.8192
Epoch 11/25
 - 132s - loss: 0.1084 - acc: 0.9570 - val_loss: 0.5961 - val_acc: 0.8250
Epoch 12/25


# With Dense 300

In [0]:
# Variant: hidden dense layers widened to 300 units, each followed by
# BatchNormalization; encoder and interaction are as in the other models.
question1 = Input(shape=(25,))
question2 = Input(shape=(25,))

# Question 1 branch: fixed (non-trainable) embedding, then a summed BiLSTM.
q1 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question1)
q1 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q1)

# Question 2 branch: same configuration, separate layer instances.
q2 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=25,
               trainable=False)(question2)
q2 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q2)

# Contract the time axis of both encodings and project the flattened result
# back to (25, 128) so it can be summed with q1 and q2.
attention = dot([q1, q2], axes=[1, 1])
attention = Flatten()(attention)
attention = Dense(25 * 128)(attention)
attention = Reshape((25, 128))(attention)

merged = add([q1, attention, q2])
merged = Flatten()(merged)
# Four Dense(300) -> BatchNormalization stages.
for units in (300, 300, 300, 300):
    merged = Dense(units, activation='relu')(merged)
    merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model_dense_300 = Model(inputs=[question1, question2], outputs=is_duplicate)
model_dense_300.compile(loss='binary_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])

In [0]:
# Checkpoint the model to Drive whenever validation accuracy improves.
checkpoint_path = '/content/drive/My Drive/NLP/weights/model_dense_300.h5'
callbacks = [
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True),
]

# Train for 25 epochs; validation_split holds out 10% of the training arrays.
history_dense_300 = model_dense_300.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=516,
    epochs=25,
    verbose=2,
    callbacks=callbacks,
    validation_split=0.1,
)

Train on 327474 samples, validate on 36387 samples
Epoch 1/25
 - 138s - loss: 0.5195 - acc: 0.7344 - val_loss: 0.4694 - val_acc: 0.7707
Epoch 2/25
 - 134s - loss: 0.4373 - acc: 0.7853 - val_loss: 0.4321 - val_acc: 0.7875
Epoch 3/25
 - 133s - loss: 0.3888 - acc: 0.8139 - val_loss: 0.4105 - val_acc: 0.8012
Epoch 4/25
 - 134s - loss: 0.3455 - acc: 0.8381 - val_loss: 0.4114 - val_acc: 0.8062
Epoch 5/25
 - 133s - loss: 0.3027 - acc: 0.8627 - val_loss: 0.4043 - val_acc: 0.8132
Epoch 6/25
 - 134s - loss: 0.2579 - acc: 0.8861 - val_loss: 0.4296 - val_acc: 0.8159
Epoch 7/25
 - 134s - loss: 0.2153 - acc: 0.9074 - val_loss: 0.4281 - val_acc: 0.8222
Epoch 8/25
 - 134s - loss: 0.1775 - acc: 0.9255 - val_loss: 0.4858 - val_acc: 0.8260
Epoch 9/25
 - 135s - loss: 0.1446 - acc: 0.9406 - val_loss: 0.5519 - val_acc: 0.8208
Epoch 10/25
 - 133s - loss: 0.1173 - acc: 0.9532 - val_loss: 0.5747 - val_acc: 0.8210
Epoch 11/25
 - 134s - loss: 0.0965 - acc: 0.9617 - val_loss: 0.6617 - val_acc: 0.8236
Epoch 12/25


# Maximum sequence length of 40

In [0]:
# adjust the preprocessing

# set drive path
import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

In [0]:
# Read the raw training CSV from Drive, then replace missing values with
# empty strings.
path = "/content/drive/My Drive/NLP/Data/train.csv"
df = pd.read_csv(path)
df = df.fillna(value="")

In [0]:
# Re-tokenize the raw questions and pad them to 40 tokens instead of 25.
# NOTE(review): the models below still use the previously loaded
# word_embedding_matrix, so this assumes the tokenizer fitted here yields the
# same word index as the one used to build that matrix -- TODO confirm.
question1 = list(df['question1'])
question2 = list(df['question2'])

# Fit on both question columns so they share a single vocabulary.
questions = question1 + question2
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(questions)

question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)

q1_data = pad_sequences(question1_word_sequences, maxlen=40)
q2_data = pad_sequences(question2_word_sequences, maxlen=40)

# Pass the path to np.save so NumPy opens, writes and closes each file itself.
# The previous open(..., 'wb') handles were never closed, so the data was not
# guaranteed to be flushed to Drive before being read back.
np.save('/content/drive/My Drive/NLP/Data/q1_train_40.npy', q1_data)
np.save('/content/drive/My Drive/NLP/Data/q2_train_40.npy', q2_data)

In [0]:
# Reload the length-40 padded sequences from Drive.
# The paths go directly to np.load so no file handle is left open (the
# previous open(..., 'rb') objects were never closed).
q1_data = np.load('/content/drive/My Drive/NLP/Data/q1_train_40.npy')
q2_data = np.load('/content/drive/My Drive/NLP/Data/q2_train_40.npy')

In [0]:
# Pair the length-40 question arrays along a new axis 1 and redo the split
# with the same seed and test fraction as the earlier split; `labels` is the
# array loaded near the top of the notebook.
X = np.stack([q1_data, q2_data], axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1234)

# Unpack the paired axis back into one array per question.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

In [0]:
# Variant: inputs are 40 tokens long instead of 25; hidden dense layers have
# 200 units, each followed by BatchNormalization.
question1 = Input(shape=(40,))
question2 = Input(shape=(40,))

# Question 1 branch: fixed (non-trainable) embedding, then a summed BiLSTM.
q1 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=40,
               trainable=False)(question1)
q1 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q1)

# Question 2 branch: same configuration, separate layer instances.
q2 = Embedding(input_dim=nb_words + 1,
               output_dim=300,
               weights=[word_embedding_matrix],
               input_length=40,
               trainable=False)(question2)
q2 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q2)

# Contract the time axis of both encodings and project the flattened result
# back to (40, 128) so it can be summed with q1 and q2.
attention = dot([q1, q2], axes=[1, 1])
attention = Flatten()(attention)
attention = Dense(40 * 128)(attention)
attention = Reshape((40, 128))(attention)

merged = add([q1, attention, q2])
merged = Flatten()(merged)
# Four Dense(200) -> BatchNormalization stages.
for units in (200, 200, 200, 200):
    merged = Dense(units, activation='relu')(merged)
    merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model_max_40 = Model(inputs=[question1, question2], outputs=is_duplicate)
model_max_40.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

In [0]:
# Checkpoint the model to Drive whenever validation accuracy improves.
checkpoint_path = '/content/drive/My Drive/NLP/weights/model_max_40.h5'
callbacks = [
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True),
]

# Train for 25 epochs; validation_split holds out 10% of the training arrays.
history_max_40 = model_max_40.fit(
    x=[Q1_train, Q2_train],
    y=y_train,
    batch_size=516,
    epochs=25,
    verbose=2,
    callbacks=callbacks,
    validation_split=0.1,
)

Train on 327474 samples, validate on 36387 samples
Epoch 1/25
 - 225s - loss: 0.5137 - acc: 0.7366 - val_loss: 0.4541 - val_acc: 0.7762
Epoch 2/25
 - 219s - loss: 0.4293 - acc: 0.7901 - val_loss: 0.4224 - val_acc: 0.7934
Epoch 3/25
 - 217s - loss: 0.3820 - acc: 0.8186 - val_loss: 0.4159 - val_acc: 0.8100
Epoch 4/25
 - 219s - loss: 0.3363 - acc: 0.8444 - val_loss: 0.4000 - val_acc: 0.8103
Epoch 5/25
 - 218s - loss: 0.2907 - acc: 0.8690 - val_loss: 0.4162 - val_acc: 0.8132
Epoch 6/25
 - 219s - loss: 0.2443 - acc: 0.8933 - val_loss: 0.4379 - val_acc: 0.8184
Epoch 7/25
 - 219s - loss: 0.2004 - acc: 0.9148 - val_loss: 0.4780 - val_acc: 0.8198
Epoch 8/25
 - 220s - loss: 0.1618 - acc: 0.9328 - val_loss: 0.5088 - val_acc: 0.8186
Epoch 9/25
 - 215s - loss: 0.1310 - acc: 0.9468 - val_loss: 0.5489 - val_acc: 0.8213
Epoch 10/25
 - 219s - loss: 0.1057 - acc: 0.9583 - val_loss: 0.6226 - val_acc: 0.8187
Epoch 11/25
 - 217s - loss: 0.0868 - acc: 0.9665 - val_loss: 0.6086 - val_acc: 0.8228
Epoch 12/25


In [0]:
# Report the best validation accuracy and the (1-based) epoch it occurred in.
val_acc_per_epoch = history_max_40.history['val_acc']

# Rank epochs by (accuracy, index) so that a tie resolves to the later epoch.
idx = max(range(len(val_acc_per_epoch)),
          key=lambda epoch: (val_acc_per_epoch[epoch], epoch))
max_val_acc = val_acc_per_epoch[idx]

print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 19 = 0.8274
