In [None]:
from __future__ import print_function
import numpy as np
import sys, csv, datetime, time, json
from os.path import expanduser, exists
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import * 
from keras.activations import softmax
from keras.layers.embeddings import Embedding
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.data_utils import get_file
from keras import backend as K
from sklearn.model_selection import train_test_split
from nltk.stem import SnowballStemmer
import re
import nltk
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [None]:
import os.path
import urllib.request  # a bare `import urllib` does not reliably load the `request` submodule

import numpy as np
import pandas as pd

# Download the Quora question-pairs TSV once; later runs of this cell reuse the
# local copy instead of fetching the file again.
if not os.path.exists("quora_duplicate_questions.tsv"):
    urllib.request.urlretrieve("http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv", "quora_duplicate_questions.tsv")

data = pd.read_csv('quora_duplicate_questions.tsv', sep='\t')
# Only the last expression of a cell is displayed, so the column list is printed explicitly.
print("Columns:", list(data.columns))

# Remove samples with NaN in either question column.
dataq1 = data['question1']
dataq2 = data['question2']

q1_nans = np.where(dataq1.isnull())[0]
q2_nans = np.where(dataq2.isnull())[0]
nan_indeces = np.concatenate([q1_nans, q2_nans])
print("Print NAN indices:", nan_indeces)

did = data['id']
# np.where returns row positions; drop() takes index labels. The two coincide
# here as long as read_csv assigned its default 0..n-1 index (no index_col is given).
data = data.drop(nan_indeces)
data = data[['question1', 'question2', 'is_duplicate']]
# head must be *called* for the cell to display the first rows rather than the
# bound-method repr.
data.head()

In [None]:
# Global configuration used by the preprocessing cells.
MAX_SEQUENCE_LENGTH = 100  # `maxlen` used when padding the question sequences
MAX_NB_WORDS = 2000000     # vocabulary cap handed to the Tokenizer as `num_words`

In [None]:
# Read the raw TSV with the csv module and collect the two question columns and
# the label column as three parallel lists of strings.
question1 = []
question2 = []
is_duplicate = []
# newline='' leaves line-break handling to the csv module (needed for quoted
# fields that contain newlines, per the csv documentation). The explicit
# encoding keeps decoding independent of the platform's default locale.
with open('quora_duplicate_questions.tsv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(row['is_duplicate'])
print('Question pairs: %d' % len(question1))


In [None]:
# Build the tokenized word index.
# One tokenizer is fitted on both question columns so they share a vocabulary.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
questions = question1 + question2
tokenizer.fit_on_texts(questions)  # counts the occurrence of each word in the vocab

# Convert every question into its sequence of word indices.
question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (question1, question2)
)
word_index = tokenizer.word_index


In [None]:
# Prepare the word embeddings dictionary: token -> float32 vector, one entry per
# line of the GloVe text file (token followed by space-separated components).
embeddings_index = {}
# The encoding is set explicitly so parsing does not depend on the platform
# default (assumes the GloVe file is UTF-8 text -- confirm for other vector files).
with open("glove.6B.300d.txt", encoding='utf-8') as f:
    for line in f:
        # Drop the trailing newline so the last component is a clean number string.
        values = line.rstrip('\n').split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding
print('Word embeddings: %d' % len(embeddings_index))

In [None]:
# Prepare the word embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. Rows that the loop never assigns stay all-zero.
embedding_dim = 300  # width of the GloVe vectors loaded above
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, embedding_dim))

# Words missing from GloVe get a uniform random vector in [0, 1). A dedicated,
# seeded generator makes those rows identical on every run, so the saved matrix
# and the training results are reproducible.
oov_rng = np.random.RandomState(12345)
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        # Index falls outside the matrix; skip it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector
    else:
        word_embedding_matrix[i] = oov_rng.rand(embedding_dim)


In [None]:
# Prepare training data tensors.
# Labels arrive as strings from the csv reader and are converted to integers.
labels = np.array(is_duplicate, dtype=int)

# Pad both question columns to the common length MAX_SEQUENCE_LENGTH.
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (question1_word_sequences, question2_word_sequences)
)

print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# Persist training and configuration data to files.
# np.save receives the target path directly, so NumPy opens and closes each file
# itself. Passing `open(..., 'wb')` left four file handles that were never closed.
# Every name already ends in ".npy", so no extension is appended.
np.save("Q1_training_data.npy", q1_data)
np.save("Q2_training_data.npy", q2_data)
np.save("label_training_data.npy", labels)
np.save("word_embedding_matrix.npy", word_embedding_matrix)
with open("nb_words.json", 'w') as f:
    json.dump({'nb_words': nb_words}, f)

In [None]:
sys.stdout.flush()
# Partition the dataset into train and test sets (test share = 0.2).
# The two questions are stacked along axis 1 so one split keeps each pair together.
y = labels
X = np.stack((q1_data, q2_data), axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)

# Undo the stacking: slot 0 is question 1, slot 1 is question 2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

print("Q1_train.shape: ", Q1_train.shape)
print("Q2_train.shape :", Q2_train.shape)
print("Q1_test.shape: ", Q1_test.shape)
print("Q2_test.shape :", Q2_test.shape)


In [None]:
# Helper function for defining the attention model.
def unchanged_shape(input_shape):
    """Return ``input_shape`` untouched.

    Used as the ``output_shape`` callback of Lambda layers whose output has
    the same shape as their input.
    """
    return input_shape

In [None]:
# Define the model
# NOTE: `question1` / `question2` are rebound here from the raw question lists to
# Keras input tensors; re-run the data-loading cell before re-running the
# tokenizer cell.
# Sequence length and embedding width are taken from MAX_SEQUENCE_LENGTH and the
# embedding matrix instead of repeating the literals 100 and 300, so the model
# cannot silently fall out of sync with the padded data.
embedding_dim = word_embedding_matrix.shape[1]

question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))
q1 = Embedding(nb_words + 1,
               embedding_dim,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question1)
print("q1 shape :", q1.shape)

# Bidirectional LSTM encoder, 200 units per direction, one output per timestep.
q1 = Bidirectional(LSTM(200, return_sequences=True))(q1)

print("q1 shape :", q1.shape)
q2 = Embedding(nb_words + 1,
               embedding_dim,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question2)
print("q2 shape :", q2.shape)

q2 = Bidirectional(LSTM(200, return_sequences=True))(q2)

# Soft alignment: attention[b, i, j] is the dot product of q1 timestep i and
# q2 timestep j.
attention = Dot(axes=-1)([q1, q2])
# Normalise over the q1 timesteps (axis 1); contracting that axis with q1 below
# gives, for every q2 timestep, a weighted sum of q1 states.
w_att_1 = Lambda(lambda x: softmax(x, axis=1), output_shape=unchanged_shape)(attention)
# Normalise over the q2 timesteps (axis 2) and transpose so the q2 axis comes
# first; contracting with q2 gives, for every q1 timestep, a weighted sum of q2 states.
w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2), output_shape=unchanged_shape)(attention))
q1_aligned = Dot(axes=1)([w_att_1, q1])
q2_aligned = Dot(axes=1)([w_att_2, q2])

In [None]:
# Element-wise difference and product of two tensors, concatenated.
def submult(i1, i2):
    """Return the concatenation of ``i1 - i2`` and ``i1 * i2`` (in that order)."""
    product = Multiply()([i1, i2])
    difference = substract(i1, i2)
    return Concatenate()([difference, product])

# Subtract element-wise.
def substract(i1, i2):
    """Return ``i1 - i2``, built as ``i1 + (-i2)`` from a Lambda and an Add layer."""
    negated_i2 = Lambda(lambda x: -x, output_shape=unchanged_shape)(i2)
    return Add()([i1, negated_i2])

# Apply several layers to the same input and concatenate their outputs.
def apply_multiple(original_input, layers):
    """Call every layer in ``layers`` on ``original_input`` and concatenate the results.

    The outputs are concatenated in the order the layers are given.
    """
    outputs = [layer(original_input) for layer in layers]
    return Concatenate()(outputs)

In [None]:
# Enrich each encoded question with the other question's aligned states plus
# their element-wise difference and product.
q1_combined = concatenate([
    q1,
    q2_aligned,
    submult(q1, q2_aligned),
])
q2_combined = concatenate([
    q2,
    q1_aligned,
    submult(q2, q1_aligned),
])

# A single composition BiLSTM is applied to both questions (shared weights).
compose = Bidirectional(LSTM(200, return_sequences=True))
q1_compare, q2_compare = compose(q1_combined), compose(q2_combined)

# Collapse the time axis with average and max pooling, concatenated.
q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])


In [None]:
# Classifier head: join both question representations, then stack
# BatchNorm -> Dense(elu) stages, with dropout after the 500- and 100-unit stages.
merged = concatenate([q1_rep, q2_rep])
for units, with_dropout in ((1000, False), (500, True), (200, False), (100, True)):
    merged = BatchNormalization()(merged)
    merged = Dense(units, activation='elu')(merged)
    if with_dropout:
        merged = Dropout(0.1)(merged)
# Final normalisation before the output layer.
merged = BatchNormalization()(merged)

In [None]:
# Output: one sigmoid unit on top of the classifier head.
# NOTE: this rebinds `is_duplicate`, which earlier cells use for the raw label list.
output_layer = Dense(1, activation='sigmoid')
is_duplicate = output_layer(merged)

In [None]:
def function(y_true, y_pred):
    """F1 score of rounded predictions, computed with Keras backend ops.

    Args:
        y_true: ground-truth labels (clipped to [0, 1] and rounded before counting).
        y_pred: predicted scores (clipped to [0, 1] and rounded before counting).

    Returns:
        ``2 * precision * recall / (precision + recall)`` as a backend scalar.
        Every denominator is padded with ``K.epsilon()``, so the result is 0
        rather than NaN when there are no true positives.

    NOTE(review): when used as a Keras metric this is presumably evaluated per
    batch and averaged, so it only approximates the dataset-level F1 -- confirm.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    # The original divided by (precision + recall) alone, which is 0/0 = NaN
    # whenever both are zero and would poison the epoch's averaged metric.
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [None]:
# Build, compile and train the model.
model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', function])
print("Starting training at", datetime.datetime.now())
sys.stdout.flush()
t0 = time.time()
# The checkpoint is only ever consumed through model.load_weights(), so only the
# weights are stored. This keeps the save and load formats consistent and avoids
# serialising the optimizer state and the Lambda layers on every improvement.
callbacks = [ModelCheckpoint("question_pairs_weights_attention",
                             monitor='val_acc',
                             save_best_only=True,
                             save_weights_only=True)]
# validation_split carves the validation set out of the training partition.
history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=20,
                    validation_split=0.2,
                    verbose=2,
                    batch_size=500,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

In [None]:
# Print best validation accuracy and associated epoch
max_acc, idx = max((val, idx) for (idx, val) in enumerate(history.history['val_acc']))
print('Maximum validation accuracy = {0:.4f} (epoch {1:d})'.format(max_acc, idx+1))

# Evaluate the model with best validation accuracy on the test partition.
model.load_weights("question_pairs_weights_attention")
# Evaluate once and reuse the result; the original ran the full test pass twice.
test_metrics = model.evaluate([Q1_test, Q2_test], y_test, verbose=0)
# The third value is bound to `f1`, not `function`: rebinding `function` replaced
# the metric callable with a float and broke any re-run of the compile cell.
loss, accuracy, f1 = test_metrics
print(test_metrics)
sys.stdout.flush()
