# Quora Duplicate Question Detection: Parallel Encoder Architecture
This model jointly trains a distributed word embedding and a bidirectional LSTM encoder over sequences of those embeddings. The shared embedding and encoder turn each of the two Quora question titles into a fixed-length vector; the two vectors are then concatenated and classified as duplicate or non-duplicate by a two-layer neural network with batch normalization and dropout.

Currently, this performs slightly better than the bag-of-words featurization.

In [66]:
%matplotlib inline
import cPickle as pickle
import heapq
import matplotlib.pyplot as plt
import numpy as np
import re
import sys
import tensorflow as tf

from collections import defaultdict

In [110]:
# Hyperparameters

SEQ_LEN = 20  # tokens per question after padding/truncation; also the model's input length
EMBED_DIM = 100  # output dimension of the shared Embedding layer
ENCODE_DIM = 128  # units of the LSTM wrapped by the Bidirectional encoder
DROPOUT_PARAM = 0.5  # rate passed to the Dropout layer that follows the hidden layer
H_DIM = 256  # units of the ReLU hidden Dense layer

# Training parameters

LR = 1e-3  # learning rate passed to the Adam optimizer
BATCH_SIZE = 256  # batch_size passed to model.fit
EPOCHS = 3  # nb_epoch passed to model.fit

In [100]:
def rawDataGen(fname):
    """Lazily iterate over the data rows of the raw Quora dataset file at fname.

    The first line is read as the tab-separated header. Every following line is
    yielded as a dictionary mapping header names to that row's tab-separated values."""
    with open(fname) as rawFile:
        header = rawFile.readline().strip().split("\t")
        for row in rawFile:
            values = row.strip().split("\t")
            # zip stops at the shorter sequence, so short rows yield fewer keys.
            yield dict(zip(header, values))

def preprocessText(text):
    """Tokenize a question string.

    Converts to lower case, replaces every run of digits with "#", surrounds
    punctuation with spaces so that each mark becomes its own token, and splits
    on whitespace. An ellipsis ("...") is kept as a single token; the marks
    . ! ? : ; , ( ) are each separated individually.

    Returns the list of tokens (empty for empty or whitespace-only text)."""
    out = text.lower()
    out = re.sub(r"\d+", "#", out)
    # The alternation tries the three-dot ellipsis first so it is padded as one
    # unit; a character class alone would split it into three "." tokens.
    out = re.sub(r"(\.\.\.|[().!?:;,])", r" \1 ", out)
    # split() with no argument already collapses runs of whitespace.
    return out.split()

def preprocessData(inFname, maxDocs=None, outFname=None):
    """Read (up to maxDocs) rows from the file at inFname, preprocess the questions,
    and return (q1, q2, label) tuples in a list.

    Rows that lack one of the "question1", "question2" or "is_duplicate" fields,
    or whose label is not an integer, are skipped. If maxDocs is None every row
    is read. If outFname is given, the list is also saved there as a pickle file."""
    preprocessed = list()
    for doc in rawDataGen(inFname):
        # Checked before processing so that maxDocs=0 yields an empty list.
        if maxDocs is not None and len(preprocessed) >= maxDocs:
            break
        try:
            q1 = preprocessText(doc["question1"])
            q2 = preprocessText(doc["question2"])
            label = int(doc["is_duplicate"])
        except (KeyError, ValueError):
            # Some corrupted lines in the input: missing fields or a bad label.
            continue
        preprocessed.append((q1, q2, label))
        # Report only after a successful append so skipped rows do not
        # re-print the same count.
        if len(preprocessed) % 1000 == 0:
            sys.stdout.write("\rPreprocessed %d documents." % len(preprocessed))
            sys.stdout.flush()
    sys.stdout.write("\n")

    if outFname:
        sys.stdout.write("Writing preprocessed data file... ")
        sys.stdout.flush()
        # Pickle data is a byte stream, so the file must be opened in binary mode.
        with open(outFname, "wb") as outF:
            pickle.dump(preprocessed, outF)
        sys.stdout.write("done!\n")
    return preprocessed

def generateVocab(preprocessed, vocabSize=10000, vocabFname=None):
    """Read from the preprocessed questions and generate a vocab.

    preprocessed is an iterable of (q1Tokens, q2Tokens, label) tuples. The vocab
    maps "PAD" to 0, "UNK" to 1, and the vocabSize most frequent tokens to the
    following indices in decreasing order of frequency. If vocabFname is given,
    the vocab is also saved there as a pickle file.

    Returns the vocab dictionary."""
    wordCounts = defaultdict(int)
    sys.stdout.write("Performing word count... ")
    sys.stdout.flush()
    for q1, q2, _ in preprocessed:
        for word in q1 + q2:
            wordCounts[word] += 1
    sys.stdout.write("done!\n")

    sys.stdout.write("Building vocab... ")
    sys.stdout.flush()
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only), and
    # nlargest accepts any iterable, so no intermediate list is needed.
    topNWordCounts = heapq.nlargest(vocabSize, wordCounts.items(), key=lambda wc: wc[1])
    topNWords = [word for word, _ in topNWordCounts]
    allVocabWords = ["PAD", "UNK"] + topNWords
    vocab = {word: idx for idx, word in enumerate(allVocabWords)}
    sys.stdout.write("done!\n")

    if vocabFname:
        sys.stdout.write("Writing vocab file... ")
        sys.stdout.flush()
        # Pickle data is a byte stream, so the file must be opened in binary mode.
        with open(vocabFname, "wb") as vocabF:
            pickle.dump(vocab, vocabF)
        sys.stdout.write("done!\n")
    return vocab

def indexSequence(tokens, vocab):
    """Given a list of tokens and a vocab, return a list of vocab indices corresponding
    to the tokens. Tokens missing from the vocab map to the index of "UNK"."""
    # Test membership on the dict itself: a hash lookup, whereas vocab.keys()
    # builds a list and scans it for every token on Python 2.
    return [vocab[tok] if tok in vocab else vocab["UNK"] for tok in tokens]

def indexDataset(preprocessed, vocab, indexedFname=None):
    """Indexes the questions in the entire dataset using the provided vocab.

    preprocessed is an iterable of (q1Tokens, q2Tokens, label) tuples. Each is
    converted to (q1Indices, q2Indices, label), with the label passed through
    unchanged. If indexedFname is given, the resulting list is also saved there
    as a pickle file.

    Returns the list of indexed samples."""
    indexed = list()
    for q1, q2, label in preprocessed:
        indexedSample = (indexSequence(q1, vocab), indexSequence(q2, vocab), label)
        indexed.append(indexedSample)
        if len(indexed) % 100 == 0:
            sys.stdout.write("\rIndexed %d documents." % len(indexed))
            sys.stdout.flush()
    sys.stdout.write("\n")

    if indexedFname:
        sys.stdout.write("Writing indexed data file... ")
        sys.stdout.flush()
        # Pickle data is a byte stream, so the file must be opened in binary mode.
        with open(indexedFname, "wb") as indexedF:
            pickle.dump(indexed, indexedF)
        sys.stdout.write("done!\n")
    return indexed
        
def prepareArrays(indexed, seqLen, padValue, oneHotLabels=True, dataFname=None):
    """Align all questions to seqLen tokens and return three numpy arrays:
    q1, q2, and labels.

    indexed is an iterable of (q1Indices, q2Indices, label) tuples. Sequences
    longer than seqLen are truncated; shorter ones are right-padded with padValue.
    Labels is one-hot by default ([1, 0] for label 0, [0, 1] for label 1), or a
    single vector of the raw labels if oneHotLabels is False. If dataFname is
    given, the arrays are also written there in numpy NPZ format under the keys
    "q1", "q2" and "labels"."""
    q1List, q2List, labelsList = list(), list(), list()
    for q1Indexed, q2Indexed, label in indexed:
        # A negative repeat count yields [], so over-long sequences get no padding.
        q1Aligned = q1Indexed[:seqLen] + [padValue] * (seqLen - len(q1Indexed))
        q2Aligned = q2Indexed[:seqLen] + [padValue] * (seqLen - len(q2Indexed))
        q1List.append(q1Aligned)
        q2List.append(q2Aligned)
        labelsList.append([int(not label), int(label)] if oneHotLabels else label)
        if len(q1List) % 1000 == 0:
            sys.stdout.write("\rPrepared %d documents for data arrays." % len(q1List))
            sys.stdout.flush()

    sys.stdout.write("\nConstructing arrays... ")
    sys.stdout.flush()
    q1, q2, labels = np.array(q1List), np.array(q2List), np.array(labelsList)
    sys.stdout.write("done! Q1: %s; Q2: %s; labels: %s\n" % (q1.shape, q2.shape, labels.shape))

    if dataFname:
        sys.stdout.write("Writing final data file... ")
        sys.stdout.flush()
        # NPZ is a binary zip archive, so the file must be opened in binary mode.
        with open(dataFname, "wb") as dataF:
            np.savez(dataF,
                     q1=q1,
                     q2=q2,
                     labels=labels)
        sys.stdout.write("done!\n")
    return q1, q2, labels

In [101]:
# Data pipeline

# 1. Tokenize every row of the raw TSV into (q1, q2, label) tuples and pickle the list.
preprocessed = preprocessData("quora_duplicate_questions.tsv", outFname="data/preprocessed.pkl")
# 2. Build the token -> index vocab (default vocabSize, plus the PAD and UNK entries).
vocab = generateVocab(preprocessed, vocabFname="data/vocab.pkl")
# 3. Replace every token with its vocab index (UNK for out-of-vocab tokens).
indexed = indexDataset(preprocessed, vocab, indexedFname="data/indexed.pkl")
# 4. Pad/truncate to SEQ_LEN using the PAD index and build the numpy arrays
#    (labels are one-hot, the prepareArrays default).
q1, q2, labels = prepareArrays(indexed, SEQ_LEN, vocab["PAD"], dataFname="data/data.npz")

Preprocessed 404000 documents.
Writing preprocessed data file... done!
Performing word count... done!
Building vocab... done!
Writing vocab file... done!
Indexed 404300 documents.
Writing indexed data file... done!
Prepared 404000 documents for data arrays.
Constructing arrays... done! Q1: (404340, 20); Q2: (404340, 20); labels: (404340, 2)
Writing final data file... done!


The detailed architecture is as follows:

* Inputs: two 20-length sequences of vocabulary indices
* 100-dimension word embedding
* Bidirectional LSTM-128 encoder
* Concatenation of encoded vectors
* Batch normalization
* FC-256 + ReLU hidden layer
* Dropout w/ probability 0.5
* FC-2 + softmax readout layer

In [111]:
# Model definition and training
# We use Keras' functional API to enable sharing of the embedding layer and BLSTM encoder between
# the two questions.

from keras.layers import Input, Embedding, Bidirectional, LSTM, merge, Dense, Dropout, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam

# One input per question: a length-SEQ_LEN sequence of vocab indices.
q1Input = Input(shape=(SEQ_LEN,))
q2Input = Input(shape=(SEQ_LEN,))

# A single Embedding instance is applied to both inputs so the two questions
# share one embedding table; its input size is len(vocab), which includes the
# PAD and UNK entries.
embed = Embedding(len(vocab), EMBED_DIM, input_length=SEQ_LEN)
q1Embed = embed(q1Input)
q2Embed = embed(q2Input)

# Likewise a single bidirectional LSTM instance encodes both embedded questions.
encoder = Bidirectional(LSTM(ENCODE_DIM))
q1Encoded = encoder(q1Embed)
q2Encoded = encoder(q2Embed)

# Concatenate the two encodings along the last axis, then batch-normalize.
merged = merge([q1Encoded, q2Encoded], mode='concat', concat_axis=-1)
mergedBN = BatchNormalization()(merged)

# Classifier head: ReLU hidden layer, dropout, then a 2-way softmax that
# matches the one-hot labels.
h = Dense(H_DIM, activation="relu")(mergedBN)
hDrop = Dropout(DROPOUT_PARAM)(h)
preds = Dense(2, activation="softmax")(hDrop)

model = Model(input=[q1Input, q2Input], output=preds)
model.compile(optimizer=Adam(LR), loss='categorical_crossentropy', metrics=['accuracy'])

# validation_split=0.05 sets aside 5% of the samples for validation
# (see the "Train on ... validate on ..." line in the cell output).
model.fit([q1, q2], labels, nb_epoch=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.05)

Train on 384123 samples, validate on 20217 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19acb5550>