In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Quora question-pair training data; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    '../Data/quora_duplicate_questions/train.csv'
)

In [3]:
# Sanity check of the freshly loaded frame: its dimensions, then its column
# names, one per output line.
print(
    "df.shape = {}".format(df.shape),
    "columns = {}".format(df.columns.tolist()),
    sep="\n"
)

df.shape = (404290, 6)
columns = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']


In [4]:
"""
    How many pairs are labeled as duplicate ?
"""
# Class balance of the target column: number of pairs per is_duplicate label.
df.loc[:, 'is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [5]:
# Column names of the training frame as a plain Python list.
list(df.columns)

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

In [6]:
"""
    How many unique question ids ?
    How many in the intersection of qid1 and qid2 ?
    How many in the union of qid1 and qid2 ?
"""
# Materialise each id column as a set once and reuse both sets for the
# overlap and the union counts.
qid1_ids, qid2_ids = set(df['qid1']), set(df['qid2'])

print("num_unique_qid1 = {} num_unique_qid2 = {}".format(df['qid1'].nunique(), df['qid2'].nunique()))
print("num common qids = {}".format(len(qid1_ids & qid2_ids)))
print("total num_unique_qids = {}".format(len(qid1_ids | qid2_ids)))

num_unique_qid1 = 290654 num_unique_qid2 = 299364
num common qids = 52085
total num_unique_qids = 537933


In [7]:
# Discard every row that has a missing value in any column. The result is
# bound back to `df`, so the unfiltered frame is no longer reachable after
# this cell runs.
df = df.dropna(axis=0, how='any')

In [8]:
# Join the two questions of each pair into one string, separated by a single
# space; this combined text is what the vocabulary is fitted on.
df['text'] = df['question1'].str.cat(df['question2'], sep=" ")

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

In [10]:
# Fit a bag-of-words vocabulary capped at 10000-1 terms, which leaves one
# further id free for out-of-vocabulary words.
count_vec = CountVectorizer(max_features=10000-1)
count_vec.fit(df['text'])

# Id assigned to any token that is not in the fitted vocabulary: it is the
# number of fitted terms, i.e. one more than the largest id a 0-based
# vocabulary can contain.
other_index = len(count_vec.vocabulary_)

In [11]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [12]:
# Compile the same token regex the vectorizer was configured with, so it can
# be applied to individual question strings.
words_tokenizer = re.compile(pattern=count_vec.token_pattern)

In [13]:
def create_padded_seqs(texts, max_len=10):
    """Encode a Series of texts as a fixed-width matrix of word ids.

    Each text is lower-cased, split with the module-level ``words_tokenizer``
    regex and mapped through ``count_vec.vocabulary_``; tokens missing from
    the vocabulary receive ``other_index``. The resulting variable-length id
    lists are passed to ``pad_sequences`` (with its default padding and
    truncation settings) so every row has exactly ``max_len`` entries.

    Parameters
    ----------
    texts : pandas.Series
        One text per entry. Missing entries (NaN / None) are encoded as an
        empty sequence instead of raising on ``.lower()``.
    max_len : int, default 10
        Width of the returned matrix, forwarded as ``maxlen``.

    Returns
    -------
    The array produced by ``pad_sequences``, one row per entry of ``texts``.
    """
    # NOTE(review): pad_sequences fills with 0 by default, and 0 is also a
    # valid id in a 0-based vocabulary, so padding and that one term are
    # indistinguishable downstream - confirm whether ids should be shifted.
    vocabulary = count_vec.vocabulary_

    def to_indices(text):
        # Missing values have no .lower(); encode them as "no tokens".
        if pd.isnull(text):
            return []
        return [vocabulary.get(token, other_index)
                for token in words_tokenizer.findall(text.lower())]

    return pad_sequences(texts.apply(to_indices), maxlen=max_len)

In [14]:
# Encode both question columns and pull out the labels first, then split the
# three aligned arrays together so each pair stays matched with its label.
q1_seqs = create_padded_seqs(df['question1'])
q2_seqs = create_padded_seqs(df['question2'])
labels = df['is_duplicate'].values

# 70/30 split, stratified on the label, with a fixed seed for repeatability.
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    q1_seqs, q2_seqs, labels,
    test_size=0.3,
    stratify=labels,
    random_state=1989)


In [17]:
import keras.layers as layer
from keras.models import Model

# One integer-sequence input per question of the pair.
input1_tensor = layer.Input(X1_train.shape[1:])
input2_tensor = layer.Input(X2_train.shape[1:])

# Size the embedding table from the vocabulary rather than from the largest id
# that happens to occur in X1_train: create_padded_seqs emits ids in
# [0, other_index], so other_index + 1 rows cover every id that can appear in
# either input and in any split.
words_embedding_layer = layer.Embedding(other_index + 1, 100)
seq_embedding_layer = layer.LSTM(256, activation='tanh')


def seq_embedding(tensor):
    """Embed a sequence of word ids and encode it with the shared LSTM."""
    return seq_embedding_layer(words_embedding_layer(tensor))


# Both questions go through the same embedding and LSTM layer objects; their
# encodings are combined by element-wise multiplication.
merge_layer = layer.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = layer.Dense(16, activation='sigmoid')(merge_layer)
# Single sigmoid unit: output for the binary is_duplicate target.
ouput_layer = layer.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 10, 100)      1000000     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 256)          365568      embedding_2[0][0]                
          

In [18]:
# Train the pair classifier, reporting loss and accuracy on the held-out
# validation split after every epoch (verbose=2: one line per epoch).
model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    batch_size=128,
    epochs=6,
    verbose=2,
    validation_data=([X1_val, X2_val], y_val))

Train on 283000 samples, validate on 121287 samples
Epoch 1/6
 - 289s - loss: 0.5081 - acc: 0.7541 - val_loss: 0.4718 - val_acc: 0.7777
Epoch 2/6
 - 284s - loss: 0.4309 - acc: 0.8009 - val_loss: 0.4371 - val_acc: 0.7978
Epoch 3/6
 - 285s - loss: 0.3798 - acc: 0.8290 - val_loss: 0.4261 - val_acc: 0.8070
Epoch 4/6
 - 284s - loss: 0.3346 - acc: 0.8536 - val_loss: 0.4207 - val_acc: 0.8134
Epoch 5/6
 - 282s - loss: 0.2920 - acc: 0.8754 - val_loss: 0.4291 - val_acc: 0.8178
Epoch 6/6
 - 282s - loss: 0.2513 - acc: 0.8960 - val_loss: 0.4459 - val_acc: 0.8185


<keras.callbacks.History at 0x7fafd3a784e0>

In [19]:
# Second model over the same inputs and layers, cut off at the merged LSTM
# representation so it outputs the pair features instead of a probability.
features_model = Model(inputs=[input1_tensor, input2_tensor],
                       outputs=merge_layer)
# Compiled with an MSE loss; in this notebook the model is only used through
# predict().
features_model.compile(optimizer='adam', loss='mse')

In [20]:
# Run both splits through the feature extractor, training split first.
F_train, F_val = (
    features_model.predict(question_pair, batch_size=128)
    for question_pair in ([X1_train, X2_train], [X1_val, X2_val])
)

In [22]:
import xgboost as xgb

# Wrap the extracted features and their labels in XGBoost's DMatrix container.
dTrain = xgb.DMatrix(data=F_train, label=y_train)
dVal = xgb.DMatrix(data=F_val, label=y_val)

xgb_params = dict(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric='logloss',
    eta=0.1,
    max_depth=9,
    subsample=0.9,
    # Each tree samples a fraction of 1/sqrt(n_features) of the columns.
    colsample_bytree=1 / F_train.shape[1]**0.5,
    min_child_weight=5,
    silent=1,
)

# Up to 1000 boosting rounds, logging every 10th. The validation set is listed
# last in `evals`, and it is the one early stopping monitors (patience of 10).
bst = xgb.train(
    params=xgb_params,
    dtrain=dTrain,
    num_boost_round=1000,
    evals=[(dTrain, 'train'), (dVal, 'val')],
    early_stopping_rounds=10,
    verbose_eval=10)



[0]	train-logloss:0.654243	val-logloss:0.660035
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 10 rounds.
[10]	train-logloss:0.430273	val-logloss:0.48743
[20]	train-logloss:0.348252	val-logloss:0.43799
[30]	train-logloss:0.295311	val-logloss:0.417279
[40]	train-logloss:0.266981	val-logloss:0.411906
[50]	train-logloss:0.241197	val-logloss:0.410289
[60]	train-logloss:0.224583	val-logloss:0.411725
Stopping. Best iteration:
[51]	train-logloss:0.239351	val-logloss:0.410122



In [None]:
# NOTE(review): df_all is not defined in any cell visible in this notebook; it
# must exist (with 'test_id', 'question1' and 'question2' columns) before this
# cell can run on a fresh kernel.
# Select the test rows once instead of re-evaluating the same filter for every
# column that is needed.
test_rows = df_all[df_all['test_id'].notnull()]

# Unlike the training frame, these rows have not been through dropna() and
# cannot be dropped (every test_id needs a prediction), so missing questions
# are replaced by empty strings before tokenisation.
X1_test = create_padded_seqs(test_rows['question1'].fillna(''))
X2_test = create_padded_seqs(test_rows['question2'].fillna(''))

# Same two-stage pipeline as for training: LSTM pair features, then XGBoost.
F_test = features_model.predict([X1_test, X2_test], batch_size=128)

dTest = xgb.DMatrix(F_test)

# Predict with the trees up to the best early-stopping iteration only.
df_sub = pd.DataFrame({
        'test_id': test_rows['test_id'].values,
        'is_duplicate': bst.predict(dTest, ntree_limit=bst.best_ntree_limit)
    }).set_index('test_id')