In [1]:
import numpy as np
import pandas as pd

## Read Data ##

In [2]:
df_train = pd.read_csv('../../data/train.csv', encoding='utf-8')
df_train['id'] = df_train['id'].apply(str)

In [3]:
df_test = pd.read_csv('../../data/test.csv', encoding='utf-8')
df_test['test_id'] = df_test['test_id'].apply(str)

In [4]:
df_all = pd.concat((df_train, df_test))
df_all['question1'].fillna('', inplace=True)
df_all['question2'].fillna('', inplace=True)

## Create Vocab ##

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

In [6]:
counts_vectorizer = CountVectorizer(max_features=10000-1).fit(
    itertools.chain(df_all['question1'], df_all['question2']))
other_index = len(counts_vectorizer.vocabulary_)

##Prep Data##

In [7]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [8]:
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [9]:
def create_padded_seqs(texts, max_len=10):
    seqs = texts.apply(lambda s: 
        [counts_vectorizer.vocabulary_[w] if w in counts_vectorizer.vocabulary_ else other_index
         for w in words_tokenizer.findall(s.lower())])
    return pad_sequences(seqs, maxlen=max_len)

In [10]:
X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
    train_test_split(create_padded_seqs(df_all[df_all['id'].notnull()]['question1']), 
                     create_padded_seqs(df_all[df_all['id'].notnull()]['question2']),
                     df_all[df_all['id'].notnull()]['is_duplicate'].values,
                     stratify=df_all[df_all['id'].notnull()]['is_duplicate'].values,
                     test_size=0.3, random_state=1989)

In [17]:
X1_train

array([[   0, 9789, 4792, ..., 6308, 5987, 4766],
       [   0,    0,    0, ..., 6026, 6517, 4127],
       [   0,    0,    0, ..., 6459, 8151, 3860],
       ..., 
       [   0,    0,    0, ..., 3536, 1155, 3731],
       [   0,    0, 2804, ...,  295, 2646, 6319],
       [   0, 9789, 8985, ..., 7989, 4519, 1572]])

##Training##

In [11]:
import keras.layers as lyr
from keras.models import Model

In [12]:
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

words_embedding_layer = lyr.Embedding(X1_train.max() + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')

seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))

merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
ouput_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 10)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 10)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 10, 100)       1000000     input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 256)           365568      embedding_1[0][0]       

In [13]:
model.fit([X1_train, X2_train], y_train, 
          validation_data=([X1_val, X2_val], y_val), 
          batch_size=128, epochs=6, verbose=2)

Train on 283003 samples, validate on 121287 samples
Epoch 1/6
149s - loss: 0.5227 - val_loss: 0.4820
Epoch 2/6
141s - loss: 0.4457 - val_loss: 0.4520
Epoch 3/6
142s - loss: 0.3963 - val_loss: 0.4345
Epoch 4/6
142s - loss: 0.3539 - val_loss: 0.4306
Epoch 5/6
142s - loss: 0.3144 - val_loss: 0.4405
Epoch 6/6
142s - loss: 0.2765 - val_loss: 0.4506


<keras.callbacks.History at 0x26adb938780>

##Extract Features From Model##

In [14]:
features_model = Model([input1_tensor, input2_tensor], merge_layer)
features_model.compile(loss='mse', optimizer='adam')

In [15]:
F_train = features_model.predict([X1_train, X2_train], batch_size=128)
F_val = features_model.predict([X1_val, X2_val], batch_size=128)

In [16]:
F_train

array([[  0.00000000e+00,   3.63397156e-03,   7.08418250e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,  -6.80528535e-03,   1.60586804e-01, ...,
          0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00],
       [  0.00000000e+00,   4.35364753e-04,  -1.31063564e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  2.88876152e-04,   5.03762249e-06,   7.65537173e-02, ...,
          0.00000000e+00,   1.04935067e-02,   4.71994560e-03],
       [  0.00000000e+00,  -1.22844256e-04,  -6.46804273e-01, ...,
         -0.00000000e+00,  -0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   3.02577575e-07,  -5.11572957e-02, ...,
          0.00000000e+00,  -0.00000000e+00,   0.00000000e+00]], dtype=float32)

##Train XGBoost##

In [19]:
import xgboost as xgb



In [20]:
dTrain = xgb.DMatrix(F_train, label=y_train)
dVal = xgb.DMatrix(F_val, label=y_val)

In [21]:
xgb_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'logloss',
    'eta': 0.1, 
    'max_depth': 9,
    'subsample': 0.9,
    'colsample_bytree': 1 / F_train.shape[1]**0.5,
    'min_child_weight': 5,
    'silent': 1
}
bst = xgb.train(xgb_params, dTrain, 1000,  [(dTrain,'train'), (dVal,'val')], 
                verbose_eval=10, early_stopping_rounds=10)

[0]	train-logloss:0.649218	val-logloss:0.656476
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 10 rounds.
[10]	train-logloss:0.432129	val-logloss:0.491124
[20]	train-logloss:0.349711	val-logloss:0.445954
[30]	train-logloss:0.30302	val-logloss:0.429475
[40]	train-logloss:0.279071	val-logloss:0.426231
[50]	train-logloss:0.258509	val-logloss:0.426764
Stopping. Best iteration:
[45]	train-logloss:0.268251	val-logloss:0.42574



##Predict Test##

In [22]:
X1_test = create_padded_seqs(df_all[df_all['test_id'].notnull()]['question1'])
X2_test = create_padded_seqs(df_all[df_all['test_id'].notnull()]['question2'])

In [23]:
F_test = features_model.predict([X1_test, X2_test], batch_size=128)

In [None]:
F_test

In [None]:
dTest = xgb.DMatrix(F_test)

OSError: exception: access violation reading 0x000000E6A11F0000

In [31]:
df_sub = pd.DataFrame({
        'id': df_all[df_all['id'].notnull()]['id'].values,
        'is_duplicate': df_all[df_all['is_duplicate'].notnull()]['is_duplicate'].values
    })#.set_index('id')

In [32]:
df_sub

Unnamed: 0,id,is_duplicate
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
5,5,1.0
6,6,0.0
7,7,1.0
8,8,0.0
9,9,0.0


In [27]:
df_sub = pd.DataFrame({
        'test_id': df_all[df_all['test_id'].notnull()]['test_id'].values,
        'is_duplicate': bst.predict(dTest, ntree_limit=bst.best_ntree_limit)
    }).set_index('test_id')

NameError: name 'dTest' is not defined

In [None]:
df_sub.head()

In [None]:
df_sub['is_duplicate'].hist(bins=100)