In [1]:
'''
Single model may achieve LB scores at around 0.29+ ~ 0.30+
Average ensembles can easily get 0.28+ or less
Don't need to be an expert of feature engineering
All you need is a GPU!!!!!!!

The code is tested on Keras 2.0.0 using Tensorflow backend, and Python 2.7

According to experiments by kagglers, Theano backend with GPU may give bad LB scores while
        the val_loss seems to be fine, so try Tensorflow backend first please
'''

########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

########################################
## set directories and parameters
########################################
BASE_DIR = '../data/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'#自行下载
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

########################################
## process texts in datasets
########################################
print('Processing text dataset')

Indexing word vectors
Found 3000000 word vectors of word2vec
Processing text dataset


In [3]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [4]:
texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))#文本清洗，转换成词序列
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

Found 404290 texts in train.csv
Found 2345796 texts in test.csv


In [5]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)#分词器
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)#训练集+测试集所有词作为用以训练的文本列表

In [6]:
sequences_1 = tokenizer.texts_to_sequences(texts_1)#训练数据的question1集合=>向量
sequences_2 = tokenizer.texts_to_sequences(texts_2)#训练数据的question2集合=>向量
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

In [7]:
texts_1

['what is the step by step guide to invest in share market in india ',
 'what is the story of kohinoor koh - i - noor diamond ',
 'how can i increase the speed of my internet connection while using a vpn ',
 'why am i mentally very lonely how can i solve it ',
 'which one dissolve in water quikly sugar salt methane and carbon di oxide ',
 'astrology : i am a capricorn sun cap moon and cap rising what does that say about me ',
 'should i buy tiago ',
 'how can i be a good geologist ',
 'when do you use instead of ',
 'motorola company : can i hack my charter motorolla dcx3400 ',
 'method to find separation of slits using fresnel biprism ',
 'how do i read and find my youtube comments ',
 'what can make physics easy to learn ',
 'what was your first sexual experience like ',
 'what are the laws to change your status from a student visa to a green card in the us how do they compare to the immigration laws in canada ',
 'what would a trump presidency mean for current international master s

In [8]:
sequences_1

[[2, 3, 1, 1254, 61, 1254, 2924, 8, 578, 7, 759, 370, 7, 35],
 [2, 3, 1, 532, 10, 16574, 11942, 4, 23046, 4746],
 [5, 13, 4, 293, 1, 423, 10, 18, 334, 1724, 153, 127, 6, 2886],
 [15, 47, 4, 3396, 278, 3290, 5, 13, 4, 682, 17],
 [23, 48, 5753, 7, 204, 56723, 1594, 2208, 10729, 12, 1892, 7847, 5207],
 [2922,
  4,
  47,
  6,
  9207,
  921,
  4797,
  825,
  12,
  4797,
  5019,
  2,
  21,
  30,
  206,
  50,
  54],
 [29, 4, 122, 17787],
 [5, 13, 4, 28, 6, 42, 29394],
 [37, 9, 16, 71, 466, 10],
 [7213, 173, 13, 4, 549, 18, 13209, 99751, 98046],
 [1094, 8, 82, 6587, 10, 46447, 127, 39041, 117041],
 [5, 9, 4, 209, 12, 82, 18, 243, 1557],
 [2, 13, 55, 476, 571, 8, 72],
 [2, 63, 36, 96, 1503, 286, 39],
 [2,
  11,
  1,
  1135,
  8,
  184,
  36,
  1031,
  34,
  6,
  214,
  504,
  8,
  6,
  739,
  203,
  7,
  1,
  117,
  5,
  9,
  66,
  366,
  8,
  1,
  2941,
  1135,
  7,
  494],
 [2, 45, 6, 147, 1755, 88, 14, 479, 558, 751, 223, 319, 20, 32, 4053, 504],
 [2, 21, 10794, 88],
 [15, 9, 263, 92, 8, 28,

In [9]:
word_index = tokenizer.word_index#所有词集合的index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)#补全成30维的序列，左边补0
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

Found 120499 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [10]:
print(labels[0:10])

[0 0 0 0 0 1 0 1 0 0]


In [11]:
print(data_2[0:10])

[[     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      2      3
       1   1254     61   1254   2924      8    578      7    759    370]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      2     45    217     24      1
      87    316  12785      1  16574  11942      4  23046   4746    193]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       5     13    334    423     28   3425     61   1778    220   8291]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0     82      1   4665
      37    196   2181   1363    196      3   2525     61   1363   2181]
 [     0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0   

In [12]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1
print(nb_words)

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
print(embedding_matrix.shape)

for word, i in word_index.items():#所有词集合的index
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)#每个词已经映射成一个300维的向量
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
120500
(120500, 300)
Null word embeddings: 61789


In [13]:
embedding_matrix[0:2]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [14]:
 word2vec.word_vec('bicarb')

array([-0.05126953, -0.00723267,  0.03222656, -0.16894531,  0.09814453,
       -0.05029297, -0.09375   ,  0.02160645, -0.17675781,  0.10205078,
        0.0703125 , -0.17578125,  0.14550781, -0.10302734, -0.04882812,
        0.23925781, -0.05932617,  0.05615234, -0.02575684, -0.20117188,
        0.04516602, -0.15917969,  0.17675781, -0.10498047, -0.08789062,
       -0.10449219,  0.00683594,  0.265625  , -0.13574219,  0.02111816,
       -0.21484375, -0.09521484,  0.27734375,  0.00061417, -0.24316406,
        0.06176758, -0.03295898, -0.05078125,  0.03173828,  0.00622559,
        0.00263977, -0.17285156,  0.14355469,  0.03466797, -0.07324219,
        0.04760742, -0.07958984,  0.02111816, -0.00421143, -0.21582031,
        0.05541992, -0.10205078, -0.18359375,  0.15820312,  0.1875    ,
        0.03808594,  0.11767578, -0.05297852, -0.08691406,  0.00982666,
       -0.14355469,  0.1171875 ,  0.265625  ,  0.13574219,  0.05053711,
       -0.05493164, -0.18652344, -0.04003906,  0.11669922,  0.05

In [15]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))#上下拼接
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344

In [16]:
data_1_train.shape#363861

(727722, 30)

In [17]:
data_1_train[0:2]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     5,    48,    13,
           72,   262,   420],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     5,    13,
            4,  1017, 12347]])

In [18]:
data_2_train[363861:363863]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     5,    48,    13,
           72,   262,   420],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     5,    13,
            4,  1017, 12347]])

# model structure

In [20]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [23]:
########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

# Train 1

In [26]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

lstm_190_136_0.29_0.27
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200


In [28]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2], batch_size=1024, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=1024, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('../output/%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

Start making the submission before fine-tuning


# Train 2

In [33]:
preds = Dense(1, activation='sigmoid')(merged)

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

lstm_232_118_0.39_0.15
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200


In [34]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2], batch_size=1024, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=1024, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('../output/%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

Start making the submission before fine-tuning


# Train 3