In [1]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
import re
import difflib
from nltk.corpus import stopwords
import nltk
import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import pickle
from scipy.spatial.distance import cosine

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import concatenate
from keras.layers.core import Lambda
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
import tensorflow as tf

from sklearn.preprocessing import StandardScaler

import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

Using TensorFlow backend.


In [2]:
########################################
## set directories and parameters
########################################
BASE_DIR = '../input/'
EMBEDDING_FILE = BASE_DIR + 'glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = 300
num_dense1 = np.random.randint(200, 250)
num_dense2 = np.random.randint(150, 200)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d %d_%.2f_%.2f'%(num_lstm, num_dense1, num_dense2, rate_drop_lstm, \
        rate_drop_dense)

In [4]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE)
count = 0
amline = 0
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
    amline += 1
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))

In [3]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = str(text).lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

Processing text dataset


In [5]:
texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

In [4]:
########################################
## generate handcrafted features
########################################
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

def get_common_ratio(text_a,text_b):
    """
    ratio of overlapping: 2*M/T
    """
    seq1 = text_to_wordlist(text_a).split()
    seq2 = text_to_wordlist(text_b).split()
    seqMatch = difflib.SequenceMatcher(a=seq1, b=seq2)
    return seqMatch.ratio()

def get_interaction_ratio(text_a,text_b):
    """
    ratio of interactions
    """
    seq1 = set(text_to_wordlist(text_a).split())
    seq2 = set(text_to_wordlist(text_b).split())
    try:
        seqMatch = float(len(seq1.intersection(seq2)))/len(seq1.union(seq2))
    except ZeroDivisionError:
        seqMatch = 0.0
    return seqMatch

def get_common_bigrams_ratio(text_a,text_b):
    seq1 = [" ".join(i) for i in nltk.ngrams(text_to_wordlist(text_a).split(), 2)]
    seq2 = [" ".join(i) for i in nltk.ngrams(text_to_wordlist(text_b).split(), 2)]
    seqMatch = difflib.SequenceMatcher(a=seq1, b=seq2)
    return seqMatch.ratio()

# extract similarity
train_common_ratio = train_df.apply(lambda x: get_common_ratio(x["question1"],x["question2"]),axis = 1)
train_interaction_ratio = train_df.apply(lambda x: get_interaction_ratio(x["question1"],x["question2"]),axis = 1)
train_common_bigrams_ratio = train_df.apply(lambda x: get_common_bigrams_ratio(x["question1"],x["question2"]),axis = 1)

test_common_ratio = test_df.apply(lambda x: get_common_ratio(x["question1"],x["question2"]),axis = 1)
test_interaction_ratio = test_df.apply(lambda x: get_interaction_ratio(x["question1"],x["question2"]),axis = 1)
test_common_bigrams_ratio = test_df.apply(lambda x: get_common_bigrams_ratio(x["question1"],x["question2"]),axis = 1)

def str_jaccard(str1, str2):
    """
    Similar to the get_common_ratio, but with different T, this measure considers the relative order among words
    """
    str1_list = text_to_wordlist(str1).split(" ")
    str2_list = text_to_wordlist(str2).split(" ")
    res = distance.jaccard(str1_list, str2_list)
    return res

def str_nlevenshtein_1(str1, str2):
    """
    calculate the levenshtein distance
    """
    str1_list = text_to_wordlist(str1).split(" ")
    str2_list = text_to_wordlist(str2).split(" ")
    res = distance.nlevenshtein(str1_list, str2_list,method=1)
    return res

def str_nlevenshtein_2(str1, str2):
    """
    calculate the levenshtein distance
    """
    str1_list = text_to_wordlist(str1).split(" ")
    str2_list = text_to_wordlist(str2).split(" ")
    res = distance.nlevenshtein(str1_list, str2_list,method=2)
    return res

def str_sorensen(str1, str2):
    """
    calculate the levenshtein distance
    """
    str1_list = text_to_wordlist(str1).split(" ")
    str2_list = text_to_wordlist(str2).split(" ")
    res = distance.sorensen(str1_list, str2_list)
    return res

# extract distance
train_jaccard = train_df.apply(lambda x: str_jaccard(x["question1"],x["question2"]),axis = 1)
test_jaccard = test_df.apply(lambda x: str_jaccard(x["question1"],x["question2"]),axis = 1)

train_nlevenshtein_1 = train_df.apply(lambda x: str_nlevenshtein_1(x["question1"],x["question2"]),axis = 1)
test_nlevenshtein_1 = test_df.apply(lambda x: str_nlevenshtein_1(x["question1"],x["question2"]),axis = 1)

train_nlevenshtein_2 = train_df.apply(lambda x: str_nlevenshtein_2(x["question1"],x["question2"]),axis = 1)
test_nlevenshtein_2 = test_df.apply(lambda x: str_nlevenshtein_2(x["question1"],x["question2"]),axis = 1)

train_sorensen = train_df.apply(lambda x: str_sorensen(x["question1"],x["question2"]),axis = 1)
test_sorensen = test_df.apply(lambda x: str_sorensen(x["question1"],x["question2"]),axis = 1)

def word_len(string):
    return len(text_to_wordlist(string).split())

def char_len(string):
    return len(text_to_wordlist(string).replace(" ",""))

def word_len_diff(text_a, text_b):
    return abs(word_len(text_a) - word_len(text_b))

def char_len_diff(text_a, text_b):
    return abs(char_len(text_a) - char_len(text_b))

def word_match_share(text_a,text_b):
    q1words = {}
    q2words = {}
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

train_word_len_diff = train_df.apply(lambda x: word_len_diff(x["question1"],x["question2"]),axis = 1)
test_word_len_diff = test_df.apply(lambda x: word_len_diff(x["question1"],x["question2"]),axis = 1)

train_char_len_diff = train_df.apply(lambda x: char_len_diff(x["question1"],x["question2"]),axis = 1)
test_char_len_diff = test_df.apply(lambda x: char_len_diff(x["question1"],x["question2"]),axis = 1)

train_word_match_share = train_df.apply(lambda x: word_match_share(x["question1"],x["question2"]),axis = 1)
test_word_match_share = test_df.apply(lambda x: word_match_share(x["question1"],x["question2"]),axis = 1)

In [5]:
hdcft_features = pd.concat([train_common_ratio,
                                  train_interaction_ratio,
                                  train_common_bigrams_ratio,
                                  train_jaccard,
                                  train_nlevenshtein_1,
                                  train_nlevenshtein_2,
                                  train_sorensen,
                                  train_word_len_diff,
                                  train_char_len_diff,
                                  train_word_match_share
                                 ], axis=1)

test_hdcft_features = pd.concat([test_common_ratio,
                                  test_interaction_ratio,
                                  test_common_bigrams_ratio,
                                  test_jaccard,
                                  test_nlevenshtein_1,
                                  test_nlevenshtein_2,
                                  test_sorensen,
                                  test_word_len_diff,
                                  test_char_len_diff,
                                  test_word_match_share
                                 ], axis=1)

In [6]:
ss = StandardScaler()
ss.fit(np.vstack((hdcft_features, test_hdcft_features)))
hdcft_features = ss.transform(hdcft_features)
test_hdcft_features = ss.transform(test_hdcft_features)

In [9]:
########################################
## generate leaky features
########################################
ques = pd.concat([train_df[['question1', 'question2']], \
        test_df[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))
    
def q2_freq(row):
    return(len(q_dict[row['question2']]))
    
def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1, raw=True)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1, raw=True)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1, raw=True)

test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1, raw=True)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1, raw=True)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1, raw=True)

leaks = train_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
test_leaks = test_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]

ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)



In [7]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

In [8]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
hdcft_features_train = np.vstack((hdcft_features[idx_train], hdcft_features[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
hdcft_features_val = np.vstack((hdcft_features[idx_val], hdcft_features[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344

In [9]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
#lstm_layer2 = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

leaks_input = Input(shape=(leaks.shape[1],))
leaks_dense = Dense(num_dense1/2, activation=act)(leaks_input)

hdcft_input = Input(shape = (hdcft_features.shape[1],))
hdcft_dense = Dense(num_dense1/2, activation=act)(hdcft_input)

merged = concatenate([x1, y1, leaks_dense, hdcft_input])

#def caldist(cat):
#    return tf.reduce_sum(tf.square(tf.subtract(cat[0],cat[1])), axis=1, keep_dims=True)
#def calangle(cat):
#    return tf.reduce_sum(tf.multiply(cat[0],cat[1]), axis=1, keep_dims=True)

#distance = Lambda(caldist)([x1,y1])
#angle = Lambda(calangle)([x1,y1])

#merged = concatenate([x1, y1, distance, angle, leaks_dense])
merged = BatchNormalization()(merged)
merged = Dense(num_dense1, activation=act)(merged)

merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = Dense(num_dense2, activation=act)(merged)

merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [10]:
########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

In [11]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input, hdcft_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='adadelta',
              #nadam
        metrics=['acc'])
#model.summary()
print(STAMP)

In [12]:
early_stopping =EarlyStopping(monitor='val_loss', patience=20)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train, hdcft_features_train], labels_train, \
        validation_data=([data_1_val, data_2_val, hdcft_features_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

In [13]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2, test_leaks, test_hdcft_features], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_leaks, test_hdcft_features], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

In [14]:
STAMP

'lstm_300_206 190_0.35_0.36'

In [15]:
bst_val_score

0.19184639870695852

0.19209217238223933