In [1]:
import os
import re
import csv
import codecs
import sys
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from nltk import tokenize

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
WORDVECTORS_DIR='wordvectors/'
INPUT_DATA_DIR='inputs/'
WEIGHTS_DIR='weights/'
OUTPUT_DIR='output/'

EMBEDDING_FILE=WORDVECTORS_DIR+'glove.6B.100d.txt'
TRAIN_DATA_FILE=INPUT_DATA_DIR+'train.csv'
TEST_DATA_FILE=INPUT_DATA_DIR+'test.csv'

MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1

In [3]:
#Glove Vectors
embeddings_index = {}
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))  

Total 400000 word vectors.


In [4]:
train_DF=pd.read_csv(TRAIN_DATA_FILE,index_col='id')
test_DF=pd.read_csv(TEST_DATA_FILE)
train_DF.dropna(inplace=True)
train_DF.reset_index(drop=True,inplace=True)

In [5]:
train_DF['question1'] = train_DF['question1'].astype(str)
train_DF['question2'] = train_DF['question2'].astype(str)
test_DF['question1'] = test_DF['question1'].astype(str)
test_DF['question2'] = test_DF['question2'].astype(str)

In [6]:
#embeddings_index["|"]

In [7]:
def text_to_wordlist(text):
    
    # Convert words to lower case and split them
    text = text.lower().split()
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    
    text = re.sub(r"i'm", "i am ", text)
    
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"won't", "wont ", text)
    text = re.sub(r"who's", "who is ", text)
    text = re.sub(r"why's", "why is ", text)
    text = re.sub(r"how's", "how is ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"that's", "that is ", text)
    
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Return a list of words
    return(text)

In [8]:
texts_1 = [] 
texts_2 = []
labels = []
for text1, text2,label in zip(train_DF['question1'],train_DF['question2'],train_DF['is_duplicate']):
    texts_1.append(text_to_wordlist(text1))
    texts_2.append(text_to_wordlist(text2))
    labels.append(int(label))

In [9]:
test_DF.columns

Index(['test_id', 'question1', 'question2'], dtype='object')

In [10]:
test_texts_1 = [] 
test_texts_2 = []
test_ids = []
for text1, text2,test_id in zip(test_DF['question1'],test_DF['question2'],test_DF['test_id']):
    test_texts_1.append(text_to_wordlist(text1))
    test_texts_2.append(text_to_wordlist(text2))
    test_ids.append(int(test_id))

In [11]:
tokenizer=Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1+texts_2+test_texts_1+test_texts_2)

In [12]:
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

In [13]:
data_1=pad_sequences(sequences_1,maxlen=MAX_SEQUENCE_LENGTH)
data_2=pad_sequences(sequences_2,maxlen=MAX_SEQUENCE_LENGTH)
test_data_1=pad_sequences(test_sequences_1,maxlen=MAX_SEQUENCE_LENGTH)
test_data_2=pad_sequences(test_sequences_2,maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
labels = np.array(labels)
test_ids = np.array(test_ids)

In [15]:
word_index = tokenizer.word_index

In [16]:
nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_weights = np.random.random((nb_words, EMBEDDING_DIM))
for word, index in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_weights[index] = embedding_vector

In [17]:
perm = np.random.permutation(len(train_DF))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

In [18]:
embedding_layer = Embedding(input_dim=nb_words,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_weights],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)

In [19]:
num_lstm = 100
num_dense = 50
rate_drop_lstm = 0.2
rate_drop_dense = 0.2
act = 'relu'

In [20]:
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

In [23]:
left_input=Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
embedding_left=embedding_layer(left_input)
left_output=lstm_layer(embedding_left)

right_input=Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
embedding_right=embedding_layer(left_input)
right_output=lstm_layer(embedding_right)

combined_output=concatenate([left_output,right_output])
combined_output = Dropout(rate_drop_dense)(combined_output)
merged = BatchNormalization()(combined_output)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[left_input, right_input],outputs=preds)
model.compile(loss='binary_crossentropy',optimizer='nadam',metrics=['acc'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 30, 100)       12050000                                     
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           80400                                        
____________________________________________________________________________________________________
concatenate_2 (Concatenate)      (None, 200)           0                                            
___________________________________________________________________________________________

In [24]:
STAMP = 'weights_preprocessing_lstm_glove_%.2f_%.2f'%(rate_drop_lstm,rate_drop_dense)

In [26]:
early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

In [27]:
hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val), \
        epochs=20, batch_size=200, shuffle=True, \
        callbacks=[early_stopping, model_checkpoint])

Train on 727718 samples, validate on 80858 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


In [28]:
########################################
## make the submission
########################################
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])


In [30]:
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2], batch_size=200, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=200, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

Start making the submission before fine-tuning
