## Questions with same meaning?
This project uses the dataset containing many questions with label whether they are the same meaning.

In [1]:
import numpy as np
import pandas as pd
import os
import csv
import codecs
from collections import Counter
import matplotlib.pyplot as plt
import operator
import re
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.porter import *

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, merge, LSTM, Lambda, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalAveragePooling1D
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K
import sys
from keras.layers.merge import add, concatenate
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.20

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Read Data & primary processing

In [2]:
# All data files are in input dir.
def Get_Data_Location(File_Dir, Trainset_name, Testset_name):
    Train_File = File_Dir + Trainset_name
    Test_File = File_Dir + Testset_name
    return Train_File, Test_File

def Read_Data(Train_File, Test_File):
    data_train = pd.read_csv(Train_File, nrows=400000)
    data_test = pd.read_csv(Test_File, nrows=4000)
    data_train.drop_duplicates(inplace=True)
    data_train.dropna(inplace=True)
#     data_test.drop_duplicates(inplace=True)
#     data_test.dropna(inplace=True)
    print ("Shape of train File = ", data_train.shape)
    print ("Shape of test File = ", data_test.shape)
    print (data_train.head(2))
    print (data_test.head(2))
    return data_train, data_test

In [7]:
Vector_DIR = './input/'
print('Indexing word vectors.')
embeddings_index = {}
f = codecs.open(os.path.join(Vector_DIR, 'vectors.txt'), encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 16143 word vectors.


In [9]:
Train_File, Test_File = Get_Data_Location("./input/", "train.csv", "test.csv")
data_train, data_test = Read_Data(Train_File, Test_File)

Shape of train File =  (399997, 6)
Shape of test File =  (4000, 3)
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
   test_id                                          question1  \
0        0  How does the Surface Pro himself 4 compare wit...   
1        1  Should I have a hair transplant at age 24? How...   

                                           question2  
0  Why did Microsoft choose core m3 and not core ...  
1        How much cost does hair transplant require?  


Remove the duplicated questions and get the statistic value of words.

In [10]:
def statistic_traindata(data_train):
    Num_not_duplicate = data_train.is_duplicate.value_counts()[0]
    Num_duplicate = data_train.is_duplicate.value_counts()[1]
    print("Num_not_duplicate =", Num_not_duplicate)
    print("Num_duplicate =", Num_duplicate)

    total_question_list = data_train.qid1.tolist() + data_train.qid2.tolist()
    total_unique_question = np.unique(total_question_list)
    print("Total number of unique question =", len(total_unique_question))

    question_ids_counter = Counter(total_question_list)
    More_than_once_question = []
    for i in question_ids_counter.values():
        if i > 1:
            More_than_once_question.append(i)
    print("Number of questions more than once =", len(More_than_once_question))
    return Num_not_duplicate, Num_duplicate, total_unique_question, More_than_once_question

In [11]:
Num_not_duplicate, Num_duplicate, total_unique_question, More_than_once_question = statistic_traindata(data_train)

Num_not_duplicate = 252228
Num_duplicate = 147769
Total number of unique question = 533318
Number of questions more than once = 110450


###  Generate dictionary
Here, we tokenize the sentences to get words from the questions.
Then, use porter stemmer to break down words into their basic form.
Also use NLTK stopwords to ignore basic words and genism to train dictionary.

In [12]:
token_word = re.compile(r"\w+",re.I)
stopword = stopwords.words('english')
stemmer = PorterStemmer()

def tokenize_questions(df):
    question_1_tokenized = []
    question_2_tokenized = []

    for q in df.question1.tolist():
        question_1_tokenized.append([stemmer.stem(i.lower()) for i in token_word.findall(q) if i not in stopword])

    for q in df.question2.tolist():
        question_2_tokenized.append([stemmer.stem(i.lower()) for i in token_word.findall(q) if i not in stopword])

    df["Question_1_tok"] = question_1_tokenized
    df["Question_2_tok"] = question_2_tokenized
    
    return df

def train_dictionary(df):
    
    questions_tokenized = df.Question_1_tok.tolist() + df.Question_2_tok.tolist()
    
    dictionary = corpora.Dictionary(questions_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary
    
data_train = tokenize_questions(data_train)
dictionary = train_dictionary(data_train)
print ("No. of words in the dictionary = %s" %len(dictionary.token2id))

data_test = tokenize_questions(data_test)

No. of words in the dictionary = 21357


In [13]:
def get_vectors(data, dictionary):
    
    question1_vec = [dictionary.doc2bow(text) for text in data.Question_1_tok.tolist()]
    question2_vec = [dictionary.doc2bow(text) for text in data.Question_2_tok.tolist()]
    
    question1_csc = gensim.matutils.corpus2csc(question1_vec, num_terms=len(dictionary.token2id))
    question2_csc = gensim.matutils.corpus2csc(question2_vec, num_terms=len(dictionary.token2id))
    
    return question1_csc.transpose(),question2_csc.transpose()

q1_csc, q2_csc = get_vectors(data_train, dictionary)
test_q1_csc, test_q2_csc = get_vectors(data_test, dictionary)

print (q1_csc.shape)
print (q2_csc.shape)
print (test_q2_csc.shape)
print (test_q2_csc.shape)

(399997, 21357)
(399997, 21357)
(4000, 21357)
(4000, 21357)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_similarity_score as jsc
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import MinMaxScaler

minkowski_dis = DistanceMetric.get_metric('minkowski')
mms_scale_man = MinMaxScaler()
mms_scale_euc = MinMaxScaler()
mms_scale_mink = MinMaxScaler()

def get_similarity_values(q1_csc, q2_csc):
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []
    
    for i,j in zip(q1_csc, q2_csc):
        sim = cs(i,j)
        cosine_sim.append(sim[0][0])
#         sim = md(i,j)
#         manhattan_dis.append(sim[0][0])
#         sim = ed(i,j)
#         eucledian_dis.append(sim[0][0])
#         i_ = i.toarray()
#         j_ = j.toarray()
#         try:
#             sim = jsc(i_,j_)
#             jaccard_dis.append(sim)
#         except:
#             jaccard_dis.append(0)
            
#         sim = minkowski_dis.pairwise(i_,j_)
#         minkowsk_dis.append(sim[0][0])
    
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis    


# cosine_sim = get_cosine_similarity(q1_csc, q2_csc)
cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis = get_similarity_values(q1_csc, q2_csc)
test_cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis = get_similarity_values(test_q1_csc, test_q2_csc)

print ("cosine_sim sample= \n", cosine_sim[0:2])
print ("test cosine_sim sample= \n", test_cosine_sim[0:2])
# print ("manhattan_dis sample = \n", manhattan_dis[0:2])
# print ("eucledian_dis sample = \n", eucledian_dis[0:2])
# print ("jaccard_dis sample = \n", jaccard_dis[0:2])
# print ("minkowsk_dis sample = \n", minkowsk_dis[0:2])

# eucledian_dis_array = np.array(eucledian_dis).reshape(-1,1)
# manhattan_dis_array = np.array(manhattan_dis).reshape(-1,1)
# minkowsk_dis_array = np.array(minkowsk_dis).reshape(-1,1)
    
# manhattan_dis_array = mms_scale_man.fit_transform(manhattan_dis_array)
# eucledian_dis_array = mms_scale_euc.fit_transform(eucledian_dis_array)
# minkowsk_dis_array = mms_scale_mink.fit_transform(minkowsk_dis_array)

# eucledian_dis = eucledian_dis_array.flatten()
# manhattan_dis = manhattan_dis_array.flatten()
# minkowsk_dis = minkowsk_dis_array.flatten()

cosine_sim sample= 
 [0.9486832980505138, 0.6154574548966638]
test cosine_sim sample= 
 [0.3698001308168194, 0.6454972243679029]


In [15]:
print(len(cosine_sim))
print(len(test_cosine_sim))

399997
4000


In [16]:
dist = np.array([cosine_sim]).T
test_dist = np.array([test_cosine_sim]).T
# dist = [cosine_sim]

In [17]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_train.Question_1_tok.tolist() + data_train.Question_2_tok.tolist() + data_test.Question_1_tok.tolist() + data_test.Question_2_tok.tolist())
sequences_1 = tokenizer.texts_to_sequences(data_train.Question_1_tok.tolist())
sequences_2 = tokenizer.texts_to_sequences(data_train.Question_2_tok.tolist())
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

test_sequences_1 = tokenizer.texts_to_sequences(data_test.Question_1_tok.tolist())
test_sequences_2 = tokenizer.texts_to_sequences(data_test.Question_2_tok.tolist())

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(data_train.is_duplicate.tolist())
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_labels = np.array(data_test.test_id.tolist())
# del test_sequences_1
# del test_sequences_2
# del sequences_1
# del sequences_2
import gc
gc.collect()

Found 65659 unique tokens.
Shape of data tensor: (399997, 30)
Shape of label tensor: (399997,)


0

In [18]:
word_index['NULL_Value'] = 0

In [53]:
print('Preparing embedding matrix.')
# prepare embedding matrix
# nb_words = len(word_index)
nb_words = min(MAX_NB_WORDS, len(word_index))

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
print(embedding_matrix.shape)

Preparing embedding matrix.
Null word embeddings: 49518
(65660, 50)


In [20]:
def model_conv1D_(emb_matrix):
    
    # The embedding layer containing the word vectors
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=60,
        trainable=False
    )
    
    # 1D convolutions that can iterate over the word vectors
    conv1 = Conv1D(filters=128, kernel_size=1, padding='same', activation='relu')
    conv2 = Conv1D(filters=128, kernel_size=2, padding='same', activation='relu')
    conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
    conv4 = Conv1D(filters=128, kernel_size=4, padding='same', activation='relu')
    conv5 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu')
    conv6 = Conv1D(filters=32, kernel_size=6, padding='same', activation='relu')

    # Define inputs
    seq1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    seq2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run through CONV + GAP layers
    conv1a = conv1(emb1)
    glob1a = GlobalAveragePooling1D()(conv1a)
    conv1b = conv1(emb2)
    glob1b = GlobalAveragePooling1D()(conv1b)

    conv2a = conv2(emb1)
    glob2a = GlobalAveragePooling1D()(conv2a)
    conv2b = conv2(emb2)
    glob2b = GlobalAveragePooling1D()(conv2b)

    conv3a = conv3(emb1)
    glob3a = GlobalAveragePooling1D()(conv3a)
    conv3b = conv3(emb2)
    glob3b = GlobalAveragePooling1D()(conv3b)

    conv4a = conv4(emb1)
    glob4a = GlobalAveragePooling1D()(conv4a)
    conv4b = conv4(emb2)
    glob4b = GlobalAveragePooling1D()(conv4b)

    conv5a = conv5(emb1)
    glob5a = GlobalAveragePooling1D()(conv5a)
    conv5b = conv5(emb2)
    glob5b = GlobalAveragePooling1D()(conv5b)

    conv6a = conv6(emb1)
    glob6a = GlobalAveragePooling1D()(conv6a)
    conv6b = conv6(emb2)
    glob6b = GlobalAveragePooling1D()(conv6b)

    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b])

    # We take the explicit absolute difference between the two sentences
    # Furthermore we take the multiply different entries to get a different measure of equalness
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(4 * 128 + 2*32,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(4 * 128 + 2*32,))([mergea, mergeb])

    # Add the magic features
#     magic_input = Input(shape=(5,))
#     magic_dense = BatchNormalization()(magic_input)
#     magic_dense = Dense(64, activation='relu')(magic_dense)

    # Add the distance features (these are now TFIDF (character and word), Fuzzy matching, 
    # nb char 1 and 2, word mover distance and skew/kurtosis of the sentence vector)
    distance_input = Input(shape=(1,))
    distance_dense = BatchNormalization()(distance_input)
    distance_dense = Dense(128, activation='relu')(distance_dense)

    # Merge the Magic and distance features with the difference layer
#     merge = concatenate([diff, mul, magic_dense, distance_dense])
    merge = concatenate([diff, mul, distance_dense])
    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

#     model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
    model = Model(inputs=[seq1, seq2, distance_input], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [21]:
print(data_1.shape)
print(dist.shape)
print(test_dist.shape)

(399997, 30)
(399997, 1)
(4000, 1)


In [54]:
# pass
model = model_conv1D_(embedding_matrix)
model.fit([data_1,data_2,dist], labels, validation_split=VALIDATION_SPLIT, epochs=1, batch_size=1024, shuffle=True)
preds = model.predict([test_data_1, test_data_2, test_dist])
print(model.summary())
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)

Train on 319997 samples, validate on 80000 samples
Epoch 1/1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_48 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
input_49 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 60, 50)       3283000     input_48[0][0]                   
                                                                 input_49[0][0]                   
__________________________________________________________________________________________________
conv1d_97 (Conv1D)              (None, 60, 128) 

In [23]:
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [80]:
from keras.layers.merge import add, concatenate
# Model Architecture #
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = Conv1D(128, 3, activation='relu')(embedded_sequences_1)
x1 = MaxPooling1D(10)(x1)
x1 = Flatten()(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(0.2)(x1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = Conv1D(128, 3, activation='relu')(embedded_sequences_2)
y1 = MaxPooling1D(10)(y1)
y1 = Flatten()(y1)
y1 = Dense(64, activation='relu')(y1)
y1 = Dropout(0.2)(y1)

# merged = merge([x1,y1], mode='concat')
merged = concatenate([x1,y1])
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[sequence_1_input,sequence_2_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [81]:
# pass
model.fit([data_1,data_2], labels, validation_split=VALIDATION_SPLIT, epochs=1, batch_size=1024, shuffle=True)
preds = model.predict([test_data_1, test_data_2])
print(model.summary())
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)

Train on 319997 samples, validate on 80000 samples
Epoch 1/1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_78 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
input_79 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 30, 50)       3283000     input_78[0][0]                   
                                                                 input_79[0][0]                   
__________________________________________________________________________________________________
conv1d_145 (Conv1D)             (None, 28, 128) 

In [None]:
with open("vectors_test.txt","w") as f:
    for i in data_train.Question_1_tok.tolist():
        for j in i:
            f.write(j+" ")
    for i in data_train.Question_2_tok.tolist():
        for j in i:
            f.write(j+" ")
            

In [66]:
def model_rnn1D_(emb_matrix):
    
    # The embedding layer containing the word vectors
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=60,
        trainable=False
    )
    
    
    rnn = LSTM(256,dropout=0.2,recurrent_dropout = 0.1)

    # Define inputs
    seq1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    seq2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run through CONV + GAP layers
    rnn1a = rnn(emb1)
    rnn1b = rnn(emb2)
    

    # We take the explicit absolute difference between the two sentences
    # Furthermore we take the multiply different entries to get a different measure of equalness
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(256,))([rnn1a, rnn1b])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(256,))([rnn1a, rnn1b])

    # Add the magic features
#     magic_input = Input(shape=(5,))
#     magic_dense = BatchNormalization()(magic_input)
#     magic_dense = Dense(64, activation='relu')(magic_dense)

    # Add the distance features (these are now TFIDF (character and word), Fuzzy matching, 
    # nb char 1 and 2, word mover distance and skew/kurtosis of the sentence vector)
    distance_input = Input(shape=(1,))
    distance_dense = BatchNormalization()(distance_input)
    distance_dense = Dense(128, activation='relu')(distance_dense)

    # Merge the Magic and distance features with the difference layer
#     merge = concatenate([diff, mul, magic_dense, distance_dense])
    merge = concatenate([diff, mul, distance_dense])
    # The MLP that determines the outcome

    pred = Dense(1, activation='sigmoid')(merge)

#     model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
    model = Model(inputs=[seq1, seq2, distance_input], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [67]:
# pass
model = model_rnn1D_(embedding_matrix)
model.fit([data_1,data_2,dist], labels, validation_split=VALIDATION_SPLIT, epochs=1, batch_size=1024, shuffle=True)
preds = model.predict([test_data_1, test_data_2, test_dist])
print(model.summary())
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)

Train on 319997 samples, validate on 80000 samples
Epoch 1/1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_64 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
input_65 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 60, 50)       3283000     input_64[0][0]                   
                                                                 input_65[0][0]                   
__________________________________________________________________________________________________
input_66 (InputLayer)           (None, 1)       

In [68]:
from keras.layers import GlobalMaxPooling1D
def model_mix1D_(emb_matrix):
    
    # The embedding layer containing the word vectors
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=60,
        trainable=False
    )
    
    # 1D convolutions that can iterate over the word vectors
    conv1 = Conv1D(filters=128, kernel_size=1, padding='same', activation='relu')
    conv2 = Conv1D(filters=128, kernel_size=2, padding='same', activation='relu')
    conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
    conv4 = Conv1D(filters=128, kernel_size=4, padding='same', activation='relu')
    conv5 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu')
    conv6 = Conv1D(filters=32, kernel_size=6, padding='same', activation='relu')
    rnn = LSTM(256,dropout=0.2,recurrent_dropout = 0.1)
    
    # Define inputs
    seq1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    seq2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run through CONV + GAP layers
    conv1a = conv1(emb1)
    glob1a = GlobalMaxPooling1D()(conv1a)
    conv1b = conv1(emb2)
    glob1b = GlobalMaxPooling1D()(conv1b)

    conv2a = conv2(emb1)
    glob2a = GlobalMaxPooling1D()(conv2a)
    conv2b = conv2(emb2)
    glob2b = GlobalMaxPooling1D()(conv2b)

    conv3a = conv3(emb1)
    glob3a = GlobalMaxPooling1D()(conv3a)
    conv3b = conv3(emb2)
    glob3b = GlobalMaxPooling1D()(conv3b)

    conv4a = conv4(emb1)
    glob4a = GlobalMaxPooling1D()(conv4a)
    conv4b = conv4(emb2)
    glob4b = GlobalMaxPooling1D()(conv4b)

    conv5a = conv5(emb1)
    glob5a = GlobalMaxPooling1D()(conv5a)
    conv5b = conv5(emb2)
    glob5b = GlobalMaxPooling1D()(conv5b)

    conv6a = conv6(emb1)
    glob6a = GlobalMaxPooling1D()(conv6a)
    conv6b = conv6(emb2)
    glob6b = GlobalMaxPooling1D()(conv6b)
    
    rnn1a = rnn(emb1)
    rnn1b = rnn(emb2)
    
    mergea = concatenate([glob1a, glob2a, glob3a, glob4a, glob5a, glob6a,rnn1a])
    mergeb = concatenate([glob1b, glob2b, glob3b, glob4b, glob5b, glob6b,rnn1b])

    # We take the explicit absolute difference between the two sentences
    # Furthermore we take the multiply different entries to get a different measure of equalness
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(4 * 128 + 2*32 +256,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(4 * 128 + 2*32 + 256,))([mergea, mergeb])

    # Add the magic features
#     magic_input = Input(shape=(5,))
#     magic_dense = BatchNormalization()(magic_input)
#     magic_dense = Dense(64, activation='relu')(magic_dense)

    # Add the distance features (these are now TFIDF (character and word), Fuzzy matching, 
    # nb char 1 and 2, word mover distance and skew/kurtosis of the sentence vector)
    distance_input = Input(shape=(1,))
    distance_dense = BatchNormalization()(distance_input)
    distance_dense = Dense(128, activation='relu')(distance_dense)

    # Merge the Magic and distance features with the difference layer
#     merge = concatenate([diff, mul, magic_dense, distance_dense])
    merge = concatenate([diff, mul, distance_dense])
    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)

    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

#     model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
    model = Model(inputs=[seq1, seq2, distance_input], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [69]:
# pass
model = model_mix1D_(embedding_matrix)
model.fit([data_1,data_2,dist], labels, validation_split=VALIDATION_SPLIT, epochs=1, batch_size=1024, shuffle=True)
preds = model.predict([test_data_1, test_data_2, test_dist])
print(model.summary())
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)

Train on 319997 samples, validate on 80000 samples
Epoch 1/1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_67 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
input_68 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 60, 50)       3283000     input_67[0][0]                   
                                                                 input_68[0][0]                   
__________________________________________________________________________________________________
conv1d_121 (Conv1D)             (None, 60, 128) 

In [None]:
data_1.shape

In [None]:
type(data_1)

In [None]:
# dis = dis.T.shape

In [None]:
dis.shape
type(dis)