# Sources of inspiration

https://engineering.quora.com/Semantic-Question-Matching-with-Deep-Learning
https://keras.io/getting-started/functional-api-guide/

In [1]:
import numpy as np
import pandas as pd

In [12]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GlobalAveragePooling1D,Lambda
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, RMSprop
from keras import backend as K

from sklearn.model_selection import train_test_split

In [3]:
import keras
# Display the installed Keras version so the recorded outputs can be tied to it.
keras.__version__

'2.0.3'

# Load data

In [4]:
# Load the two question arrays, the labels and the embedding matrix from .npy files.
# The paths are handed directly to np.load so NumPy opens and closes each file
# itself; wrapping them in a bare open(...) would leave the handles unclosed.
q1_data = np.load('q1_train.npy')
q2_data = np.load('q2_train.npy')

labels = np.load('label_train.npy')
embedding_matrix = np.load('word_embedding_matrix.npy')

In [5]:
# Pair the two question arrays along a new axis 1 so that both questions of a
# sample always land in the same split.
X = np.stack((q1_data, q2_data), axis=1)
target = labels

# Stratified split on the labels, 25% held out for validation, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    target,
    test_size=0.25,
    stratify=target,
    random_state=126,
)

# Undo the pairing: index 0 on axis 1 is question 1, index 1 is question 2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_val, Q2_val = X_val[:, 0], X_val[:, 1]

# Set up model


In [6]:
def vec_distance(vects):
    """Return the squared Euclidean distance between two backend tensors.

    Args:
        vects: a pair ``(x, y)`` of tensors that can be subtracted from each other.
            # assumes both are (batch, features) — TODO confirm against caller

    Returns:
        The sum over axis 1 of the squared element-wise difference, with the
        reduced axis kept (``keepdims=True``).
    """
    left, right = vects
    squared_diff = K.square(left - right)
    return K.sum(squared_diff, axis=1, keepdims=True)

In [7]:
def vec_output_shape(shapes):
    """Return the output shape that goes with ``vec_distance``.

    Args:
        shapes: a pair of input shapes; only the first entry of the first
            shape is used.

    Returns:
        A 2-tuple ``(first_dim_of_first_shape, 1)``.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

In [8]:
# Number of rows given to the embedding layer.
# NOTE(review): the "+1" is presumably a slot for a padding/unknown index — confirm.
nb_words = 137077 + 1
# Length declared for every question input sequence.
max_sentence_len = 25

# Embedding layer initialised from embedding_matrix and kept frozen
# (trainable=False) during training.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=300,
    weights=[embedding_matrix],
    input_length=max_sentence_len,
    trainable=False,
)

In [13]:
# One LSTM layer instance, applied to both questions below so they share weights.
lstm_layer = LSTM(128)

# Branch 1: int32 input of length max_sentence_len -> shared embedding -> shared LSTM.
sequence_1_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)


# Branch 2: goes through the same embedding and LSTM instances as branch 1.
sequence_2_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)


# Combine the two LSTM outputs with vec_distance (declared output shape: (batch, 1)),
# then pass the result through Dense(16, relu) -> BatchNormalization -> one sigmoid unit.
distance=Lambda(vec_distance, output_shape=vec_output_shape)([x1, y1])
dense1=Dense(16, activation='relu')(distance)
bn1 = BatchNormalization()(dense1)
prediction=Dense(1, activation='sigmoid')(bn1)


# Two-input, single-output model tying the branches to the prediction.
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=prediction)


In [14]:
# Print the layer-by-layer summary of the current model.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 25)            0                                            
____________________________________________________________________________________________________
input_6 (InputLayer)             (None, 25)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 25, 300)       41123400    input_5[0][0]                    
                                                                   input_6[0][0]                    
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 128)           219648      embedding_1[4][0]       

In [15]:
# Configure training: binary cross-entropy loss, Adam constructed with 1e-3,
# and accuracy reported as an extra metric.
model.compile(loss='binary_crossentropy',
        optimizer=Adam(1e-3),
        metrics=['acc'])

In [16]:
# Callback that monitors val_loss with a patience of 3 epochs.
early_stopping =EarlyStopping(monitor='val_loss', patience=3)

In [None]:
#optional: try calculating class weights
#source: stack exchange, J.Guillaumin
import math

# mu : parameter to tune 

def create_class_weight(labels_dict, mu=2):
    """Compute log-scaled class weights from per-class sample counts.

    Each class receives ``log(mu * total / count)``, where ``total`` is the sum
    of all counts; any weight that would be 1.0 or lower is raised to 1.0.

    Args:
        labels_dict: mapping from class label to its number of samples.
        mu: multiplier applied inside the logarithm (parameter to tune).

    Returns:
        dict mapping every key of ``labels_dict`` to its weight (never below 1.0).

    Raises:
        ZeroDivisionError: if any count in ``labels_dict`` is zero.
    """
    # Use the built-in sum(): it adds up a dict view on both Python 2 and 3,
    # whereas np.sum() on a Python 3 dict view does not produce a number.
    total = sum(labels_dict.values())

    class_weight = dict()
    for key, count in labels_dict.items():
        score = math.log(mu * total / float(count))
        # Floor at 1.0 so no class is ever down-weighted.
        class_weight[key] = score if score > 1.0 else 1.0

    return class_weight


In [None]:
# Optional: derive per-class weights to compensate for imbalanced labels.
# The resulting dictionary can be passed as `class_weight` to model.fit later.
unique, counts = np.unique(target, return_counts=True)
labels_dict=dict(zip(unique, counts))

target_weight=create_class_weight(labels_dict)
target_weight

In [17]:
# Train with early stopping on the validation split.
# The returned History object is bound to both `hist` and `history`: the cells
# that list and plot the training curves read `history.history`, so binding
# only `hist` leaves them failing with a NameError.
history = hist = model.fit([Q1_train, Q2_train], y_train, validation_data=([Q1_val, Q2_val], y_val), verbose=1,
          epochs=20, batch_size=128, shuffle=True,class_weight=None, callbacks=[early_stopping])

Train on 303217 samples, validate on 101073 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 14720/303217 [>.............................] - ETA: 923s - loss: 0.2557 - acc: 0.8995

KeyboardInterrupt: 

In [None]:
# List the metric names recorded during training; requires `history` to hold
# the History object returned by model.fit.
print(history.history.keys())

In [None]:
import matplotlib.pyplot as plt

# Draw the training-loss curve first, then the validation-loss curve,
# one point per recorded epoch.
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])

plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Save model and model weights

In [18]:
from keras.models import model_from_json

In [19]:
# Export the model architecture to a JSON file.
model_json = model.to_json()
with open("lstm_model_distance_128_16.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to a separate HDF5 file next to the JSON architecture.
model.save_weights("lstm_model_distance_128_16.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
# Read the JSON architecture and rebuild the model from it.
# The file is opened in a `with` block so the handle is closed even if reading fails.
# NOTE(review): these file names differ from the ones written by the save cell
# ("lstm_model_distance_128_16.*") — confirm which model is meant to be loaded.
with open('lstm_model_64_16.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Load the stored weights into the rebuilt model.
model.load_weights("lstm_model_64_16.h5")
print("Loaded model from disk")
 

In [20]:
# Load the two test question arrays. The paths go directly to np.load so NumPy
# opens and closes each file itself instead of leaving an open handle behind.
test1_data = np.load('test1.npy')
test2_data = np.load('test2.npy')

In [21]:
# Run the current `model` on the two test arrays (progress shown via verbose=1).
pred=model.predict([test1_data, test2_data],verbose=1)



In [22]:
# Read the sample submission file; its `is_duplicate` column is filled in below.
submission=pd.read_csv('sample_submission.csv')

In [23]:
# Clip predictions into [1e-5, 0.99999] before storing them, then write the
# submission file without the DataFrame index.
submission['is_duplicate'] = np.clip(pred, 1e-5, 0.99999)
submission.to_csv('lstm_submission(4).csv', index=False)

In [24]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.04228
1,1,0.014484
2,2,0.231607
3,3,0.00081
4,4,0.074459
