In [1]:
import numpy as np
import pandas as pd

In [2]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GlobalAveragePooling1D,Lambda,Bidirectional
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, RMSprop
from keras import backend as K

#from sklearn.model_selection import train_test_split

Using Theano backend.
Using gpu device 0: GRID K520 (CNMeM is enabled)


In [3]:
# Record the installed Keras version. Later cells use Keras 1.x-style arguments
# such as `nb_epoch=` and `Model(input=..., output=...)`.
import keras
keras.__version__

'1.1.0'

# Load data

In [4]:
# Load the saved NumPy arrays. Passing the path straight to np.load lets NumPy
# open and close each .npy file itself, rather than leaving the handle returned
# by a bare open(...) unclosed.
q1_data = np.load('q1_train.npy')
q2_data = np.load('q2_train.npy')

labels = np.load('label_train.npy')
embedding_matrix = np.load('word_embedding_matrix.npy')

In [5]:
# `sklearn.cross_validation` is deprecated and absent from newer scikit-learn
# releases. Prefer `model_selection` and fall back only on old installs that
# do not have it yet.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Stack the two question arrays along a new axis 1 so that a single split keeps
# each (question 1, question 2) pair aligned with its label.
X = np.stack((q1_data, q2_data), axis=1)
target = labels

# 75/25 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, test_size=0.25, random_state=126, stratify=target)

# Undo the stacking: index 0 on axis 1 is question 1, index 1 is question 2.
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]
Q1_val = X_val[:, 0]
Q2_val = X_val[:, 1]

# Set up model


In [36]:
def vec_distance(vects):
    """Return the squared Euclidean distance between two backend tensors.

    `vects` is a pair `(x, y)`. The element-wise squared differences are summed
    over axis 1 with `keepdims=True`, so the reduced axis is kept with length 1.

    The square root of the sum is deliberately not taken: it did not give a
    good range to feed to the dense layer.
    """
    left, right = vects
    squared_diff = K.square(left - right)
    return K.sum(squared_diff, axis=1, keepdims=True)

In [37]:
def vec_output_shape(shapes):
    """Return the output shape paired with `vec_distance`: `(first_dim, 1)`.

    `shapes` must hold exactly two input shapes. The leading entry of the first
    shape is reused; the second shape is unpacked but otherwise ignored.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)


In [38]:
from keras.layers.embeddings import Embedding

# presumably the vocabulary size plus one extra index — TODO confirm against
# the preprocessing that produced word_embedding_matrix.npy
nb_words = 137077 + 1
max_sentence_len = 25

# The layer is initialised from `embedding_matrix` and frozen
# (trainable=False): do not train this layer.
embedding_layer = Embedding(
    nb_words,
    300,
    weights=[embedding_matrix],
    input_length=max_sentence_len,
    trainable=False,
)

In [63]:
# A single LSTM(128) instance is created here and applied to both inputs below,
# so the two branches share the same layer object.
lstm_layer =LSTM(128)

# Branch 1: int32 sequence of length max_sentence_len -> embedding_layer -> lstm_layer.
sequence_1_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch 2: the same embedding_layer and lstm_layer objects applied to the second input.
sequence_2_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Collapse the two LSTM outputs into one value per sample using vec_distance
# (sum of squared differences over axis 1); vec_output_shape declares the shape.
distance=Lambda(vec_distance, output_shape=vec_output_shape)([x1, y1])
# 16-unit sigmoid dense layer on the distance, then dropout with rate 0.3.
dense1=Dense(16, activation='sigmoid')(distance)
dense1 = Dropout(0.3)(dense1)

# Batch normalisation, then a single sigmoid unit produces the prediction.
bn2 = BatchNormalization()(dense1)
prediction=Dense(1, activation='sigmoid')(bn2)

# `input=` / `output=` are the keyword names used with the Keras version this
# notebook reports (1.1.0).
model = Model(input=[sequence_1_input, sequence_2_input], output=prediction)

In [64]:
# Print each layer with its output shape, parameter count and incoming connections.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_19 (InputLayer)            (None, 25)            0                                            
____________________________________________________________________________________________________
input_20 (InputLayer)            (None, 25)            0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 25, 300)       0           input_19[0][0]                   
                                                                   input_20[0][0]                   
____________________________________________________________________________________________________
lstm_10 (LSTM)                   (None, 128)           219648      embedding_2[8][0]       

In [65]:
# Optimizer notes:
# - According to the Keras documentation, RMSprop (an adaptive-learning-rate
#   method) is a good choice for recurrent neural networks. RMSprop divides the
#   learning rate by an exponentially decaying average of squared gradients.
# - Adam is another method that computes adaptive learning rates for each
#   parameter. In addition to storing an exponentially decaying average of past
#   squared gradients v_t (like Adadelta and RMSprop), Adam also keeps an
#   exponentially decaying average of past gradients m_t.
# - Adam is computationally efficient, has low memory requirements, is
#   invariant to diagonal rescaling of the gradients, and is well suited to
#   problems that are large in terms of data and/or parameters.
model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc'])

In [66]:

# Early-stopping callback driven by validation loss, with a patience of 3.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
)


In [60]:
#optional: try calculating class weights
#source: stack exchange, J.Guillaumin

import math

# labels_dict : {ind_label: count_label}
# mu : parameter to tune 

def create_class_weight(labels_dict, mu=2):
    """Compute log-scaled class weights from per-class sample counts.

    Each class is assigned ``log(mu * total / count)``, where ``total`` is the
    sum of all counts, and any weight that does not exceed 1.0 is replaced by
    1.0.

    Args:
        labels_dict: mapping ``{class_label: sample_count}``.
        mu: multiplier applied inside the logarithm; a parameter to tune.

    Returns:
        dict mapping every key of ``labels_dict`` to its weight. An empty
        ``labels_dict`` yields an empty dict.
    """
    # Use the built-in sum(): on Python 3 `.values()` is a view object, which
    # np.sum() does not add up, so the math.log call below would fail.
    total = sum(labels_dict.values())

    class_weight = {}
    for key, count in labels_dict.items():
        score = math.log(mu * total / float(count))
        # Floor the weight at 1.0 so that no class is down-weighted.
        class_weight[key] = max(1.0, score)

    return class_weight


In [61]:
# Optional: assign weights to the labels (0, 1) because they are imbalanced.
# np.unique returns the distinct label values together with their counts.
# Note: the model.fit call in this notebook passes class_weight=None, so these
# weights only take effect if they are passed in explicitly.
unique, counts = np.unique(target, return_counts=True)
labels_dict = {label: count for label, count in zip(unique, counts)}

target_weight = create_class_weight(labels_dict)
target_weight

{0: 1.1539102008862154, 1: 1.6895697721353504}

In [67]:
# Train on the training split and evaluate on the validation split each epoch.
# Note: this call takes a long time to start.
# This run uses the Dense() layer with sigmoid activation.
hist = model.fit(
    [Q1_train, Q2_train],
    y_train,
    validation_data=([Q1_val, Q2_val], y_val),
    verbose=1,
    nb_epoch=10,
    batch_size=256,
    shuffle=True,
    class_weight=None,
    callbacks=[early_stopping],
)

Train on 303217 samples, validate on 101073 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 17152/303217 [>.............................] - ETA: 114s - loss: 0.3421 - acc: 0.8528

KeyboardInterrupt: 

# Save model and model weights

In [None]:
from keras.models import model_from_json

In [None]:
# Export the model architecture to JSON.
# NOTE(review): the file name says "brnn" and "d05", but the model built in
# this notebook uses a plain LSTM(128) and Dropout(0.3), and the load cell
# reads "lstm_model_distance_128.*" — confirm the intended file names.
model_json = model.to_json()
with open("brnn_model_distance_128_d16_d05.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5 alongside the JSON file.
model.save_weights("brnn_model_distance_128_d16_d05.h5")
print("Saved model to disk")

In [None]:
# Load the JSON architecture and rebuild the model from it.
# The `with` block closes the file even if read() raises.
# NOTE(review): this rebinds the global `model`, and it reads
# "lstm_model_distance_128.*" whereas the save cell writes
# "brnn_model_distance_128_d16_d05.*" — confirm which files are intended.
with open('lstm_model_distance_128.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Load the saved weights into the newly created model.
model.load_weights("lstm_model_distance_128.h5")
print("Loaded model from disk")
 

In [68]:
# Load the two test arrays. Given a path, np.load opens and closes the .npy
# file itself, so no file handle is left unclosed.
test1_data = np.load('test1.npy')
test2_data = np.load('test2.npy')

In [69]:
# Run the model on the test set, feeding the two test arrays as its two inputs.
pred = model.predict(
    [test1_data, test2_data],
    verbose=1,
)




In [70]:
# Read the sample submission; its `is_duplicate` column is overwritten with the
# predictions in a later cell.
submission = pd.read_csv('sample_submission.csv')

In [71]:
# Keep predictions strictly inside (0, 1) before writing them out
# (presumably to avoid extreme penalties from a log-loss style metric — confirm).
submission['is_duplicate'] = np.clip(pred, 1e-5, 0.99999)
submission.to_csv('lstm_submission(13).csv', index=False)

In [72]:
import zipfile

In [73]:
# ZipFile defaults to ZIP_STORED, which stores members uncompressed. Request
# DEFLATE explicitly so the archive is actually smaller than the CSV it wraps.
with zipfile.ZipFile('lstm_submission(13).zip', 'w', zipfile.ZIP_DEFLATED) as myzip:
    myzip.write('lstm_submission(13).csv')