In [1]:
import warnings
warnings.filterwarnings('ignore')

# prepare data

In [2]:
import pandas as pd

# Load the sentence-pair dataset; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='sample_data.csv')
# Report (rows, columns), then preview the first five records.
print(data.shape)
data.head(n=5)

(499, 4)


Unnamed: 0.1,Unnamed: 0,sentences1,sentences2,is_similar
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Names of the two sentence columns and the label column in `data`.
s1_col, s2_col, label_col = 'sentences1', 'sentences2', 'is_similar'

# Materialise each of the three columns as a plain Python list.
sentences1, sentences2, labels = (
    list(data[column]) for column in (s1_col, s2_col, label_col)
)

# Preview the first two entries of each list, one list per output line.
print(sentences1[:2], sentences2[:2], labels[:2], sep='\n')

['What is the step by step guide to invest in share market in india?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?']
['What is the step by step guide to invest in share market?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?']
[0, 0]


# create training data

In [4]:
# Pool both sides of every pair into one corpus: all of sentences1 first,
# followed by all of sentences2.
documents = [*sentences1, *sentences2]
# Lower-case each sentence and split it on whitespace; punctuation stays
# attached to its word (e.g. 'india?').
doc_words = [sentence.lower().split() for sentence in documents]
print(doc_words[:2])

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'], ['what', 'is', 'the', 'story', 'of', 'kohinoor', '(koh-i-noor)', 'diamond?']]


In [5]:
from input_process_utils import (
    create_train_valid_test_set,
    input_process,
    obtain_word_embedding,
    tokenize_doc,
)

# Parameters of the input pipeline.
embedding_dim = 50          # handed to obtain_word_embedding (matrix comes out 50 wide)
max_sentence_length = 30    # handed to input_process (padded inputs come out 30 long)

# Build the tokenizer from the whitespace-split corpus.
tokenizer = tokenize_doc(doc_words)
# One more than the number of indexed words
# (presumably index 0 is reserved for padding -- confirm in input_process_utils).
nwords = 1 + len(tokenizer.word_index)
print('total words+1: ', nwords)

# Embedding matrix for the tokenizer's vocabulary.
word_embedding_matrix = obtain_word_embedding(tokenizer, doc_words, embedding_dim)

# Turn both sentence lists into padded token sequences.
s1_padded_tokens, s2_padded_tokens = input_process(
    tokenizer, sentences1, sentences2, max_sentence_length
)

# Split into train / validation / test sets.
# NOTE(review): judging by the recorded shapes, the 10% hold-out appears to be
# halved into validation and test -- confirm in create_train_valid_test_set.
(
    x1_train, x2_train, y_train,
    x1_valid, x2_valid, y_valid,
    x1_test, x2_test, y_test,
) = create_train_valid_test_set(
    s1_padded_tokens, s2_padded_tokens, labels, test_size=0.1
)

print(x1_train.shape, x1_valid.shape, x1_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

Using TensorFlow backend.


document_count: 998
number of words: 3051
total words+1:  3052
Embedding matrix shape: (3052, 50)
sample word embeddding:
i 5
[ 0.02492686  0.0723003  -0.071944   -0.07385619  0.12691942 -0.06671616
 -0.07998725  0.02979964  0.09013987 -0.1423238   0.04914633 -0.0373216
  0.04891837  0.1338436   0.04649093 -0.05521586 -0.02145877  0.05603057
 -0.00115219  0.09226701  0.08108339 -0.03019674  0.06055864  0.07917215
 -0.14915016  0.14895718 -0.00683145  0.01768599  0.0021265   0.08286751
  0.01430396  0.03134367 -0.1356062  -0.03298447  0.01094953  0.04307833
  0.04760704 -0.05190314 -0.02534769 -0.10370425 -0.039808   -0.00392222
  0.08980277  0.04625122 -0.14988358 -0.04670654 -0.0141106  -0.06206374
  0.08177988  0.03772041]
Null word embeddings: 1
(449, 30) (25, 30) (25, 30)
(449,) (25,) (25,)


# build and train model

In [6]:
from siamese_network import siamese_model

# Architecture hyper-parameters handed to siamese_model.
number_lstm_units = 100
number_dense_units = 100
rate_drop_dense = 0.5

# All arguments are positional, so their order must match siamese_model's
# signature exactly.
model_siamese = siamese_model(
    word_embedding_matrix,
    nwords,
    max_sentence_length,
    embedding_dim,
    number_lstm_units,
    number_dense_units,
    rate_drop_dense,
)

In [7]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) of the model built above.
model_siamese.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 50)       152600      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200)          120800      embedding_1[0][0]          

In [8]:
# Train for 10 epochs on the paired inputs, evaluating on the validation split
# as configured; the object returned by fit() is the cell's displayed output.
model_siamese.fit(
    x=[x1_train, x2_train],
    y=y_train,
    batch_size=32,
    epochs=10,
    shuffle=True,
    validation_data=([x1_valid, x2_valid], y_valid),
)


Train on 449 samples, validate on 25 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x12a027588>

In [None]:
# Persist the trained model to a file in the working directory.
model_file = 'siamese_model.h5'
model_siamese.save(filepath=model_file)

# analyze results

In [None]:
from keras.models import load_model
from siamese_network import exponent_neg_manhattan_distance, f1_metric

# Reload the saved model from disk. The project-specific functions are passed
# through custom_objects so Keras can resolve them by name while rebuilding
# the model.
model_file = 'siamese_model.h5'
model = load_model(
    filepath=model_file,
    custom_objects={
        'exponent_neg_manhattan_distance': exponent_neg_manhattan_distance,
        'f1_metric': f1_metric,
    },
)

In [None]:
# Import from the public `sklearn.metrics` package: the private
# `sklearn.metrics.classification` module was deprecated in scikit-learn 0.22
# and removed in 0.24.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Raw model output for every test pair (one single-element row per pair).
y_test_pred = model.predict([x1_test, x2_test], verbose=1)
# Threshold the score into a 0/1 class with Python's round(); note that
# round() uses banker's rounding, so a score of exactly 0.5 maps to 0.
y_test_pred_classes = [round(y[0]) for y in y_test_pred.tolist()]

print('accuracy:', accuracy_score(y_test, y_test_pred_classes))

# Per-class metrics, returned as (precision, recall, fbeta_score, support):
# precision:    shape = [n_unique_labels]
# recall:       shape = [n_unique_labels]
# fbeta_score:  shape = [n_unique_labels]
# Fixed: the original passed the undefined name `y_valid_test_classes`, which
# raised NameError; the predicted classes for the test set are what is scored.
precision_recall_fscore_support(y_test, y_test_pred_classes)

In [10]:
# Side-by-side view of the true label, the raw model output and the
# thresholded class for each test pair. `prob` holds one-element lists because
# the prediction array is converted with .tolist() without flattening.
comp = pd.DataFrame(
    data={
        'label': y_test,
        'prob': y_test_pred.tolist(),
        'pred': y_test_pred_classes,
    }
)
comp.head(n=5)

Unnamed: 0,label,prob,pred
0,0,[0.5273507237434387],1
1,1,[0.49098411202430725],0
2,1,[0.5263671875],1
3,1,[0.1259659081697464],0
4,0,[0.44758933782577515],0


In [13]:
# Shape of the misclassified subset: the first entry is the number of test
# pairs whose predicted class differs from the true label.
comp.loc[comp['label'].ne(comp['pred'])].shape

(9, 3)