# prepare data

In [1]:
import pandas as pd

data = pd.read_csv('sample_data.csv')
print(data.shape)
data.head()

(499, 4)


Unnamed: 0.1,Unnamed: 0,sentences1,sentences2,is_similar
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [2]:
s1_col = 'sentences1'
s2_col = 'sentences2'
label_col = 'is_similar'

sentences1 = list(data[s1_col])
sentences2 = list(data[s2_col])
labels = list(data[label_col])

print(sentences1[:2])
print(sentences2[:2])
print(labels[:2])

['What is the step by step guide to invest in share market in india?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?']
['What is the step by step guide to invest in share market?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?']
[0, 0]


# create training data

In [3]:
documents = sentences1 + sentences2
doc_words = [x.lower().split() for x in documents]
print(doc_words[:2])

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'], ['what', 'is', 'the', 'story', 'of', 'kohinoor', '(koh-i-noor)', 'diamond?']]


In [6]:
from input_process_utils import tokenize_doc, obtain_word_embedding, input_process, create_train_valid_test_set

# specify parameters
embedding_dim = 50
max_sentence_length = 30

# tokenize words
tokenizer = tokenize_doc(doc_words)
nwords = len(tokenizer.word_index) + 1
print('total words+1: ', nwords)

# create word embedding matrix
word_embedding_matrix = obtain_word_embedding(tokenizer, doc_words, embedding_dim)

# convert sentences to list of tokens
s1_padded_tokens, s2_padded_tokens = input_process(tokenizer, sentences1, sentences2, max_sentence_length)

# create train, validation, test sets
x1_train, x2_train, y_train, \
x1_valid, x2_valid, y_valid, \
x1_test, x2_test, y_test = create_train_valid_test_set(s1_padded_tokens, s2_padded_tokens, labels, test_size=0.1)

print(x1_train.shape, x1_valid.shape, x1_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

W0725 14:48:11.882972 4569712064 base_any2vec.py:723] consider setting layer size to a multiple of 4 for greater performance
W0725 14:48:11.987967 4569712064 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


document_count: 998
number of words: 3051
total words+1:  3052
Embedding matrix shape: (3052, 50)
sample word embeddding:
i 5
[-0.06770796 -0.04137032 -0.0268636   0.0089477   0.04354147 -0.04058208
 -0.07212066 -0.0330251  -0.13802154  0.02702054 -0.06765643 -0.05425958
 -0.05956003  0.02756251  0.00650712  0.00328051 -0.0828038   0.08157284
  0.06269407  0.06183026  0.02676531  0.04294371 -0.03089175  0.02704246
 -0.06714765  0.02264272  0.01460735  0.07577525 -0.05026695 -0.07081524
 -0.06845475 -0.10819329  0.04025434  0.00892521  0.0383963   0.07519003
  0.04873734 -0.05430204  0.06258007 -0.07511717 -0.00385681 -0.07947874
 -0.10794722  0.03477678  0.1472071   0.03254943  0.06410009  0.0796771
  0.09287953 -0.08372698]
Null word embeddings: 1
(449, 30) (25, 30) (25, 30)
(449,) (25,) (25,)


# model

In [7]:
from siamese_network import build_model

# define parameters for model
number_lstm_units = 100
rate_drop_dense = 0.5
number_dense_units = 100

model_siamese = build_model(word_embedding_matrix, nwords, max_sentence_length,
                            embedding_dim,
                            number_lstm_units,
                            number_dense_units, rate_drop_dense,)

W0725 14:48:14.320415 4569712064 deprecation_wrapper.py:119] From /Users/jinmei/.pyenv/versions/3.6.7/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0725 14:48:14.335611 4569712064 deprecation_wrapper.py:119] From /Users/jinmei/.pyenv/versions/3.6.7/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0725 14:48:14.338003 4569712064 deprecation_wrapper.py:119] From /Users/jinmei/.pyenv/versions/3.6.7/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0725 14:48:14.347140 4569712064 deprecation_wrapper.py:119] From /Users/jinmei/.pyenv/versions/3.6.7/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please

In [9]:
model_siamese.fit([x1_train, x2_train], y_train,
                  validation_data=([x1_valid, x2_valid], y_valid),
                  epochs=10, batch_size=32, shuffle=True,
                  )

W0725 14:48:27.461007 4569712064 deprecation.py:323] From /Users/jinmei/.pyenv/versions/3.6.7/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 449 samples, validate on 25 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13223c2e8>

In [10]:
preds = model_siamese.predict([x1_valid, x2_valid], verbose=1)
pred_labels = [round(x[0]) for x in preds.tolist()]



In [11]:
comp = pd.DataFrame({'label':y_test, 'prob':preds.tolist(), 'pred':pred_labels})
comp.head()

Unnamed: 0,label,prob,pred
0,0,[0.5563027858734131],1
1,1,[0.49119246006011963],0
2,1,[0.5192320346832275],1
3,1,[0.13289517164230347],0
4,0,[0.4359603524208069],0


In [12]:
fn = comp[(comp['label'] != comp['pred']) & (comp['label'] == 1)]
fn.shape

(7, 3)

In [13]:
tp = comp[(comp['label'] == comp['pred']) & (comp['label'] == 1)]
tp.shape

(4, 3)

In [14]:
comp[comp['label'] != comp['pred']].shape

(9, 3)

In [15]:
predicted_p = comp[comp['pred'] == 1]
predicted_p.shape                  

(6, 3)