In [1]:
import pandas as pd
import numpy as np

In [2]:
train_dataset = pd.read_csv(".\\data\\train.csv")

# PLAN

We need to find if two questions are similar. In face recognition, we use siamese networks to solve similar problem but for faces. So, we'll try using siamese networks here - only we'll use LSTMs instead of CNNs since LSTMs are suited for sequences.

0. Check the data. If we have enough q-ids for which we have duplicates available, then we could train the whole thing via triplet loss. If so, follow plan a, else plan b.

1. Pre-processing
    - Remove questionmarks throughout
    - Remove stop-words (Save one which keeps stop-words as well)

2. Convert to vectors
3. Divide in 70/30 split. (Also try 80/20 split)
4. Pass through siamese LSTMs

**Plan a**
5. Use squared distance

**Plan b**
5. Use triplet loss (find all those q-ids which have duplicates available for them first. See if it makes sense to use triplet loss)


# Analysis of Data

In [3]:
train_df_copy = train_dataset.copy()

In [4]:
train_df_copy.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,363861.0,363861.0,363861.0,363861.0
mean,181930.0,201899.281913,204884.863951,0.371502
std,105037.767486,144924.825062,146663.968132,0.483207
min,0.0,1.0,2.0,0.0
25%,90965.0,70779.0,70942.0,0.0
50%,181930.0,179999.0,184182.0,0.0
75%,272895.0,321295.0,327744.0,1.0
max,363860.0,493887.0,493889.0,1.0


In [5]:
train_df_copy[train_df_copy['is_duplicate'] > 0].describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,135175.0,135175.0,135175.0,135175.0
mean,181735.176741,156901.917507,157247.986292,1.0
std,105058.800004,137618.6556,137577.456205,0.0
min,5.0,11.0,12.0,1.0
25%,90843.0,39315.0,39697.0,1.0
50%,181718.0,113964.0,113489.0,1.0
75%,272849.5,250886.0,251945.0,1.0
max,363860.0,493877.0,493878.0,1.0


Total number of unique questions whose duplicates we have

In [6]:
len(train_df_copy[train_df_copy['is_duplicate'] > 0]['qid1'].unique())

80105

Total number of unique questions

In [7]:
len(train_df_copy['qid1'].unique())

266358

**Decision** : We could go this path and use triplet loss, however, triplet loss uses A(anchor), P(positive) and N(negative) triplet and it's very important to find a N which is closer to A but still not a duplicate. For us to find those pairs would be a time-taking exercise which I could try to do after basic model, perhaps.

# Pre-processing 

In [8]:
train_df = train_dataset.copy()

#### Prepare a list of all vocabulary words

In [9]:
q1_set = set(train_df['question1'].unique())
q2_set = set(train_df['question2'].unique())
all_ques_list = q1_set | q2_set
len(all_ques_list)

493391

In [10]:
q1_list = train_df['question1'].tolist()
q1_list = [str(ques) for ques in q1_list]
q2_list = train_df['question2'].tolist()
q2_list = [str(ques) for ques in q2_list]
is_duplicate_list = train_df['is_duplicate'].tolist()

print(q1_list[0],":",q2_list[0],":",is_duplicate_list[0])

What is the step by step guide to invest in share market in india? : What is the step by step guide to invest in share market? : 0


In [11]:
from keras.preprocessing.text import Tokenizer

In [20]:
all_questions_list = q1_list + q2_list
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(all_questions_list)

q1_word_seq = tokenizer.texts_to_sequences(q1_list)
q2_word_seq = tokenizer.texts_to_sequences(q2_list)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 91014


In [13]:
import json

In [14]:
# Save the tokenizer word index we've gotten for later

dictionary = word_index
# Let's save this out, so we can use it later
with open('..\\app\\models\\dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

In [15]:
from zipfile import ZipFile
from os.path import expanduser, exists

In [16]:
from keras.utils import get_file

GLOVE_DOWNLOAD_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

if not exists(expanduser('~/.keras/datasets/glove.840B.300d.zip')):
    zipfile = ZipFile(get_file('glove.840B.300d.zip', GLOVE_DOWNLOAD_URL))
    zipfile.extract('glove.840B.300d.txt', path=expanduser('~/.keras/datasets/'))
    
print("Processing", 'glove.840B.300d.txt')

embeddings_index = {}

with open(expanduser('~/.keras/datasets/glove.840B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Processing glove.840B.300d.txt
Word embeddings: 2196016


In [17]:
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300


nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
print(word_embedding_matrix)

Null word embeddings: 27054
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.27204001 -0.06203    -0.1884     ...  0.13015001 -0.18317001
   0.1323    ]
 [-0.038548    0.54251999 -0.21843    ...  0.11798     0.24590001
   0.22872999]
 ...
 [ 0.27021     1.01320004  0.78776002 ...  0.28852999 -0.056837
  -0.15815   ]
 [ 0.73556     0.31016001  0.33723    ...  0.063972   -0.16123
   0.59724998]
 [-0.010027   -0.45328999  0.44459999 ...  0.82262999  0.024273
   0.12003   ]]


In [18]:
q1_word_seq[0:5]

[[2, 3, 1, 1245, 57, 1245, 2546, 7, 577, 8, 772, 379, 8, 35],
 [2, 3, 1, 562, 10, 13509, 14684, 5, 21440, 4449],
 [4, 13, 5, 219, 1, 439, 10, 17, 364, 1848, 205, 146, 6, 2836],
 [16, 72, 5, 2693, 309, 2764, 4, 13, 5, 661, 19],
 [23, 49, 7202, 8, 233, 33753, 1906, 2077, 10473, 12, 1927, 10671, 6462]]

In [19]:
max_seq_length = 0
for ques in q1_word_seq:
    if len(ques) > max_seq_length:
        max_seq_length = len(ques)

print(max_seq_length)

127


In [20]:
from keras.utils import pad_sequences

In [21]:
MAX_SEQUENCE_LENGTH = 130

q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(is_duplicate_list, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (363861, 130)
Shape of question2 data tensor: (363861, 130)
Shape of label tensor: (363861,)


In [22]:
q1_data[0:5]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     2,
            3,     1,  1245,    57,  1245,  2546,     7,   577,     8,
      

In [23]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X.shape

(363861, 2, 130)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]
Q1_train.shape

(291088, 130)

In [26]:
Q1_train

array([[   0,    0,    0, ...,  860, 1940, 2127],
       [   0,    0,    0, ...,  471, 1170, 2418],
       [   0,    0,    0, ..., 1526,   40, 9135],
       ...,
       [   0,    0,    0, ...,  190,   17,  333],
       [   0,    0,    0, ...,   68,   46,  270],
       [   0,    0,    0, ..., 3525,   10,   19]])

In [27]:
Q1_train.shape

(291088, 130)

In [28]:
from keras.layers import Input, Embedding, LSTM, Dropout, concatenate, Dense, BatchNormalization
from keras.models import Model

In [29]:
NUM_HIDDEN_UNITS_LAYER1 = 50
NUM_HIDDEN_UNITS_LAYER2 = 100

question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)

q1 = embedding_layer(question1)
q2 = embedding_layer(question2)

lstm_first = LSTM(NUM_HIDDEN_UNITS_LAYER1, return_sequences=False)

q1 = lstm_first(q1)
q2 = lstm_first(q2)

dropout_layer = Dropout(0.2)

q1 = dropout_layer(q1)
q2 = dropout_layer(q2)

dense = Dense(100, activation='relu')
dropout_two = Dropout(0.2)
bn_one = BatchNormalization()

q1 = dense(q1)
# q1 = dropout_two(q1)
# q1 = bn_one(q1)
q2 = dense(q2)
# q2 = dropout_two(q2)
# q2 = bn_one(q2)

merged = concatenate([q1,q2])
is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 130)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 130)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 130, 300)     27304500    ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 50)           70200       ['embedding[0][0]',          

In [30]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
model_json = model.to_json()
with open("..\\app\\models\\model1.json", 'w') as json_file:
    json_file.write(model_json)

In [32]:
import datetime
import time
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [37]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()

# early stopping
es = EarlyStopping(monitor="val_accuracy", min_delta=0.01, patience=5, verbose=1, mode='auto')

# model checkpoint
mc = ModelCheckpoint(filepath='..\\app\\models\\question_pairs_weights_type1_final_new.h5', monitor='val_accuracy', save_best_only=True)

# callbacks
cd = [es, mc]

history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=20,
                    validation_data=([Q1_test, Q2_test], y_test),
                    verbose=1,
                    batch_size=512,
                    callbacks=cd)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2023-01-25 10:22:45.230628
Epoch 1/20


2023-01-25 10:22:48.822742: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401
2023-01-25 10:22:48.965673: I tensorflow/stream_executor/cuda/cuda_blas.cc:1804] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 12: early stopping
Training ended at 2023-01-25 10:25:02.132054
Minutes elapsed: 2.281686


In [33]:
from keras.preprocessing.text import text_to_word_sequence

In [34]:
def convert_text_to_index_array(text, dictionary):
    words = text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." %(word))
        return wordIndices

In [35]:
# HAPPY CASE
question1 = "What's r programming?"
question2 = "What's in r programming?"

q1_word_seq = convert_text_to_index_array(question1,dictionary)
q1_word_seq = [q1_word_seq]
q2_word_seq = convert_text_to_index_array(question2,dictionary)
q2_word_seq = [q2_word_seq]
q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)

pred = model.predict([q1_data,q2_data])
print(pred)

[[0.5]]


In [36]:
question1 = "How to learn english?"
question2 = "Why can't I dance?"

q1_word_seq = convert_text_to_index_array(question1,dictionary)
q1_word_seq = [q1_word_seq]
q2_word_seq = convert_text_to_index_array(question2,dictionary)
q2_word_seq = [q2_word_seq]
q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)

pred = model.predict([q1_data,q2_data])
print(pred)

[[0.48648134]]


# Model 2

In [37]:
import keras.backend

In [38]:
def exponent_neg_manhattan_distance(left, right):
    """ Helper function for the similarity estimate of the LSTMs outputs"""
    return keras.backend.exp(-keras.backend.sum(keras.backend.abs(left-right), axis=1, keepdims=True))

In [39]:
from keras.layers import TimeDistributed, Lambda

In [40]:
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question1)
q1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q1)
q1 = Lambda(lambda x: keras.backend.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q1)

q2 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LENGTH,
               trainable=False)(question2)
q2 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q2)
q2 = Lambda(lambda x: keras.backend.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q2)

merged = concatenate([q1,q2])
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 130)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 130)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 130, 300)     27304500    ['input_3[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, 130, 300)     27304500    ['input_4[0][0]']                
                                                                                            

In [41]:
model_json = model.to_json()
with open("..\\app\\models\\model2.json", 'w') as json_file:
    json_file.write(model_json)

In [None]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()

# early stopping
es = EarlyStopping(monitor="val_accuracy", min_delta=0.01, patience=5, verbose=1, mode='auto')

# model checkpoint
mc = ModelCheckpoint(filepath='..\\app\\models\\question_pairs_weights.h5', monitor='val_accuracy', save_best_only=True)

# callbacks
cd = [es, mc]

history_2 = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=25,
                    validation_data=([Q1_test, Q2_test], y_test),
                    verbose=1,
                    batch_size=64,
                    callbacks=cd)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2023-01-26 18:00:27.471300
Epoch 1/25
 626/4549 [===>..........................] - ETA: 7:27 - loss: 0.6461 - accuracy: 0.6538