In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

import tensorflow  as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import pickle

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading the data

In [7]:
data = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/train.csv')


In [8]:
data.shape

(404290, 6)

In [9]:
print(data['is_duplicate'].value_counts(normalize = True))
print(data.shape)

0    0.630802
1    0.369198
Name: is_duplicate, dtype: float64
(404290, 6)


In [10]:
data[data['is_duplicate'] == 0].sample(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
94690,94690,97582,158061,What are the health benefits of cranberry-grap...,How do you determine the gluten in grape nuts?...,0
13412,13412,25759,25760,How can I learn digital marketing step by step...,What are good ways to learn to become the best...,0
13159,13159,23832,5545,How do you delete a picture from instagram on ...,How do I delete a picture on Instagram?,0
231198,231198,23989,238250,"What is the difference between molecular mass,...",What is the difference between molecular mass ...,0
204375,204375,307250,307251,How do I implement a PID Controllers for heate...,Why is a P controller used for a flow process?,0


In [11]:
data[data['is_duplicate'] == 1].sample(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
101014,101014,16472,101660,What are the creepiest paranormal experience y...,What is the scariest paranormal encounter you'...,1
58235,58235,30182,28681,How is borderline personality disorder (BPD) t...,What is borderline personality disorder?,1
257474,257474,372797,372798,Will my Q&A disappear if I delete my Quora acc...,What happens to the questions and answers you ...,1
219458,219458,251578,195748,What are some of the best hangout places in Pune?,What are some places to spend the weekend near...,1
148219,148219,233754,233755,Are there any good software companies in Singa...,Which are good software companies in Singapore?,1


We need to clean our questions text before we use them for training any model. As the questions are repeated in our training set, it makes sense to make a dataframe with unique questions, clean them. We can then use our cleaned questions to train the model while spending less time cleaning the questions.

In [12]:
questions = pd.concat([data[['qid1','question1']].drop_duplicates().rename({'qid1':'qid','question1':'question'},axis = 1),data[['qid2','question2']].drop_duplicates().rename({'qid2':'qid','question2':'question'},axis = 1) ]).sort_values('qid')
questions = questions.drop_duplicates()

In [13]:
data = data[['id','qid1','qid2','is_duplicate']].copy()

In [14]:
questions.head()

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
0,2,What is the step by step guide to invest in sh...
1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
1,4,What would happen if the Indian government sto...
2,5,How can I increase the speed of my internet co...


## Cleaning Text

In [15]:

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english')) - set(['not','what','why','how','who','whom','which'])
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [16]:
def remove_html_tags(text):
  text = re.sub(r'<.*?>','',text)
  return text

def remove_special_characters(text):
  text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
  text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"\'s", " ", text)
  text = re.sub(r"\'ve", " have ", text)
  text = re.sub(r"can't", "cannot ", text)
  text = re.sub(r"n't", " not ", text)
  text = re.sub(r"i'm", "i am ", text)
  text = re.sub(r"\'re", " are ", text)
  text = re.sub(r"\'d", " would ", text)
  text = re.sub(r"\'ll", " will ", text)
  text = re.sub(r",", " ", text)
  text = re.sub(r"\.", " ", text)
  text = re.sub(r"!", " ! ", text)
  text = re.sub(r"\/", " ", text)
  text = re.sub(r"\^", " ^ ", text)
  text = re.sub(r"\+", " + ", text)
  text = re.sub(r"\-", " - ", text)
  text = re.sub(r"\=", " = ", text)
  text = re.sub(r"'", " ", text)
  text = re.sub(r":", " : ", text)
  text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
  text = re.sub(r" e g ", " eg ", text)
  text = re.sub(r" b g ", " bg ", text)
  text = re.sub(r" u s ", " american ", text)
  text = re.sub(r" 9 11 ", "911", text)
  text = re.sub(r"e - mail", "email", text)
  text = re.sub(r"j k", "jk", text)
  text = re.sub(r"\s{2,}", " ", text)
  return text

def lower_the_text(text):
  return text.lower()

def tokenize_text(text):
  return word_tokenize(text)

def remove_stopwords(tokenized_text):
  return [word for word in tokenized_text if word not in stop_words]

def stem_text(tokenized_text):
  return [stemmer.stem(word) for word in tokenized_text]


def clean_ques(text,rem_stopwords_flag = False, stem_text_flag = False,return_string = True):
  text = remove_html_tags(text)
  text = remove_special_characters(text)
  text = lower_the_text(text)
  tokenized_text = tokenize_text(text)

  if rem_stopwords_flag:
    tokenized_text = remove_stopwords(tokenized_text)
  if stem_text_flag:
    tokenized_text = stem_text(tokenized_text)
  
  if(return_string):
    return " ".join(tokenized_text)

  return tokenized_text
  
  

Cleaning and saving our cleaned questions dataframe to be reused each time we run the program

In [17]:
# %%time
# questions['cleaned_question'] = questions['question'].apply(lambda x: clean_ques(str(x),rem_stopwords_flag = False, stem_text_flag = True,return_string = True) )

In [18]:
# questions.to_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/train_cleaned_questions.csv')


reading saved cleaned questions

In [19]:
questions = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/train_cleaned_questions.csv')
questions['cleaned_question'] = questions['cleaned_question'].map(str)
questions = questions[['qid','cleaned_question']].copy()

In [20]:
questions['cleaned_question'].apply(lambda x : len((x).split())).describe()

count    537933.000000
mean         12.919239
std           6.889712
min           1.000000
25%           9.000000
50%          11.000000
75%          15.000000
max         272.000000
Name: cleaned_question, dtype: float64

In [21]:
questions['cleaned_question'].apply(lambda x : len(x.split())).quantile([0.90,0.95,0.99,0.995,0.999])

0.900    22.0
0.950    27.0
0.990    37.0
0.995    43.0
0.999    61.0
Name: cleaned_question, dtype: float64

Most of the cleaned questions have lengths <= 64 words. We can use this upper bound to efficiently build our model. It doesn't make sense to build a model that can handle 256 words in questions, as it is not memory efficient.

## Train-Validation split

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score


In [23]:
X_train, X_val = train_test_split(data,test_size=0.2,random_state=99)



X_train and X_valid only contains questions ids. We need to map back the questions text to the data frames for training.

In [24]:
X_train = X_train.merge(questions[['qid','cleaned_question']],left_on = 'qid1',right_on = 'qid', how = 'left').rename({'cleaned_question':'question1'},axis = 1)
X_train = X_train.merge(questions[['qid','cleaned_question']],left_on = 'qid2',right_on = 'qid', how = 'left').rename({'cleaned_question':'question2'},axis = 1)

X_val = X_val.merge(questions[['qid','cleaned_question']],left_on = 'qid1',right_on = 'qid', how = 'left').rename({'cleaned_question':'question1'},axis = 1).drop('qid',axis = 1)
X_val = X_val.merge(questions[['qid','cleaned_question']],left_on = 'qid2',right_on = 'qid', how = 'left').rename({'cleaned_question':'question2'},axis = 1).drop('qid',axis = 1)



In [25]:
X_train['question1'] = X_train['question1'].map(str)
X_train['question2'] = X_train['question2'].map(str)

X_val['question1'] = X_val['question1'].map(str)
X_val['question2'] = X_val['question2'].map(str)


## Building Tensorflow Model

In [32]:
VOCAB_SIZE = 8000
BATCH_SIZE = 2048
LEARNING_RATE = 5e-3

vocab_size = VOCAB_SIZE
batch_size = BATCH_SIZE
learning_rate = LEARNING_RATE

#### Tokenizer and Padding



In [33]:

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size,oov_token=1)


Building and saving tokenizer for repeated use.



In [34]:
# tokenizer.fit_on_texts(np.concatenate([X_train['question1'].values,X_train['question2'].values]))

# # saving
# with open('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# loading
with open('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [35]:
def pad_encoded_texts(encoded_texts,max_length = 64):
  padded_encoded_texts = []
  for encoded_text in encoded_texts:
    encoded_text = encoded_text[:max_length]
    encoded_text = encoded_text + [0]*(max_length - len(encoded_text))
    padded_encoded_texts.append(encoded_text)
  
  return np.array(padded_encoded_texts)

In [36]:
## Data generators for training

In [37]:
import random

def get_encoded_ques_pair_generator_function(questions_list1,questions_list2,y_list = None,shuffle = False,clean_ques_flag = True):
  def get_encoded_ques_pair_generator(questions_list1 = questions_list1,question_list2 = questions_list1,shuffle = shuffle,clean_ques_flag = clean_ques_flag):
    n_ques = len(questions_list1)
    index_list = [i for i in range(n_ques)]
    if(shuffle == True):
      random.shuffle(index_list)
    i = -1
    while True:
      i = i + 1
      if(i == n_ques):
        i = 0
        if(shuffle == True):
          random.shuffle(index_list)
      
      # print(index_list)
      q1 = questions_list1[index_list[i]]
      q2 = questions_list2[index_list[i]]
      if(y_list is not None):
        y = y_list[index_list[i]]
      if(clean_ques_flag):
        q1 = clean_ques(q1)
        q2 = clean_ques(q2)

      if(y_list is None):
        yield pad_encoded_texts(tokenizer.texts_to_sequences([q1]))[0],pad_encoded_texts(tokenizer.texts_to_sequences([q2]))[0]
      else:
        yield pad_encoded_texts(tokenizer.texts_to_sequences([q1]))[0],pad_encoded_texts(tokenizer.texts_to_sequences([q2]))[0],y

  return get_encoded_ques_pair_generator


In [38]:
train_generator = get_encoded_ques_pair_generator_function(X_train['question1'].to_list(),X_train['question2'].to_list(),X_train['is_duplicate'].to_list(),shuffle = True,clean_ques_flag = False)
val_generator = get_encoded_ques_pair_generator_function(X_val['question1'].to_list(),X_val['question2'].to_list(),X_val['is_duplicate'].to_list(),shuffle = False,clean_ques_flag = False)


Creating TF datasets for training

In [39]:
train_dataset = tf.data.Dataset.from_generator(train_generator,output_signature = (tf.TensorSpec(shape = (64,), dtype = tf.int32),tf.TensorSpec(shape = (64,), dtype = tf.int32),tf.TensorSpec(shape=(), dtype=tf.int32)))
train_dataset = train_dataset.batch(batch_size)
train_dataset = train_dataset.map(lambda q1,q2,y: ((q1,q2),y))
train_dataset = train_dataset.prefetch(1)




In [40]:
val_dataset = tf.data.Dataset.from_generator(val_generator,output_signature = (tf.TensorSpec(shape = (64,), dtype = tf.int32),tf.TensorSpec(shape = (64,), dtype = tf.int32),tf.TensorSpec(shape=(), dtype=tf.int32)))
val_dataset = val_dataset.batch(batch_size)
val_dataset = val_dataset.map(lambda q1,q2,y: ((q1,q2),y))
val_dataset = val_dataset.prefetch(1)


### Building Siamese model architecture

In [43]:
def build_siamese_network(vocab_size = vocab_size,d_model = 256,batch_size = 64,max_length = 64): 

  def build_tf_lstm_model():
    encoded_question = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question')
    embeddings = layers.Embedding(vocab_size,d_model,input_length=max_length,name = 'embedding_layer')(encoded_question)
    layer_1 =  layers.LSTM(d_model,return_sequences=True,stateful  = True,name = 'lstm_1')(embeddings)
    layer_2 =  layers.LSTM(d_model,return_sequences=True,stateful  = True,name = 'lstm_2')(layer_1)
    flat_layer_2 = layers.Flatten(name = 'flatten_lstm_2')(layer_2)
    hidden_1 = layers.Dense(d_model,name = 'hidden_1')(flat_layer_2)
    hidden_1 = layers.LayerNormalization()(hidden_1)
    lstm_model  = Model(inputs = [encoded_question],outputs = [hidden_1])

    return lstm_model


  
  encoded_question1 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question1')
  encoded_question2 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question2')
  
  lstm_model = build_tf_lstm_model()
  
  embedded_questions1 = lstm_model(encoded_question1)
  embedded_questions2 = lstm_model(encoded_question2)

  similarity = layers.Dot(axes=(1, 1))([embedded_questions1, embedded_questions2])

  output = layers.Dense(1, activation="sigmoid")(similarity)

  model  = Model(inputs = [encoded_question1,encoded_question2],outputs = [output])
  
  return model

### Training

In [44]:
batch_size = BATCH_SIZE
learning_rate = LEARNING_RATE


siamese_model = build_siamese_network(batch_size = batch_size)

siamese_model.compile(loss  =  "binary_crossentropy", 
                optimizer = Adam(learning_rate  = learning_rate),
                metrics = [tf.keras.metrics.BinaryAccuracy()])

early_stopping = EarlyStopping(min_delta = 0.01,patience = 3,restore_best_weights=True)

history = siamese_model.fit(train_dataset,
                            batch_size = batch_size,
                            steps_per_epoch = X_train.shape[0]//batch_size + 1,
                            epochs = 20,
                            validation_data=val_dataset,
                            validation_steps = X_val.shape[0]//batch_size + 1,
                            callbacks = [early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [45]:
siamese_model.save('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_baseline_0.h5')

### Log-loss on Kaggle test set:- Private: 0.56847 Public: 0.56748

Further improvements could be brought by:-
1.   Using euclidean distance for simialarity instead of cosine similarity
2.   Feature engineering like number of common words, characters in question pairs, etc.
3.   Don't stem the words
4.   Use glove embeddings instead of training new embeddings
5.   Experiment with the d_model
6.   Using BERT models or more complex models




#### Replacing dot product with euclidean distance for similarity

In [46]:
import tensorflow.keras.backend as K
def build_siamese_network_euclidean_similarity(vocab_size = vocab_size,d_model = 256,batch_size = 64,max_length = 64): 
  def build_tf_lstm_model():
    encoded_question = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question')
    embeddings = layers.Embedding(vocab_size,d_model,input_length=max_length,name = 'embedding_layer')(encoded_question)
    layer_1 =  layers.LSTM(d_model,return_sequences=True,stateful  = True,name = 'lstm_1')(embeddings)
    layer_2 =  layers.LSTM(d_model,return_sequences=True,stateful  = True,name = 'lstm_2')(layer_1)
    flat_layer_2 = layers.Flatten(name = 'flatten_lstm_2')(layer_2)
    hidden_1 = layers.Dense(d_model,name = 'hidden_1')(flat_layer_2)
    hidden_1 = layers.LayerNormalization()(hidden_1)
    lstm_model  = Model(inputs = [encoded_question],outputs = [hidden_1])

    return lstm_model

  def euclidean_distance(vectors):
    (featsA, featsB) = vectors
    sumSquared = K.sum(K.square(featsA - featsB), axis=1,
      keepdims=True)
    return K.sqrt(K.maximum(sumSquared, K.epsilon()))

  
  encoded_question1 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question1')
  encoded_question2 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question2')
  
  lstm_model = build_tf_lstm_model()
  
  embedded_questions1 = lstm_model(encoded_question1)
  embedded_questions2 = lstm_model(encoded_question2)

  similarity = layers.Lambda(euclidean_distance)([embedded_questions1, embedded_questions2])

  output = layers.Dense(1, activation="sigmoid")(similarity)

  model  = Model(inputs = [encoded_question1,encoded_question2],outputs = [output])
  
  return model

In [47]:
batch_size = BATCH_SIZE
siamese_model = build_siamese_network_euclidean_similarity(batch_size = batch_size)

siamese_model.compile(loss  =  "binary_crossentropy", 
                optimizer = Adam(learning_rate  = learning_rate),
                metrics = [tf.keras.metrics.BinaryAccuracy()])

early_stopping = EarlyStopping(min_delta = 0.01,patience = 3,restore_best_weights=True)

history = siamese_model.fit(train_dataset,
                            batch_size = batch_size,
                            steps_per_epoch = X_train.shape[0]//batch_size + 1,
                            epochs = 20,
                            validation_data=val_dataset,
                            validation_steps = X_val.shape[0]//batch_size + 1,
                            callbacks = [early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [48]:
siamese_model.save('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_baseline_1.h5')

Model could be trained more as we can still see model is improving with each interation. But we are only experimenting for now and we learnt that using euclidean distance for similarity gives better result.

### Log-loss on Kaggle test set:- Private: 0.47721 Public: 0.47593

### Prediction on test data

#### Predicting using model with dot product

In [62]:
batch_size = 1024
siamese_model = build_siamese_network(batch_size = batch_size)

In [63]:
siamese_model.load_weights('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_baseline_0.h5')

In [64]:
test_data = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/test.csv')
test_data = test_data[test_data['test_id']!='life in dublin?"'].copy()
test_data['test_id'] = test_data['test_id'].map(int)
test_data = test_data.drop_duplicates()
test_generator = get_encoded_ques_pair_generator_function(test_data['question1'].map(str).to_list(),test_data['question2'].map(str).to_list(),None,shuffle = False,clean_ques_flag = True)
test_dataset = tf.data.Dataset.from_generator(test_generator,output_signature = (tf.TensorSpec(shape = (64,), dtype = tf.int32),tf.TensorSpec(shape = (64,), dtype = tf.int32)))
test_dataset = test_dataset.batch(batch_size)
test_dataset = test_dataset.map(lambda q1,q2: ((q1,q2),1))
test_dataset = test_dataset.prefetch(2)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [65]:
y_test_predict = siamese_model.predict(test_dataset,steps=test_data.shape[0]//batch_size + 1,verbose = 1)



In [66]:
test_data['is_duplicate'] = y_test_predict[:test_data.shape[0]]

In [67]:
submission = test_data[['test_id','is_duplicate']].copy()
submission.to_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/submission_baseline_0.csv',index = False)


#### Predicting using model with euclidean distance

In [52]:
batch_size = 1024
siamese_model = build_siamese_network_euclidean_similarity(batch_size = batch_size)

In [53]:
siamese_model.load_weights('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_baseline_1.h5')

In [54]:
test_data = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/test.csv')
test_data = test_data[test_data['test_id']!='life in dublin?"'].copy()
test_data['test_id'] = test_data['test_id'].map(int)
test_data = test_data.drop_duplicates()
test_generator = get_encoded_ques_pair_generator_function(test_data['question1'].map(str).to_list(),test_data['question2'].map(str).to_list(),None,shuffle = False,clean_ques_flag = True)
test_dataset = tf.data.Dataset.from_generator(test_generator,output_signature = (tf.TensorSpec(shape = (64,), dtype = tf.int32),tf.TensorSpec(shape = (64,), dtype = tf.int32)))
test_dataset = test_dataset.batch(batch_size)
test_dataset = test_dataset.map(lambda q1,q2: ((q1,q2),1))
test_dataset = test_dataset.prefetch(2)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [55]:
y_test_predict = siamese_model.predict(test_dataset,steps=test_data.shape[0]//batch_size + 1,verbose = 1)



In [56]:
test_data['is_duplicate'] = y_test_predict[:test_data.shape[0]]

In [57]:
submission = test_data[['test_id','is_duplicate']].copy()
submission.to_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/submission_baseline_1.csv',index = False)


Further improvements could be brought by:-
1.   Feature engineering like number of common words, characters in question pairs, etc.
2.   Don't stem the words
3.   Use glove embeddings instead of training new embeddings
4.   Experiment with the d_model
5.   Using BERT models or more complex models