## Improvements brought in this notebook:-
1.   Added features like number of common words in question pair
2.   Not stemming the words now
3.   Using glove embeddings in the 2nd half of notebook


## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow  as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import re
import string

AUTOTUNE = tf.data.AUTOTUNE

BATCH_SIZE = 1536
MAX_LENGTH = 64
VOCAB_SIZE = 200000
D_MODEL = 300


## Reading Data

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/train.csv')


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score


In [5]:
X_train, X_val = train_test_split(data,test_size=0.2,random_state=99)



In [7]:
test_data = pd.read_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/test.csv')
test_data = test_data[test_data['test_id']!='life in dublin?"'].copy()
test_data['test_id'] = test_data['test_id'].map(int)
test_data = test_data.drop_duplicates()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
len(X_train),len(X_val)

(370166, 34124)

## Cleaning Text

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english')) - set(['not','what','why','how','who','whom','which'])
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
def remove_html_tags(text):
  text = re.sub(r'<.*?>','',text)
  return text

def remove_special_characters(text):
  text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
  text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"\'s", " ", text)
  text = re.sub(r"\'ve", " have ", text)
  text = re.sub(r"can't", "cannot ", text)
  text = re.sub(r"n't", " not ", text)
  text = re.sub(r"i'm", "i am ", text)
  text = re.sub(r"\'re", " are ", text)
  text = re.sub(r"\'d", " would ", text)
  text = re.sub(r"\'ll", " will ", text)
  text = re.sub(r",", " ", text)
  text = re.sub(r"\.", " ", text)
  text = re.sub(r"!", " ! ", text)
  text = re.sub(r"\/", " ", text)
  text = re.sub(r"\^", " ^ ", text)
  text = re.sub(r"\+", " + ", text)
  text = re.sub(r"\-", " - ", text)
  text = re.sub(r"\=", " = ", text)
  text = re.sub(r"'", " ", text)
  text = re.sub(r":", " : ", text)
  text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
  text = re.sub(r" e g ", " eg ", text)
  text = re.sub(r" b g ", " bg ", text)
  text = re.sub(r" u s ", " american ", text)
  text = re.sub(r" 9 11 ", "911", text)
  text = re.sub(r"e - mail", "email", text)
  text = re.sub(r"j k", "jk", text)
  text = re.sub(r"\s{2,}", " ", text)
  return text

def lower_the_text(text):
  return text.lower()

def tokenize_text(text):
  return word_tokenize(text)

def remove_stopwords(tokenized_text):
  return [word for word in tokenized_text if word not in stop_words]

def stem_text(tokenized_text):
  return [stemmer.stem(word) for word in tokenized_text]


def clean_text(text,tokenize_text_flag = False,rem_stopwords_flag = False, stem_text_flag = False,return_string = True):
  text = remove_html_tags(text)
  text = remove_special_characters(text)
  text = lower_the_text(text)
  
  if(tokenize_text_flag):
    text = tokenize_text(text)

    if rem_stopwords_flag:
      text = remove_stopwords(text)
    if stem_text_flag:
      text = stem_text(text)
    
    if(return_string):
      return " ".join(text)

  return text
  
  

## Preparing Data Generators for training

In [11]:
import random
def ques_pair_generator_function(questions_list1,questions_list2,y_list = None,shuffle = False,clean_ques = True,clean_text = clean_text):
  def ques_pair_generator():
    n_ques = len(questions_list1)
    index_list = [i for i in range(n_ques)]
    if(shuffle == True):
      random.shuffle(index_list)
    i = -1
    while True:
      i = i + 1
      if(i == n_ques):
        i = 0
        if(shuffle == True):
          random.shuffle(index_list)

      q1 = str(questions_list1[index_list[i]])
      q2 = str(questions_list2[index_list[i]])

      if(clean_ques):
        q1 = clean_text(q1)
        q2 = clean_text(q2)
        

      if(y_list is not None):
        y = y_list[index_list[i]]

      if(y_list is None):
        yield q1,q2,np.array([len(q1)/1.0,len(q2)/1.0,len(set(q1.split())&set(q2.split()))/1.0])
      else:
        yield q1,q2,np.array([len(q1)/1.0,len(q2)/1.0,len(set(q1.split())&set(q2.split()))/1.0]),y

  return ques_pair_generator


In [12]:
train_generator = ques_pair_generator_function(X_train['question1'].to_list(),X_train['question2'].to_list(),X_train['is_duplicate'].to_list(),shuffle = True)
val_generator = ques_pair_generator_function(X_val['question1'].to_list(),X_val['question2'].to_list(),X_val['is_duplicate'].to_list(),shuffle = False)
test_generator = ques_pair_generator_function(test_data['question1'].to_list(),test_data['question2'].to_list(),shuffle = False)

all_generator = ques_pair_generator_function(test_data['question1'].to_list()+X_val['question1'].to_list()+X_train['question1'].to_list(),test_data['question2'].to_list()+X_val['question2'].to_list()+X_train['question2'].to_list(),shuffle = False)

In [6]:

train_size = X_train.shape[0]
val_size = X_val.shape[0]
test_size = test_data.shape[0]


#### Tensorflow generators

In [14]:
raw_train_dataset = tf.data.Dataset.from_generator(train_generator,output_signature = (tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape=(3,), dtype=tf.float32),tf.TensorSpec(shape=(), dtype=tf.int32)))
raw_train_dataset = raw_train_dataset.batch(BATCH_SIZE)

raw_val_dataset = tf.data.Dataset.from_generator(val_generator,output_signature = (tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape=(3,), dtype=tf.float32),tf.TensorSpec(shape=(), dtype=tf.int32)))
raw_val_dataset = raw_val_dataset.batch(BATCH_SIZE)

raw_test_dataset = tf.data.Dataset.from_generator(test_generator,output_signature = (tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape=(3,), dtype=tf.float32)))
raw_test_dataset = raw_test_dataset.map(lambda q1,q2,lengths: (q1,q2,lengths,-1))
raw_test_dataset = raw_test_dataset.batch(BATCH_SIZE)

raw_all_dataset = tf.data.Dataset.from_generator(all_generator,output_signature = (tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape = (), dtype = tf.string),tf.TensorSpec(shape=(3,), dtype=tf.float32)))
raw_all_dataset = raw_all_dataset.map(lambda q1,q2,lengths: (q1,q2,lengths,-1))
raw_all_dataset = raw_all_dataset.batch(BATCH_SIZE)


#### Vectorizer Layer

In [15]:
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_LENGTH)


In [16]:
all = raw_all_dataset.map(lambda q1,q2,l,y: tf.concat([q1,q2],axis = 0)).prefetch(1)

In [17]:
# vectorize_layer.adapt(all,steps = (train_size + val_size + test_size)//BATCH_SIZE + 1)

In [18]:
# import pickle
# pickle.dump({'config': vectorize_layer.get_config(),
#              'weights': vectorize_layer.get_weights()}
#             , open("/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/vectorize_layer.pkl", "wb"))

In [19]:
import pickle
from_disk = pickle.load(open("/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/vectorize_layer.pkl", "rb"))
vectorize_layer = layers.TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
vectorize_layer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
vectorize_layer.set_weights(from_disk['weights'])


In [20]:
len(vectorize_layer.get_vocabulary())

120569

In [21]:
def vectorize_ques(q1,q2,lengths,label):
  return (vectorize_layer(q1), vectorize_layer(q2),lengths),label


#### Tensorflow Datasets

In [22]:
train_dataset = raw_train_dataset.map(vectorize_ques).cache().prefetch(buffer_size = 1)
val_dataset = raw_val_dataset.map(vectorize_ques).cache().prefetch(buffer_size = 1)
test_dataset = raw_test_dataset.map(vectorize_ques).cache().prefetch(buffer_size = 1)


## Training

### Building Siamese model architecture

In [None]:
def build_siamese_network(vocab_size = VOCAB_SIZE,d_model = D_MODEL,dropout_rate = 0.20,batch_size = BATCH_SIZE,max_length = MAX_LENGTH): 

  def build_tf_lstm_model():
    encoded_question = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question')
    embeddings = layers.Embedding(vocab_size,d_model,input_length=max_length,name = 'embedding_layer')(encoded_question)
    layer_1 =  layers.Bidirectional(layers.LSTM(d_model,activation = 'tanh',return_sequences=True,dropout = dropout_rate,stateful  = True,name = 'lstm_1'),name = 'bidirectional_1')(embeddings)
    layer_2 =  layers.Bidirectional(layers.LSTM(d_model,return_sequences=False,stateful  = True,name = 'lstm_2'),name = 'bidirectional_2')(layer_1)
    lstm_model  = Model(inputs = [encoded_question],outputs = [layer_2],name = 'LSTM')

    return lstm_model


  
  encoded_question1 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question1')
  encoded_question2 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question2')
  
  
  
  lstm_model = build_tf_lstm_model()
  
  embedded_questions1 = lstm_model(encoded_question1)
  embedded_questions2 = lstm_model(encoded_question2)

  lengths = layers.Input(batch_shape = (batch_size,3),name = 'input_lengths')
  lengths = layers.BatchNormalization(name = 'batch_norm_lengths')(lengths)

  lengths_hidden = layers.Dense(d_model//2,name = 'lengths_hidden')(lengths)
  lengths_hidden = layers.BatchNormalization()(lengths_hidden)
  lengths_hidden = layers.Activation('relu')(lengths_hidden)
  lengths_hidden = layers.Dropout(dropout_rate)(lengths_hidden)

  lengths_hidden = layers.Dense(d_model,name = 'lengths_hidden_2')(lengths_hidden)

  concatenated = tf.concat([embedded_questions1,embedded_questions2,lengths_hidden],axis = -1)
  concatenated = layers.BatchNormalization()(concatenated)
  concatenated = layers.Dropout(0.20)(concatenated)

  concatenated_dense = layers.Dense(d_model,name = 'concatenated_dense')(concatenated)
  concatenated_dense = layers.BatchNormalization()(concatenated_dense)
  concatenated_dense = layers.Activation('relu')(concatenated_dense)
  concatenated_dense = layers.Dropout(0.20)(concatenated_dense)

  concatenated_dense = layers.Dense(d_model//2,name = 'concatenated_dense_2')(concatenated_dense)
  concatenated_dense = layers.BatchNormalization()(concatenated_dense)
  concatenated_dense = layers.Activation('relu')(concatenated_dense)
  concatenated_dense = layers.Dropout(0.20)(concatenated_dense)

  
  output = layers.Dense(1,name = 'output_logit')(concatenated_dense)

  NN_model = Model(inputs = [embedded_questions1,embedded_questions2,lengths_hidden],outputs = [output])

  output1 = NN_model((embedded_questions1,embedded_questions2,lengths_hidden))
  output2 = NN_model((embedded_questions2,embedded_questions1,lengths_hidden))

  output_logit = (output1 + output2)/2

  model  = Model(inputs = [encoded_question1,encoded_question2,lengths],outputs = [output_logit])
  
  return model


In [None]:
batch_size = BATCH_SIZE
siamese_model = build_siamese_network()

siamese_model.compile(loss  =  tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                optimizer = Adam(learning_rate  = 5e-3),
                metrics = [tf.keras.metrics.BinaryAccuracy(),tf.keras.metrics.BinaryIoU()])

early_stopping = EarlyStopping(min_delta = 0.01,patience = 3,restore_best_weights=True)



In [None]:
siamese_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(1536, 3)]          0           []                               
                                                                                                  
 lengths_hidden (Dense)         (1536, 150)          600         ['input_4[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (1536, 150)         600         ['lengths_hidden[1][0]']         
 alization)                                                                                       
                                                                                                  
 activation (Activation)        (1536, 150)          0           ['batch_normalization[1][0]

#### Training

In [None]:
history = siamese_model.fit(train_dataset,
                            steps_per_epoch = train_size//batch_size + 1,
                            epochs = 20,
                            validation_data=val_dataset,
                            validation_steps = val_size//batch_size + 1,
                            callbacks = [early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [None]:
siamese_model.save('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_with_features.h5')


### Log-loss on Kaggle test set:- Private: 0.42957 Public: 0.43021




Further improvements could be brought by:-

1.   Use glove embeddings instead of training new embeddings
2.   Experiment with the d_model
3.   Using BERT models or more complex models

## Replacing embedding layer with Glove embeddigs

In [None]:
# voc = vectorize_layer.get_vocabulary()
# word_index = dict(zip(voc, range(len(voc))))


In [None]:
# !wget http://nlp.stanford.edu/data/glove.840B.300d.zip -P "/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/"
# !unzip -q "/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/glove.840B.300d.zip"

In [None]:
# %%time
# path_to_glove_file = os.path.join("/content/glove.840B.300d.txt")

# embeddings_index = {}
# f = open(path_to_glove_file, encoding='utf-8')

# # for line in tqdm(f):
# for line in f:
#     values = line.split()
#     # word = values[0]
#     word = (''.join(values[:-300])).lower()   
#     # coefs = np.asarray(values[1:], dtype='float32')
#     coefs = np.asarray(values[-300:], dtype='float32')
#     if(word in embeddings_index):
#       embeddings_index[word].append(coefs)
#     else:
#       embeddings_index[word] = [coefs]


# f.close()

# print("Found %s word vectors." % len(embeddings_index))


In [None]:
# for word in embeddings_index:
#   embeddings_index[word] = np.mean(embeddings_index[word],axis = 0)


In [1]:
# num_tokens = len(voc) + 2
# embedding_dim = 300
# hits = 0
# misses = 0
# missed_words = []
# # Prepare embedding matrix
# embedding_matrix = np.zeros((num_tokens, embedding_dim))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # Words not found in embedding index will be all-zeros.
#         # This includes the representation for "padding" and "OOV"
#         embedding_matrix[i] = embedding_vector
#         hits += 1
#     else:
#         missed_words.append(word)
#         misses += 1
# print("Converted %d words (%d misses)" % (hits, misses))


Creating embedding layer

In [None]:
# embedding_layer = layers.Embedding(
#     num_tokens,
#     embedding_dim,
#     embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
#     trainable=False,
#     name = 'embedding_layer_300'
# )

In [None]:
# import pickle
# pickle.dump({'config': embedding_layer.get_config(),
#              'weights': embedding_layer.get_weights()}
#             , open("/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/embedding_layer.pkl", "wb"))

In [23]:
import pickle
from_disk = pickle.load(open("/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/embedding_layer.pkl", "rb"))
embedding_layer = layers.Embedding.from_config(from_disk['config'])


In [24]:
def build_siamese_network_glove(vocab_size = VOCAB_SIZE,d_model = D_MODEL,dropout_rate = 0.20,batch_size = BATCH_SIZE,max_length = MAX_LENGTH): 

  def build_tf_lstm_model_glove():
    encoded_question = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question')
    embeddings = embedding_layer(encoded_question)
    layer_1 =  layers.Bidirectional(layers.LSTM(d_model,activation = 'tanh',return_sequences=True,dropout = dropout_rate,stateful  = True,name = 'lstm_1'),name = 'bidirectional_1')(embeddings)
    layer_2 =  layers.Bidirectional(layers.LSTM(d_model,return_sequences=False,stateful  = True,name = 'lstm_2'),name = 'bidirectional_2')(layer_1)
    lstm_model  = Model(inputs = [encoded_question],outputs = [layer_2],name = 'LSTM')

    return lstm_model


  
  encoded_question1 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question1')
  encoded_question2 = layers.Input(batch_shape = (batch_size,max_length),name = 'input_encoded_question2')
  
  
  
  lstm_model = build_tf_lstm_model_glove()
  
  embedded_questions1 = lstm_model(encoded_question1)
  embedded_questions2 = lstm_model(encoded_question2)

  lengths = layers.Input(batch_shape = (batch_size,3),name = 'input_lengths')
  lengths = layers.BatchNormalization(name = 'batch_norm_lengths')(lengths)

  lengths_hidden = layers.Dense(d_model//2,name = 'lengths_hidden1')(lengths)
  lengths_hidden = layers.BatchNormalization()(lengths_hidden)
  lengths_hidden = layers.Activation('relu')(lengths_hidden)
  lengths_hidden = layers.Dropout(dropout_rate)(lengths_hidden)

  lengths_hidden = layers.Dense(d_model,name = 'lengths_hidden2')(lengths_hidden)

  concatenated = tf.concat([embedded_questions1,embedded_questions2,lengths_hidden],axis = -1)
  concatenated = layers.BatchNormalization()(concatenated)
  concatenated = layers.Activation('relu')(concatenated)
  concatenated = layers.Dropout(0.20)(concatenated)

  concatenated_dense = layers.Dense(d_model,name = 'concatenated_dense1')(concatenated)
  concatenated_dense = layers.BatchNormalization()(concatenated_dense)
  concatenated_dense = layers.Activation('relu')(concatenated_dense)
  concatenated_dense = layers.Dropout(0.20)(concatenated_dense)

  concatenated_dense = layers.Dense(d_model//2,name = 'concatenated_dense2')(concatenated_dense)
  concatenated_dense = layers.BatchNormalization()(concatenated_dense)
  concatenated_dense = layers.Activation('relu')(concatenated_dense)
  concatenated_dense = layers.Dropout(0.20)(concatenated_dense)

  
  output = layers.Dense(1,name = 'output_logit')(concatenated_dense)

  NN_model = Model(inputs = [embedded_questions1,embedded_questions2,lengths_hidden],outputs = [output])

  output1 = NN_model((embedded_questions1,embedded_questions2,lengths_hidden))
  output2 = NN_model((embedded_questions2,embedded_questions1,lengths_hidden))

  output_logit = (output1 + output2)/2

  model  = Model(inputs = [encoded_question1,encoded_question2,lengths],outputs = [output_logit])
  
  return model


In [26]:
batch_size = BATCH_SIZE
siamese_model = build_siamese_network_glove(batch_size = batch_size)

siamese_model.compile(loss  =  tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                optimizer='nadam',
                metrics = [tf.keras.metrics.BinaryAccuracy(),tf.keras.metrics.BinaryIoU()])

early_stopping = EarlyStopping(min_delta = 0.01,patience = 5,restore_best_weights=True)



In [33]:
siamese_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(1536, 3)]          0           []                               
                                                                                                  
 lengths_hidden1 (Dense)        (1536, 150)          600         ['input_12[0][0]']               
                                                                                                  
 batch_normalization_8 (BatchNo  (1536, 150)         600         ['lengths_hidden1[1][0]']        
 rmalization)                                                                                     
                                                                                                  
 activation_7 (Activation)      (1536, 150)          0           ['batch_normalization_8[1][

In [34]:
history = siamese_model.fit(train_dataset,
                            batch_size = BATCH_SIZE,
                            steps_per_epoch = train_size//batch_size + 1,
                            epochs = 20,
                            validation_data=val_dataset,
                            validation_steps = val_size//batch_size + 1,
                            callbacks = [early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


In [35]:
siamese_model.save('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_with_features_and_glove.h5')


### Log-loss on Kaggle test set:- Private: 0.36949 Public: 0.36655




### Prediction on test data

#### Prediction using model without glove embeddings

In [None]:
batch_size = 1024
siamese_model = build_siamese_network_glove(batch_size = batch_size)

In [None]:
siamese_model.load_weights('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_with_features.h5')

In [None]:
y_test_predict = siamese_model.predict(test_dataset,steps=test_data.shape[0]//batch_size + 1,verbose = 1)



In [None]:
test_data['is_duplicate'] = y_test_predict[:test_data.shape[0]]

In [None]:
test_data['is_duplicate'] = test_data['is_duplicate'].apply(lambda x : np.exp(x)/(np.exp(x) + 1))

In [None]:
submission = test_data[['test_id','is_duplicate']].copy()
submission.to_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/submission_with_features_.csv',index = False)

#### Prediction using model with glove embeddings

In [None]:
batch_size = 1024
siamese_model = build_siamese_network_glove(batch_size = batch_size)

In [None]:
siamese_model.load_weights('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_with_features_and_glove.h5')

In [None]:
y_test_predict = siamese_model.predict(test_dataset,steps=test_data.shape[0]//batch_size + 1,verbose = 1)



In [None]:
test_data['is_duplicate'] = y_test_predict[:test_data.shape[0]]

In [None]:
test_data['is_duplicate'] = test_data['is_duplicate'].apply(lambda x : np.exp(x)/(np.exp(x) + 1))

In [None]:
submission = test_data[['test_id','is_duplicate']].copy()
submission.to_csv('/content/drive/MyDrive/Machine_Learning/NLP/Text Similarity/quora-questions/siamese_with_features_and_glove.csv',index = False)