# Paraphrase Detection - NLP Course Assignment

## created by : 
- Kaleab Taye - UGR/0490/12
- Estifanos Neway - UGR/4776/12
- Beka Dessalegn - UGR/4605/12

## Description
In this project we have attempted to build and train a model that determine whether two given text entities, such as sentences, convey the same meaning in different words. In order to obtain the paraphrase detection capablity we have tried to implement Bidirectional Long Short-Term Memory (Bi-LSTM) algorithm. The model is designed to analyze both syntactic and semantic features of input text pairs, providing a robust solution to the paraphrase detection challenge.


In [6]:
# Collecting the needed packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Permute, dot, add, concatenate
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Activation,MaxPooling2D,Bidirectional,Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
from keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
import spacy
sp = spacy.load('en_core_web_sm')

## Read Dataset

In [8]:
# data = pd.read_csv(project_path+"questions.csv",nrows=10000)
project_path = '/kaggle/input/corpus2/'
data = pd.read_csv(project_path+"questions.csv",nrows=20000) # the number of rows to be used

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Preprocess Data

In [9]:
# Tokenize, convert to lowercase, remove punctuation, and filter tokens 
table = str.maketrans('', '', string.punctuation)
def clean_question(text): 
    doc = sp(text)
    text = [token.lemma_ for token in doc]
    text = [word.lower() for word in text]
    text = [w.translate(table) for w in text]
    text = [word for word in text if len(word)>1]
    text = [word for word in text if word.isalpha()]
    return ' '.join(text)

In [10]:
data["question1"] = data["question1"].apply(lambda x:clean_question(x))
data["question2"] = data["question2"].apply(lambda x:clean_question(x))

## Feature Extraction
To capture semantic meaning, pre-trained GloVe word embeddings are employed. The tokenized and preprocessed question sequences are padded to a fixed length, facilitating effective model training.

In [12]:
# fit a tokenizer with questions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["question1"].values+data["question2"].values)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 27022


In [13]:
# create sequences
max_len = 25
q1_texts_seq = tokenizer.texts_to_sequences(data["question1"].values)
q2_texts_seq = tokenizer.texts_to_sequences(data["question2"].values)

q1_texts_seq = pad_sequences(q1_texts_seq,maxlen=max_len)
q2_texts_seq = pad_sequences(q2_texts_seq,maxlen=max_len)

In [14]:
# Load Glove vectors
embeddings_index = {} # empty dictionary
f = open(os.path.join("/kaggle/input/glove/", 'glove.6B.200d.txt'), encoding="utf-8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [15]:
embedding_dim = 200

# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

In [16]:
X = np.stack((q1_texts_seq, q2_texts_seq), axis=1)
y = data["is_duplicate"].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Get Question 1/2  train and test features
q1_X_train = X_train[:,0]
q2_X_train = X_train[:,1]

q1_X_test = X_test[:,0]
q2_X_test = X_test[:,1]

## Build Model

Lets build **Bidirectional Long Short-Term Memory with Gated Relevance Network** for Paraphrase Detection

In [19]:
class GatedRelevanceNetwork(Layer):
    def __init__(self, output_dim,
            weights_initializer="glorot_uniform",
            bias_initializer="zeros", **kwargs):
        self.output_dim = output_dim
        self.weights_initializer = weights_initializer
        self.bias_initializer = bias_initializer
        super(GatedRelevanceNetwork, self).__init__(**kwargs)

    def build(self, input_shape):
        batch_size, len1, emb_dim = input_shape[0]
        _, len2, _ = input_shape[1]
        # Weights initialization
        # Bilinear Tensor Product weights
        self.Wb = self.add_weight(name='weights_btp',
                                  shape=(self.output_dim, emb_dim, emb_dim),
                                  initializer=self.weights_initializer,
                                  trainable=True)

        # Single Layer Network weights
        self.Wd = self.add_weight(name='weights_sln',
                                  shape=(2*emb_dim, self.output_dim),
                                  initializer=self.weights_initializer,
                                  trainable=True)

        # Gate weights
        self.Wg = self.add_weight(name='weights_gate',
                                  shape=(2*emb_dim, self.output_dim),
                                  initializer=self.weights_initializer,
                                  trainable=True)

        # Gate bias
        self.bg = self.add_weight(name='bias_gate',
                                  shape=(self.output_dim,),
                                  initializer=self.bias_initializer,
                                  trainable=True)

        # General bias
        self.b = self.add_weight(name='bias',
                                 shape=(self.output_dim,),
                                 initializer=self.bias_initializer,
                                 trainable=True)

        # Channel weights
        self.u = self.add_weight(name="channel_weights",
                                 shape=(self.output_dim, 1),
                                 initializer=self.weights_initializer,
                                 trainable=True)

        super(GatedRelevanceNetwork, self).build(input_shape)

    def call(self, x):
        e1 = x[0]
        e2 = x[1]

        batch_size = K.shape(e1)[0]
        # Usually len1 = len2 = max_seq_length
        _, len1, emb_dim = K.int_shape(e1)
        _, len2, _ = K.int_shape(e2)

        # Repeating the matrices to generate all the combinations
        ne1 = K.reshape(K.repeat_elements(K.expand_dims(e1, axis=2), len2, axis=2),
                       (batch_size, len1*len2, emb_dim))
        ne2 = K.reshape(K.repeat_elements(K.expand_dims(e2, axis=1), len1, axis=1),
                       (batch_size, len1*len2, emb_dim))

        # Repeating the second matrix to use in Bilinear Tensor Product
        ne2_k = K.repeat_elements(K.expand_dims(ne2, axis=-1), self.output_dim, axis=-1)

        # Bilinear tensor product
        btp = K.sum(ne2_k * K.permute_dimensions(K.dot(ne1, self.Wb), (0,1,3,2)), axis=2)
        btp = K.reshape(btp, (batch_size, len1, len2, self.output_dim))

        # Concatenating inputs to apply Single Layer Network
        e = K.concatenate([ne1, ne2], axis=-1)

        # Single Layer Network
        #sln = K.relu(K.dot(e, self.Wd))
        sln = K.tanh(K.dot(e, self.Wd))
        sln = K.reshape(sln, (batch_size, len1, len2, self.output_dim))

        # Gate
        g = K.sigmoid(K.dot(e, self.Wg) + self.bg)
        g = K.reshape(g, (batch_size, len1, len2, self.output_dim))

        # Gated Relevance Network
        #s = K.reshape(K.dot(g*btp + (1-g)*sln + self.b, self.u), (batch_size, len1, len2))
        s = K.dot(g*btp + (1-g)*sln + self.b, self.u)

        return s

    def compute_output_shape(self, input_shape):
        shape1 = input_shape[0]
        shape2 = input_shape[1]
        return (shape1[0], shape1[1], shape2[1], 1)

In [20]:
def create_model(input_shape,
                      embeddings_dim, embeddings_matrix, vocab_size,
                      max_seq_length, trainable_embeddings, dropout,
                      lstm_hidden_units, attention_channels, pool_size,
                      fc_hidden_units):
   
    X1_input = Input(input_shape, name="input_X1")
    X2_input = Input(input_shape, name="input_X2")

    # Encoding the inputs using the same weights
    # Output shape: (batch_size, max_seq_length, lstm_hidden_units)
    embeddor = Embedding(vocab_size,
                    embeddings_dim,
                    weights=[embeddings_matrix],
                    input_length=input_shape[0],
                    trainable=trainable_embeddings,
                    mask_zero=False)
    X1 = embeddor(X1_input)
    X2 = embeddor(X2_input)

    encoder = Bidirectional(LSTM(lstm_hidden_units, return_sequences=True))

    # Output shape: (batch_size, max_seq_length, lstm_hidden_units)
    X1_encoded = encoder(X1)
    X2_encoded = encoder(X2)

    # Attention matrix
    # Output shape: (batch_size, max_seq_length, max_seq_length, 1)
    X = GatedRelevanceNetwork(attention_channels, name="grn")([X1_encoded, X2_encoded])
    #X = BatchNormalization()(X)

    # Non-overlapping 2D max pooling
    # Output shape: (batch_size, pooled_rows, pooled_cols, 1)
    print("shape before pool", X.shape)
    X = MaxPooling2D(pool_size=(pool_size, pool_size),
                        strides=(pool_size, pool_size),
                        padding='valid',
                        data_format="channels_last",
                        name="max_pool")(X)
    X = Flatten()(X)

    # Multi-Layer Perceptron
    #X = Dropout(dropout)(X)
    X = Dense(fc_hidden_units, activation="tanh", name="mlp")(X)
    X = Dropout(dropout)(X)
    X = Dense(2, activation="softmax", name="output")(X)

    model = Model(inputs=[X1_input, X2_input], outputs=X, name="GRN_model")
    # Compiling model
    #optimizer = optimizers.Adam(lr=0.001)
    optimizer = optimizers.RMSprop()
    model.compile(optimizer=optimizer,
                loss="binary_crossentropy",
                metrics=["accuracy"])
    return model

In [21]:
dropout = 0.5
trainable_embeddings = False
lstm_hidden_units = 50
attention_channels = 2
pool_size = 3
fc_hidden_units = 128
use_class_weight = False
input_shape = (max_len,)
model = create_model(input_shape,
                      embedding_dim, embedding_matrix, vocab_size,
                      max_len, trainable_embeddings, dropout,
                      lstm_hidden_units, attention_channels, pool_size,
                      fc_hidden_units)
model.summary()

shape before pool (None, 25, 25, 1)
Model: "GRN_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_X1 (InputLayer)       [(None, 25)]                 0         []                            
                                                                                                  
 input_X2 (InputLayer)       [(None, 25)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 25, 200)              5404400   ['input_X1[0][0]',            
                                                                     'input_X2[0][0]']            
                                                                                                  
 bidirectional (Bidirection  (None, 25, 100)          

In [23]:
reduce_alpha = ReduceLROnPlateau(monitor ='val_loss', factor = 0.2, patience = 1, min_lr = 0.001)
# stop traning if there increase in loss
callbacks = [reduce_alpha] 

In [24]:
epochs = 30
batch_size = 64
history = model.fit(x=[q1_X_train, q2_X_train],
                    y=to_categorical(y_train),
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=([q1_X_test, q2_X_test], to_categorical(y_test)),callbacks=callbacks)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [25]:
outputpath = '/kaggle/working/'
filepath = outputpath+'model_paraprase_detection_pad.h5'
model.save_weights(filepath)

In [35]:
print("Evaluation (loss, acc)")
loss, acc = model.evaluate(x=[q1_X_test, q2_X_test], y=to_categorical(y_test))
# print("loss: {:.4f}   acc: {:.4f}".format(loss, acc))
pred = np.argmax(model.predict(x=[q1_X_test, q2_X_test]), axis=1)
f1 = f1_score(y_test, pred)
print("f1 score : {:.4f}".format(f1))
print("confusion matrix : ")
cf_mat = confusion_matrix(y_test, pred)
print(cf_mat)

Evaluation (loss, acc)
f1 score : 0.6073
confusion matrix : 
[[1732  776]
 [ 503  989]]


In [45]:
# Testing the model 
# Import necessary libraries
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Load the saved model
model = create_model(input_shape,
                      embedding_dim, embedding_matrix, vocab_size,
                      max_len, trainable_embeddings, dropout,
                      lstm_hidden_units, attention_channels, pool_size,
                      fc_hidden_units)

model.load_weights(filepath)  # Replace with the actual path

# Assume you have a new set of test data in a dictionary format
new_test_data_dict = {
    'question1': ["How does photosynthesis work?", "What are the benefits of exercise?", "Python vs Java",],
    'question2': ["what is the working mechanism of photosynthesis.", "where is the closest supermarket around here?", "Comparison between Python and Java"],
    'is_duplicate': [1, 0, 1]  # The true labels (1 for duplicate, 0 for non-duplicate)
}

# Create a DataFrame
new_test_data = pd.DataFrame(new_test_data_dict)

# Preprocess the test data (similar to what you did for training data)
new_test_data["question1"] = new_test_data["question1"].apply(lambda x: clean_question(x))
new_test_data["question2"] = new_test_data["question2"].apply(lambda x: clean_question(x))

# Tokenize and pad sequences
q1_texts_seq_test = tokenizer.texts_to_sequences(new_test_data["question1"].values)
q2_texts_seq_test = tokenizer.texts_to_sequences(new_test_data["question2"].values)

q1_texts_seq_test = pad_sequences(q1_texts_seq_test, maxlen=max_len)
q2_texts_seq_test = pad_sequences(q2_texts_seq_test, maxlen=max_len)

# Assuming 'is_duplicate' is the column with true labels
true_labels = new_test_data["is_duplicate"].values
print(q1_texts_seq_test)
# Make predictions
predictions = model.predict([q1_texts_seq_test, q2_texts_seq_test])
print(predictions)
# Assuming your model outputs probabilities for each class (binary classification)
# If you used softmax activation in the output layer, you can use argmax to get the predicted class
predicted_labels = np.argmax(predictions, axis=1)

# Convert predicted probabilities to percentage values
percentage_predictions = predictions[:, 1] * 100  # Assuming index 1 corresponds to the positive class

# Print the percentage predictions
print("Percentage Predictions:", percentage_predictions)


# Evaluate the predictions
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Print results with more decimal places
print("Accuracy: {:.6f}".format(accuracy))
print("F1 Score: {:.6f}".format(f1))
print("Confusion Matrix:")
print(conf_matrix)

def getSimilarity(s1, s2):
    cleanedS1 = clean_question(s1)
    cleanedS2 = clean_question(s2)


shape before pool (None, 25, 25, 1)
[[1.3725557e-06 9.9999857e-01]]
Percentage Predictions: [99.999855]
Accuracy: 0.000000
F1 Score: 0.000000
Confusion Matrix:
[[0 1]
 [0 0]]


In [85]:
import math
def getSimilarity(s1, s2):
    test_data = {
        's1': [s1],
        's2': [s2],
    }
    test_data_frame = pd.DataFrame(test_data)
    test_data_frame["s1"] = test_data_frame["s1"].apply(lambda x: clean_question(x))
    test_data_frame["s2"] = test_data_frame["s2"].apply(lambda x: clean_question(x))

    s1_texts_seq_test = tokenizer.texts_to_sequences(test_data_frame["s1"].values)
    s2_texts_seq_test = tokenizer.texts_to_sequences(test_data_frame["s2"].values)

    s1_texts_seq_test = pad_sequences(s1_texts_seq_test, maxlen=max_len)
    s2_texts_seq_test = pad_sequences(s2_texts_seq_test, maxlen=max_len)

    assessment = model.predict([s1_texts_seq_test, s2_texts_seq_test])
    assessmentP = assessment[:, 1] * 100
    similarity= assessmentP.astype(float)[0]
    similarity = math.trunc(similarity*100)/100
    lable = "Not Paraphrase"
    if similarity > 70:
        lable = "Paraphrase"
    return similarity, lable

In [86]:
import math
s1 = "How does photosynthesis work?"
s2 = "what is the working mechanism of photosynthesis."

similarity = getSimilarity(s1, s2)
print("Similarity:", similarity)

Similarity: (99.99, 'Paraphrase')


In [87]:
s1 = "What are the benefits of exercise?"
s2 = "where is the closest supermarket around here?"

similarity = getSimilarity(s1, s2)
print("Similarity:", similarity)

Similarity: (0.0, 'Not Paraphrase')
