In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import *

In [2]:
# Read the training CSV into a DataFrame and preview its first rows
train_csv_path = "msr_paraphrase_train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [3]:
# (rows, columns) of the training frame as loaded, before any rows are dropped
train_df.shape

(4076, 5)

In [4]:
# Per-column dtype and non-null count; a count below the row total reveals missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4076 entries, 0 to 4075
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Quality    4076 non-null   int64 
 1   #1 ID      4076 non-null   int64 
 2   #2 ID      4076 non-null   int64 
 3   #1 String  4076 non-null   object
 4   #2 String  4045 non-null   object
dtypes: int64(3), object(2)
memory usage: 159.3+ KB


In [5]:
# Data-quality check 1: rows that are exact duplicates of an earlier row
duplicates = train_df.duplicated()
duplicate_row_count = duplicates.sum()
print("Number of duplicate rows:", duplicate_row_count)

# Data-quality check 2: missing values, counted per column
nulls = train_df.isnull()
missing_per_column = nulls.sum()
print("Number of missing values :\n", missing_per_column)

Number of duplicate rows: 0
Number of missing values :
 Quality       0
#1 ID         0
#2 ID         0
#1 String     0
#2 String    31
dtype: int64


In [6]:
# Drop every row that has a missing value in any column.
# Reassigning (instead of inplace=True) keeps the step chainable, and resetting the
# index makes row labels contiguous again so they line up with the positional
# NumPy arrays built from this frame later on.
train_df = train_df.dropna().reset_index(drop=True)

# Confirm that no missing values remain
nulls = train_df.isnull()
print("Number of missing values :\n", nulls.sum())

Number of missing values :
 Quality      0
#1 ID        0
#2 ID        0
#1 String    0
#2 String    0
dtype: int64


In [7]:
# Distribution of Similar and non-similar strings
# Slice sizes must be the number of rows per 'Quality' label. Using values='#1 ID'
# would add up the sentence IDs inside each class, which says nothing about class balance.
train_class_counts = (
    train_df['Quality']
    .value_counts()
    .rename_axis('Quality')
    .reset_index(name='count')
)
fig = px.pie(train_class_counts, values='count', names='Quality', title='Class Distribution for Train Data')
fig.show()

In [8]:
# Cleaning function for the strings
wn = nltk.WordNetLemmatizer()

# Filled on first use rather than at definition time, so defining these helpers
# does not require the NLTK stopword corpus to be available yet.
_stopword_cache = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, reading the corpus only once."""
    global _stopword_cache
    if _stopword_cache is None:
        _stopword_cache = frozenset(nltk.corpus.stopwords.words('english'))
    return _stopword_cache


def clean_string(input_str):
    """Normalise one sentence for tokenisation.

    Steps, in order: lowercase; remove http links, www links and e-mail-like
    tokens; remove ASCII punctuation; drop English stopwords; lemmatise each
    remaining token as a noun.

    Args:
        input_str: the raw sentence. Any non-string value (e.g. NaN or None)
            is treated as an empty sentence.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing remains).
    """
    if not isinstance(input_str, str):
        return ""

    # Lowercase the input string
    text = input_str.lower()

    # Remove URLs, links and e-mail-like tokens
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\.\S+", "", text)  # dot escaped so only a literal "www." prefix matches
    text = re.sub(r"\S+@\S+", "", text)

    # Remove punctuation characters
    text = "".join(char for char in text if char not in string.punctuation)

    # NOTE(review): punctuation is stripped before the stopword filter, so contractions
    # lose their apostrophe ("didn't" -> "didnt") and may no longer match stopword
    # entries that are spelled with one - confirm this is intended.
    stopword = _english_stopwords()

    # Split on non-word characters; skip the empty strings re.split yields at the
    # edges so the result carries no stray leading/trailing spaces.
    tokens = [word for word in re.split(r'\W+', text) if word and word not in stopword]

    # Lemmatization (noun part-of-speech)
    return " ".join(wn.lemmatize(word, 'n') for word in tokens)

In [9]:
# Fetch the NLTK resources used by clean_string: the stopword lists and the
# WordNet data behind the lemmatizer. Must run before clean_string is first called.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Clean both raw sentence columns of the training frame into new columns,
# then show raw and cleaned text side by side for a quick sanity check
for raw_column, cleaned_column in (('#1 String', 'sentence_1'), ('#2 String', 'sentence_2')):
    train_df[cleaned_column] = train_df[raw_column].apply(clean_string)

preview_columns = ['#1 String', 'sentence_1', '#2 String', 'sentence_2']
train_df[preview_columns].head()

Unnamed: 0,#1 String,sentence_1,#2 String,sentence_2
0,"Amrozi accused his brother, whom he called ""th...",amrozi accused brother called witness delibera...,"Referring to him as only ""the witness"", Amrozi...",referring witness amrozi accused brother delib...
1,Yucaipa owned Dominick's before selling the ch...,yucaipa owned dominick selling chain safeway 1...,Yucaipa bought Dominick's in 1995 for $693 mil...,yucaipa bought dominick 1995 693 million sold ...
2,They had published an advertisement on the Int...,published advertisement internet june 10 offer...,"On June 10, the ship's owners had published an...",june 10 ship owner published advertisement int...
3,"Around 0335 GMT, Tab shares were up 19 cents, ...",around 0335 gmt tab share 19 cent 44 a456 earl...,"Tab shares jumped 20 cents, or 4.6%, to set a ...",tab share jumped 20 cent 46 set record closing...
4,"The stock rose $2.11, or about 11 percent, to ...",stock rose 211 11 percent close friday 2151 ne...,PG&E Corp. shares jumped $1.63 or 8 percent to...,pge corp share jumped 163 8 percent 2103 new y...


In [11]:
# Read the test CSV into a DataFrame and preview its first rows
test_csv_path = "msr_paraphrase_test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,1089874,1089925,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,1,3019446,3019327,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,1,1945605,1945824,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,0,1430402,1430329,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,0,3354381,3354396,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...


In [12]:
# Per-column dtype and non-null count for the test frame; a count below the row total reveals missing values
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1725 entries, 0 to 1724
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Quality    1725 non-null   int64 
 1   #1 ID      1725 non-null   int64 
 2   #2 ID      1725 non-null   int64 
 3   #1 String  1725 non-null   object
 4   #2 String  1713 non-null   object
dtypes: int64(3), object(2)
memory usage: 67.5+ KB


In [13]:
# Data-quality check 1: rows that are exact duplicates of an earlier row
duplicates = test_df.duplicated()
duplicate_row_count = duplicates.sum()
print("Number of duplicate rows:", duplicate_row_count)

# Data-quality check 2: missing values, counted per column
nulls = test_df.isnull()
missing_per_column = nulls.sum()
print("Number of missing values :\n", missing_per_column)

Number of duplicate rows: 0
Number of missing values :
 Quality       0
#1 ID         0
#2 ID         0
#1 String     0
#2 String    12
dtype: int64


In [14]:
# Drop every row that has a missing value in any column.
# Reassigning (instead of inplace=True) keeps the step chainable, and resetting the
# index makes row labels contiguous again so they line up with the positional
# NumPy arrays built from this frame later on.
test_df = test_df.dropna().reset_index(drop=True)

# Confirm that no missing values remain
nulls = test_df.isnull()
print("Number of missing values :\n", nulls.sum())

Number of missing values :
 Quality      0
#1 ID        0
#2 ID        0
#1 String    0
#2 String    0
dtype: int64


In [15]:
# Distribution of Similar and non-similar strings
# Slice sizes must be the number of rows per 'Quality' label. Using values='#1 ID'
# would add up the sentence IDs inside each class, which says nothing about class balance.
test_class_counts = (
    test_df['Quality']
    .value_counts()
    .rename_axis('Quality')
    .reset_index(name='count')
)
fig = px.pie(test_class_counts, values='count', names='Quality', title='Class Distribution for Test Data')
fig.show()

In [16]:
# Clean both raw sentence columns of the test frame into new columns,
# then show raw and cleaned text side by side for a quick sanity check
for raw_column, cleaned_column in (('#1 String', 'sentence_1'), ('#2 String', 'sentence_2')):
    test_df[cleaned_column] = test_df[raw_column].apply(clean_string)

preview_columns = ['#1 String', 'sentence_1', '#2 String', 'sentence_2']
test_df[preview_columns].head()

Unnamed: 0,#1 String,sentence_1,#2 String,sentence_2
0,"PCCW's chief operating officer, Mike Butcher, ...",pccws chief operating officer mike butcher ale...,Current Chief Operating Officer Mike Butcher a...,current chief operating officer mike butcher g...
1,The world's two largest automakers said their ...,world two largest automaker said u sale declin...,Domestic sales at both GM and No. 2 Ford Motor...,domestic sale gm 2 ford motor co declined pred...
2,According to the federal Centers for Disease C...,according federal center disease control preve...,The Centers for Disease Control and Prevention...,center disease control prevention said 19 repo...
3,A tropical storm rapidly developed in the Gulf...,tropical storm rapidly developed gulf mexico s...,A tropical storm rapidly developed in the Gulf...,tropical storm rapidly developed gulf mexico s...
4,The company didn't detail the costs of the rep...,company didnt detail cost replacement repair,But company officials expect the costs of the ...,company official expect cost replacement work ...


In [17]:
# Prepare the data for GloVe embeddings
# The vocabulary is fitted on the cleaned training sentences only (both columns)
tokenizer = Tokenizer()
train_corpus = [
    *train_df['sentence_1'].values.astype(str),
    *train_df['sentence_2'].values.astype(str),
]
tokenizer.fit_on_texts(train_corpus)

# Every sentence is encoded as exactly 50 token ids, padded at the end
max_len = 50


def _encode_and_pad(sentences):
    """Turn a column of cleaned sentences into post-padded id sequences of width max_len."""
    token_ids = tokenizer.texts_to_sequences(sentences.values.astype(str))
    return pad_sequences(token_ids, maxlen = max_len, padding = 'post')


# Training sentences
train_str_1 = _encode_and_pad(train_df['sentence_1'])
train_str_2 = _encode_and_pad(train_df['sentence_2'])

# Test sentences, encoded with the vocabulary learned from the training data
test_str_1 = _encode_and_pad(test_df['sentence_1'])
test_str_2 = _encode_and_pad(test_df['sentence_2'])

In [18]:
# Vocabulary learned by the tokenizer: maps each word to its integer id
word_index = tokenizer.word_index
# Number of distinct words in the vocabulary
len(word_index)

13119

In [19]:
# Get the glove embeddings
# Each non-empty line is parsed as "<token> <coef_1> <coef_2> ...": the token becomes
# the dictionary key and the remaining fields become its float vector.
glove_dictionary = {}
# The encoding is given explicitly so the read does not depend on the platform's
# default locale encoding (the GloVe release is distributed as UTF-8 text).
with open('glove.6B.200d.txt', encoding='utf-8') as glove_file:
    for each_line in glove_file:
        # A blank line cannot be unpacked into (token, coefficients); skip it
        if not each_line.strip():
            continue
        token, coefficient_text = each_line.split(maxsplit=1)
        glove_dictionary[token] = np.array(coefficient_text.split(), dtype = float)

In [20]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose tokenizer id is i.
# One extra row is allocated beyond the vocabulary size; rows without a GloVe match stay all-zero.
vocabulary_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocabulary_rows, 200))

for vocab_word, row_id in word_index.items():
    glove_vector = glove_dictionary.get(vocab_word)
    if glove_vector is None:
        # Word not present in GloVe: leave its row as zeros
        continue
    embedding_matrix[row_id] = glove_vector

embedding_matrix.shape

(13120, 200)

In [21]:
# Define (not yet train) one branch per sentence. Both branches share the same
# architecture but have their own weights, so they are built by one helper
# instead of two copy-pasted layer stacks.
def build_sentence_tower(embedding_weights, sequence_length):
    """Build one sentence branch.

    Layers, in order: frozen Embedding initialised from embedding_weights,
    LSTM(128, relu) -> Dropout(0.4) -> LSTM(128) -> Dropout(0.2) ->
    Dense(64, relu) -> Dropout(0.4) -> Dense(2, sigmoid). Both LSTMs return
    full sequences, so the Dense layers act on every timestep and the branch
    output has shape (batch, sequence_length, 2).

    Args:
        embedding_weights: 2-D array of shape (vocabulary rows, embedding dim)
            used as the fixed weights of the Embedding layer.
        sequence_length: number of token ids in each padded input sentence.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    vocabulary_rows, embedding_dim = embedding_weights.shape

    tower = tf.keras.Sequential()
    tower.add(Embedding(input_dim = vocabulary_rows,
                        output_dim = embedding_dim,
                        weights = [embedding_weights],
                        input_length = sequence_length,
                        trainable = False))  # keep the pretrained vectors fixed
    # NOTE(review): relu replaces the LSTM's default activation here - confirm this is intended.
    tower.add(LSTM(128, activation = 'relu', return_sequences = True))
    tower.add(Dropout(0.4))
    tower.add(LSTM(128, return_sequences = True))
    tower.add(Dropout(0.2))
    tower.add(Dense(64, activation = 'relu'))
    tower.add(Dropout(0.4))
    tower.add(Dense(2, activation = 'sigmoid'))
    return tower


# Sentence 1 Model
model_s1 = build_sentence_tower(embedding_matrix, max_len)

# Sentence 2 Model
model_s2 = build_sentence_tower(embedding_matrix, max_len)



In [22]:
# Combine the output of the 2 models
# Element-wise product of the two sentence branches (model_s1 and model_s2)
merge_out = Multiply()([model_s1.output, model_s2.output])

# Flatten the merged tensor and pass it through a small dense classifier head
merge_out = Flatten()(merge_out)
merge_out = Dense(128, activation = 'relu')(merge_out)
merge_out = Dropout(0.1)(merge_out)
merge_out = Dense(64, activation = 'relu')(merge_out)
merge_out = Dropout(0.2)(merge_out)
merge_out = Dense(32, activation = 'relu', kernel_regularizer = 'l2')(merge_out)
merge_out = Dropout(0.1)(merge_out)
# The model is trained with sparse_categorical_crossentropy over two mutually
# exclusive classes, which expects the two outputs to form a probability
# distribution. Softmax guarantees that; two independent sigmoids do not.
merge_out = Dense(2, activation = 'softmax')(merge_out)

In [23]:
# Assemble the two-input model: one input per sentence branch, merged head as output
model = tf.keras.models.Model([model_s1.input, model_s2.input], merge_out)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the table.
model.summary()
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# validation metrics are computed on the same data a final test evaluation would use.
# Consider holding out part of the training data for validation instead.
history = model.fit([train_str_1, train_str_2], train_df['Quality'], batch_size = 128, epochs = 10, validation_data = ([test_str_1, test_str_2], test_df['Quality']))

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 embedding_input (InputLayer)   [(None, 50)]         0           []                               
                                                                                                  
 embedding_1_input (InputLayer)  [(None, 50)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 200)      2624000     ['embedding_input[0][0]']        
                                                                                                  
 embedding_1 (Embedding)        (None, 50, 200)      2624000     ['embedding_1_input[0][0]']      
                                                                                              