In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/kaggle/input/quora-question-dataset/train.csv')
df.shape

In [None]:
df.head()

In [None]:
df.value_counts('is_duplicate')

In [None]:
# The two classes are unbalanced, so balance them by undersampling the majority class.
duplicate_flag = df['is_duplicate']
class_0 = df.loc[duplicate_flag == 0]   ## majority class
class_1 = df.loc[duplicate_flag == 1]   ## minority class

# Draw exactly as many class-0 rows as there are class-1 rows (fixed seed).
class_0_undersampled = class_0.sample(n=len(class_1), random_state=42)

# Stack both classes, then shuffle all rows so the labels are interleaved.
balanced = pd.concat([class_0_undersampled, class_1], axis=0)
sampled_df = balanced.sample(frac=1, random_state=42)

sampled_df



In [None]:
##data pre processing
# Contraction -> expansion lookup, built once instead of on every call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Special characters replaced by their word equivalents, applied in this order.
_SYMBOL_WORDS = (
    ('%', ' percent'),
    ('$', ' dollar '),
    ('₹', ' rupee '),
    ('€', ' euro '),
    ('@', ' at '),
)

# Round-number abbreviations, applied in this order (largest unit first).
# The negative lookaheads keep a pattern from firing in the middle of a longer
# number ("10001" must not become "1k1") while still matching at the very end
# of the string or right before punctuation ("1,000?" -> "1k?").
_NUMBER_ABBREVIATIONS = (
    (re.compile(r',000,000,000(?![0-9,])'), 'b'),
    (re.compile(r',000,000(?![0-9,])'), 'm'),
    (re.compile(r',000(?![0-9,])'), 'k'),
    (re.compile(r'([0-9]+)000000000(?![0-9])'), r'\1b'),
    (re.compile(r'([0-9]+)000000(?![0-9])'), r'\1m'),
    (re.compile(r'([0-9]+)000(?![0-9])'), r'\1k'),
)

# Generic contraction endings handled after the dictionary lookup, so that
# words followed by punctuation (e.g. "wasn't?") are still expanded.
_SUFFIX_EXPANSIONS = (
    ("'ve", " have"),
    ("n't", " not"),
    ("'re", " are"),
    ("'ll", " will"),
)

# Raw string: '\W' in a normal string literal is an invalid escape sequence.
_NON_WORD = re.compile(r'\W')


def preprocess(q):
    """Normalise one raw question string before tokenisation.

    Steps, in order:
      1. lower-case and strip surrounding whitespace;
      2. spell out ``%``, ``$``, ``₹``, ``€`` and ``@``;
      3. drop the literal ``[math]`` marker;
      4. abbreviate round thousands / millions / billions to ``k`` / ``m`` / ``b``;
      5. expand English contractions (exact-word lookup, then generic endings);
      6. strip HTML markup with BeautifulSoup (explicit ``html.parser``);
      7. replace every non-word character with a space and strip again.

    Args:
        q: The raw question. Non-string values are converted with ``str()``;
            ``None`` and float NaN (a missing cell) produce an empty string
            instead of the literal tokens "none" / "nan".

    Returns:
        The cleaned question as a ``str``. Consecutive spaces may remain where
        punctuation was removed.

    Example:
        ``preprocess("I've already! wasn't <b>done</b>?")`` returns
        ``'i have already  was not done'``.
    """
    # A missing question must not turn into the word "nan".
    if q is None or (isinstance(q, float) and q != q):
        return ''

    q = str(q).lower().strip()

    for symbol, word in _SYMBOL_WORDS:
        q = q.replace(symbol, word)

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    for pattern, abbreviation in _NUMBER_ABBREVIATIONS:
        q = pattern.sub(abbreviation, q)

    # Whole-word contractions first, then the generic endings.
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())
    for suffix, expansion in _SUFFIX_EXPANSIONS:
        q = q.replace(suffix, expansion)

    # Naming the parser avoids the "no parser was explicitly specified" warning
    # and keeps the output independent of which optional parsers are installed.
    q = BeautifulSoup(q, 'html.parser').get_text()

    # Remove punctuation.
    return _NON_WORD.sub(' ', q).strip()

In [None]:
preprocess("I've already! wasn't <b>done</b>?")

In [None]:
# Run the text normaliser over both question columns.
for column in ('question1', 'question2'):
    sampled_df[column] = sampled_df[column].apply(preprocess)

In [None]:
sampled_df.head()

In [None]:
#checking if there is no null values in our dataset
sampled_df.isnull().sum()

In [None]:
# Checking for duplicate question pairs in the balanced, cleaned frame.
# The previous check ran on the raw `df` and compared every column, including
# the row-id columns (presumably unique), so it said nothing about the data
# that is actually used from here on.
sampled_df.duplicated(subset=['question1', 'question2']).sum()

In [15]:
!pip install  nltk



In [None]:
#removing stop words from our dataset
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
STOP_WORDS = stopwords.words("english")

In [16]:
#removing stopwords from our dataset

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Lazily filled cache so the NLTK corpus is read once, not once per row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Drop stop words from ``text`` and return the remaining tokens joined by spaces.

    The text is tokenised with ``word_tokenize`` and compared as-is: no case
    conversion happens here, so callers should pass already lower-cased text.

    Args:
        text: The string to filter.
        stop_words: Optional container of words to remove (anything supporting
            ``in``). Defaults to the cached NLTK English stop-word list.

    Returns:
        A single string of the kept tokens separated by one space.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)


# Filter stop words out of both question columns.
for column in ("question1", "question2"):
    sampled_df[column] = sampled_df[column].apply(remove_stopwords)


In [17]:
sampled_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
364720,364720,494790,484904,god hate gays made,god hates homosexuality make possible,1
221233,221233,328525,328526,happens photon hits retina eye,photon go hitting retina,1
24938,24938,46524,46525,brink another battle united north jon snow kni...,possible jon snow sansa discover petyr role ed...,0
297158,297158,419495,270417,lose excessive fat two weeks,lose 5kg fat within two weeks,0
212447,212447,9163,46561,best coaching institutes gmat delhi ncr,coaching institute best gmat delhi ncr locatio...,1


In [18]:
# The identifier columns carry no signal for the model, so leave them out.
id_columns = ['id', 'qid1', 'qid2']
final_df = sampled_df.drop(id_columns, axis=1)


In [19]:
from sklearn.model_selection import train_test_split

# Inputs are the two cleaned question columns; the target stays a one-column frame.
feature_columns = ['question1', 'question2']
target_columns = ['is_duplicate']
x = final_df[feature_columns]
y = final_df[target_columns]

# Hold out 20% of the rows, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [20]:
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam

In [21]:
def _clean_text(value):
    """Lower-case a string cell; map anything that is not a string to ""."""
    return value.lower() if isinstance(value, str) else ""


# The train/test frames were already split off from `x`, so cleaning `x` alone
# never reaches the data that is tokenised later. Clean all three frames.
for frame in (x, X_train, X_test):
    for column in ("question1", "question2"):
        frame[column] = frame[column].apply(_clean_text)

In [22]:
glove_file = "/kaggle/input/glove-embeddings/glove.6B.300d.txt"
embedding_dim = 300


def load_glove_embeddings(path, dim):
    """Read a GloVe-style text file into a ``{token: float32 vector}`` dict.

    Each non-blank line is expected to hold a token followed by ``dim``
    whitespace-separated numbers. The last ``dim`` fields are taken as the
    vector, so a token that itself contains whitespace is kept intact rather
    than being parsed as part of the vector.

    Args:
        path: Location of the embeddings text file (read as UTF-8).
        dim: Number of vector components per line.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array of length ``dim``.

    Raises:
        ValueError: If a non-blank line has fewer than ``dim + 1`` fields, or a
            vector component cannot be parsed as a float.
    """
    index = {}
    with open(path, encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            if len(fields) <= dim:
                raise ValueError(
                    f"{path}:{line_number}: expected a token and {dim} values, "
                    f"found {len(fields)} fields"
                )
            token = " ".join(fields[:-dim])
            index[token] = np.asarray(fields[-dim:], dtype="float32")
    return index


embeddings_index = load_glove_embeddings(glove_file, embedding_dim)


# Build the vocabulary from the questions of the training split, then of the
# test split (both question columns each time).
tokenizer = Tokenizer()
for split in (X_train, X_test):
    tokenizer.fit_on_texts(split["question1"].tolist() + split["question2"].tolist())

# Token -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index





In [23]:
# Turn each question column into lists of integer token ids, split by split.
question_columns = ("question1", "question2")
q1_sequences_train, q2_sequences_train = (
    tokenizer.texts_to_sequences(X_train[column]) for column in question_columns
)
q1_sequences_test, q2_sequences_test = (
    tokenizer.texts_to_sequences(X_test[column]) for column in question_columns
)

In [24]:
print(len(q1_sequences_test))
print(len(y_test))

59706
59706


In [25]:
# Longest tokenised question in each of the four sequence lists.
max_len1 = max(len(seq) for seq in q1_sequences_train)
max_len2 = max(len(seq) for seq in q2_sequences_train)
# This used to measure q2_sequences_test a second time, so the first test
# question column was never taken into account.
max_len3 = max(len(seq) for seq in q1_sequences_test)
max_len4 = max(len(seq) for seq in q2_sequences_test)

# Common padded length: the longest sequence found anywhere.
max_len = max(max_len1, max_len2, max_len3, max_len4)



In [26]:
def _pad_to_max_len(sequences):
    """Apply pad_sequences with maxlen=max_len and padding="post"."""
    return pad_sequences(sequences, maxlen=max_len, padding="post")


q1_padded_train = _pad_to_max_len(q1_sequences_train)
q2_padded_train = _pad_to_max_len(q2_sequences_train)
q1_padded_test = _pad_to_max_len(q1_sequences_test)
q2_padded_test = _pad_to_max_len(q2_sequences_test)

# Two-input form expected by the model: [question 1 batch, question 2 batch].
x1_train = [q1_padded_train, q2_padded_train]
x1_test = [q1_padded_test, q2_padded_test]



In [27]:
# One row per token id (plus row 0, which no token id from word_index fills here).
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector for every token that has one; the rest stay zero.
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [None]:
# `tf` is used throughout this cell but was never imported by name in the
# notebook (only `tensorflow.keras` submodules were), so import it here.
import tensorflow as tf

# Pick a distribution strategy: TPU when one can be resolved, otherwise the
# default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # Detect TPU
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)  # TPU strategy
    print("Running on TPU")
except ValueError:
    # NOTE(review): assumes a missing TPU surfaces as ValueError — confirm.
    strategy = tf.distribute.get_strategy()  # Default strategy for CPU/GPU
    print("Running on GPU/CPU")

# Smoke test: create one variable under the chosen strategy's scope.
with strategy.scope():
    # Use a floating-point dtype and MEAN aggregation
    var = tf.Variable(initial_value=0.0, dtype=tf.float32, aggregation=tf.VariableAggregation.MEAN)
    print("Distributed variable created successfully:", var)

In [29]:

    input_q1 = Input(shape=(max_len,))
    input_q2 = Input(shape=(max_len,))

    # Shared embedding layer (frozen)
    embedding_layer = Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )

# Embedding layers for both questions
    embedded_q1 = embedding_layer(input_q1)
    embedded_q2 = embedding_layer(input_q2)

# LSTM layers
    lstm_q1 = Bidirectional(LSTM(64, return_sequences=True))(embedded_q1)
    lstm_q2 = Bidirectional(LSTM(64, return_sequences=True))(embedded_q2)

# Attention mechanism
    attention_q1 = GlobalMaxPooling1D()(lstm_q1)
    attention_q2 = GlobalMaxPooling1D()(lstm_q2)

# Combine features
    combined = Concatenate()([attention_q1, attention_q2])

# Dense layers
    dense = Dense(128, activation="relu")(combined)
    dropout = Dropout(0.3)(dense)
    output = Dense(1, activation="sigmoid")(dropout)

    # Create the model
    model = Model(inputs=[input_q1, input_q2], outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])




model.summary()



In [None]:
X_train.shape

In [31]:
from tensorflow.keras.callbacks import EarlyStopping

In [32]:

early_stopping = EarlyStopping(
    monitor='val_loss',    # Monitor validation loss
    patience=5,            # Stop training after 5 epochs with no improvement
    restore_best_weights=True  # Restore the weights of the best epoch
)

# Train the model with early stopping.
# NOTE(review): validation_data is the held-out test split, so that split also
# drives early stopping — consider carving a validation set out of the
# training data instead.
history = model.fit(
    [x1_train[0], x1_train[1]],
    y_train,
    validation_data=([x1_test[0], x1_test[1]], y_test),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
)
histoty = history  # alias kept for code that still uses the original misspelled name

Epoch 1/100
[1m7464/7464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 27ms/step - accuracy: 0.6966 - loss: 0.5686 - val_accuracy: 0.7576 - val_loss: 0.4886
Epoch 2/100
[1m7464/7464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 28ms/step - accuracy: 0.8094 - loss: 0.4048 - val_accuracy: 0.7785 - val_loss: 0.4590
Epoch 4/100
[1m7464/7464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 28ms/step - accuracy: 0.8739 - loss: 0.2843 - val_accuracy: 0.7850 - val_loss: 0.4986
Epoch 7/100
[1m7464/7464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 28ms/step - accuracy: 0.8905 - loss: 0.2526 - val_accuracy: 0.7851 - val_loss: 0.5185
Epoch 8/100
[1m7464/7464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 28ms/step - accuracy: 0.9107 - loss: 0.2115 - val_accuracy: 0.7864 - val_loss: 0.5742
