In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K

In [38]:
df = pd.read_csv('/content/questions.csv')
df.dropna(inplace=True)

In [39]:
# df = df_data.sample(frac=0.5, random_state=42).reset_index(drop=True)

In [41]:
#preprocessing
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download stopwords once
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Clean function
def preprocess(text):
    text = str(text).lower()                                  # lowercase
    text = re.sub(r"[^a-z0-9\s]", " ", text)                  # remove punctuation/special chars
    text = re.sub(r"\s+", " ", text).strip()                 # remove extra whitespace
    words = text.split()
    words = [stemmer.stem(w) for w in words if w not in stop_words]  # remove stopwords and stem
    return " ".join(words)

# Apply to both columns
df['question1'] = df['question1'].apply(preprocess)
df['question2'] = df['question2'].apply(preprocess)

# Tokenization
tokenizer = Tokenizer(num_words=40000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['question1'].tolist() + df['question2'].tolist())

q1_seq = tokenizer.texts_to_sequences(df['question1'].tolist())
q2_seq = tokenizer.texts_to_sequences(df['question2'].tolist())

# Dynamically determine max_len (95th percentile length)
all_lens = [len(x) for x in q1_seq + q2_seq]
max_len = int(np.percentile(all_lens, 95))  # for example, 95% of samples fall under this length

# Padding
q1_pad = pad_sequences(q1_seq, maxlen=max_len, padding='post')
q2_pad = pad_sequences(q2_seq, maxlen=max_len, padding='post')

# Combine as final input
X = np.hstack((q1_pad, q2_pad))
y = df['is_duplicate'].values


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(64, return_sequences=False, kernel_regularizer=regularizers.l2(0.01))))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.6))
model.add(Dense(1, activation='sigmoid'))
model.build(input_shape=(None, 100))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [44]:
from tensorflow.keras.callbacks import EarlyStopping

In [45]:
early_stop = EarlyStopping(
    monitor='val_accuracy',    # or 'val_accuracy'
    patience=2,            # stop after 2 epochs with no improvement
    restore_best_weights=True
)


In [46]:
history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/30
[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 10ms/step - accuracy: 0.7169 - loss: 0.8194 - val_accuracy: 0.7585 - val_loss: 0.4972
Epoch 2/30
[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.7697 - loss: 0.4841 - val_accuracy: 0.7589 - val_loss: 0.4889
Epoch 3/30
[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.7858 - loss: 0.4561 - val_accuracy: 0.7711 - val_loss: 0.4824
Epoch 4/30
[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 9ms/step - accuracy: 0.8016 - loss: 0.4311 - val_accuracy: 0.7741 - val_loss: 0.4732
Epoch 5/30
[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.8138 - loss: 0.4108 - val_accuracy: 0.7803 - val_loss: 0.4694
Epoch 6/30
[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - accuracy: 0.8242 - loss: 0.3943 - val_accuracy: 0.7797 - val_loss: 0.4737
Epoch 7/3

In [47]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

[1m2528/2528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.7812 - loss: 0.4748
Test Accuracy: 0.7816


In [49]:
model.save("bilstm_duplicate_model.h5")



In [50]:
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [51]:
from google.colab import files
files.download("bilstm_duplicate_model.h5")
files.download("tokenizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [60]:
# Step 1: Import Required Libraries
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 2: Set max sequence length (must match training)
MAX_LEN = 30

# Step 3: Load Trained Model and Tokenizer
model = load_model("/content/bilstm_duplicate_model.h5")

with open("/content/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Step 4: Define Prediction Function
def predict_duplicate(q1, q2):
    # Step 4.1: Tokenize
    seq1 = tokenizer.texts_to_sequences([q1])
    seq2 = tokenizer.texts_to_sequences([q2])

    # Step 4.2: Pad sequences
    pad1 = pad_sequences(seq1, maxlen=MAX_LEN, padding='post')
    pad2 = pad_sequences(seq2, maxlen=MAX_LEN, padding='post')

    # Step 4.3: Combine side-by-side like training input
    combined = np.hstack((pad1, pad2)).astype(np.float32)

    # Step 4.4: Predict
    pred = model.predict(combined)[0][0]

    # Step 4.5: Format Result
    result = "Duplicate" if pred > 0.5 else "Not Duplicate"
    print(f"Prediction Score: {pred:.4f} → {result}")
    return result

# Step 5: Test the Function
q1 = " How do I start a startup in India?"
q2 = "What are the government regulations for new startups?"

predict_duplicate(q1, q2)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step
Prediction Score: 0.0006 → Not Duplicate


'Not Duplicate'

In [54]:
import os
print(os.path.exists("/content/bilstm_duplicate_model.h5"))  # Should print: True
print(os.path.exists("/content/tokenizer.pkl"))              # Should print: True


True
True
