In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/keshavmittal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/keshavmittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/keshavmittal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
data = {
    'question1': [
        'What is the best way to learn Python?',
        'How can I improve my coding skills?',
        'How to start learning Python programming?',
        'What is machine learning?',
        'How to cook pasta?',
        'What is the capital of France?',
        'How does a neural network work?'
    ],
    'question2': [
        'What’s the best method to learn Python programming?',
        'How do I get better at coding?',
        'What are the steps to begin Python coding?',
        'What does machine learning mean?',
        'How to make pasta dishes?',
        'What is the capital city of Brazil?',
        'How to cook Italian pasta?'
    ],
    'is_duplicate': [1, 1, 1, 1, 1, 0, 0]  # Added non-duplicate pairs (0)
}

df = pd.DataFrame(data)

In [35]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [37]:
# Apply preprocessing
df['question1_clean'] = df['question1'].apply(preprocess_text)
df['question2_clean'] = df['question2'].apply(preprocess_text)

In [38]:
# Feature extraction using TF-IDF
# Combine questions for vectorization
all_questions = pd.concat([df['question1_clean'], df['question2_clean']])

In [39]:
# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(all_questions)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [40]:
# Transform questions
q1_tfidf = tfidf.transform(df['question1_clean'])
q2_tfidf = tfidf.transform(df['question2_clean'])

In [41]:
# Compute difference and product of TF-IDF vectors
X = np.abs(q1_tfidf - q2_tfidf)
X = X.multiply(q1_tfidf.multiply(q2_tfidf)).toarray()

In [42]:
# Labels
y = df['is_duplicate'].values

In [43]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [45]:
# Predict and evaluate
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('\nClassification Report:\n', classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [46]:
# Function to predict if two questions are duplicates
def predict_duplicate(q1, q2):
    q1_clean = preprocess_text(q1)
    q2_clean = preprocess_text(q2)
    q1_vec = tfidf.transform([q1_clean])
    q2_vec = tfidf.transform([q2_clean])
    features = np.abs(q1_vec - q2_vec)
    features = features.multiply(q1_vec.multiply(q2_vec)).toarray()
    return model.predict(features)[0]

In [47]:
# Example usage
q1 = 'How to learn Python quickly?'
q2 = 'What’s the fastest way to learn Python?'
result = predict_duplicate(q1, q2)
print(f'Questions: "{q1}" and "{q2}"')
print('Duplicate' if result == 1 else 'Not Duplicate')

Questions: "How to learn Python quickly?" and "What’s the fastest way to learn Python?"
Duplicate
