In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

## About this file
- There are over 400,000 lines of potential question duplicate pairs. Each line contains IDs for each question in the pair, the full text for each question, and a binary value that indicates whether the line truly contains a duplicate pair.
 - yes(1) 
 - no(0)

In [2]:
task = pd.read_csv('questions.csv')

In [3]:
task.shape

(404351, 6)

In [4]:
task.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [5]:
task_new = task[0:100]

Keeping low records

In [6]:
task_new.shape

(100, 6)

In [7]:
task_new.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
task_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            100 non-null    int64 
 1   qid1          100 non-null    int64 
 2   qid2          100 non-null    int64 
 3   question1     100 non-null    object
 4   question2     100 non-null    object
 5   is_duplicate  100 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 4.8+ KB


- No null values.
- RangeIndex: 100 entries, 0 to 99
- Datatypes can also be observed 

In [9]:
task_new.duplicated().sum()

0

No duplicated rows in the dataset

In [10]:
print(task_new['is_duplicate'].value_counts())
print("\n")
print(task_new['is_duplicate'].value_counts()/task_new['is_duplicate'].value_counts().count()*100)

0    65
1    35
Name: is_duplicate, dtype: int64


0    3250.0
1    1750.0
Name: is_duplicate, dtype: float64


In [11]:
task_new.drop(['id','qid1','qid2'], axis =1, inplace=True)

In [12]:
task_new['question1'] = task_new['question1'].str.lower()
task_new['question2'] = task_new['question2'].str.lower()

In [13]:
stop_word = set(stopwords.words('english'))
task_new["question1"] = task_new["question1"].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_word))
task_new["question2"] = task_new["question2"].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_word))

In [14]:
task_new['question1'] = task_new['question1'].str.replace('\d+','')
task_new['question2'] = task_new['question2'].str.replace('\d+','')
task_new['question1'] = task_new['question1'].str.replace(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*','')
task_new['question2'] = task_new['question2'].str.replace(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*','')

In [15]:
task_new.head()

Unnamed: 0,question1,question2,is_duplicate
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,mentally lonely solve it,find remainder mathmath divided,0
4,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0


In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [17]:
questions = list(task_new['question1']) + list(task_new['question2'])

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

Creating a final data frame

In [19]:
new_data1 = pd.DataFrame(q1_arr, index=task_new.index)
new_data2 = pd.DataFrame(q2_arr, index=task_new.index)
new_data = pd.concat([new_data1, new_data2], axis=1)

In [20]:
new_data['is_duplicate'] = task_new['is_duplicate']

In [21]:
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,is_duplicate
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [22]:
x = new_data.iloc[:,0:-1]

In [23]:
y = new_data.iloc[:,-1]

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [25]:
model1 = MultinomialNB()
model2 = RandomForestClassifier()

In [26]:
model1.fit(x_train,y_train)
predict = model1.predict(x_test)

In [27]:
print("Final score:- ", accuracy_score(y_test,predict))

Final score:-  0.3


In [28]:
model2.fit(x_train,y_train)
predict1 = model2.predict(x_test)

In [29]:
print("Final score for Random Forest:- ", accuracy_score(y_test,predict1))

Final score for Random Forest:-  0.75


In [30]:
print("Classification Report:- \n", classification_report(y_test,predict1))

Classification Report:- 
               precision    recall  f1-score   support

           0       0.75      1.00      0.86        15
           1       0.00      0.00      0.00         5

    accuracy                           0.75        20
   macro avg       0.38      0.50      0.43        20
weighted avg       0.56      0.75      0.64        20



In [31]:
print("confusion matrix:- \n", confusion_matrix(y_test,predict))

confusion matrix:- 
 [[ 3 12]
 [ 2  3]]


In [32]:
print("confusion matrix:- \n", confusion_matrix(y_test,predict1))

confusion matrix:- 
 [[15  0]
 [ 5  0]]
