In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import pandas as pd

# List of required columns
required_columns = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

# Load the CSV
df = pd.read_csv(
    'q_quora.csv',
    usecols=required_columns,
    dtype={'id': int, 'qid1': int, 'qid2': int, 'question1': str, 'question2': str, 'is_duplicate': str},
    on_bad_lines='skip',         # skips badly formatted lines (Pandas 1.3+)
    low_memory=False
)


In [5]:
df.shape

(404351, 6)

In [7]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
new_df = df.sample(30000)

In [11]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [13]:
new_df = df.dropna(subset=['question1','question2'])

In [15]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [13]:
new_df.duplicated().sum()

0

In [15]:
ques_df = new_df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
375980,"What are good, safe, and cost effective places...",What places can a single woman travel to alone?
138295,Are black women discouraged when they are dubb...,Are Indian men attracted to black women?
324398,What are the benefits of reciting Hanuman Chal...,Why should we read Hanuman chalisa? What is th...
242878,What in your opinion is the best thing Jackie ...,Haw many sexual assaults did Clinton accumulate?
158503,What are some of the weaknesses of the Article...,What were the weaknesses of the Articles of Co...


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])
cv = CountVectorizer(max_features=3000)
q1_arr,q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [19]:
temp_df1 = pd.DataFrame(q1_arr, index=ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index=ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(30000, 6000)

In [21]:
temp_df['is_duplicate'] = df['is_duplicate']

In [23]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
375980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
138295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
324398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
242878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
335222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
166491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
335975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    temp_df.iloc[:, :-1].values,
    temp_df.iloc[:, -1].values,
    test_size=0.2,
    random_state=42
)


In [31]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.7348333333333333