In [14]:
from functools import lru_cache

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import wordnet
from nltk.tokenize import word_tokenize as wt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [15]:
# The notebook only reads `token.lemma_` from this pipeline, so the dependency
# parser and the named-entity recogniser are disabled: they do not feed the
# lemmatizer and account for a large share of the per-document cost.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# NOTE(review): this WordNet lemmatizer is not referenced by any visible cell;
# kept in case other code relies on the name.
lemmatizer = WordNetLemmatizer()

In [16]:
# Raw question pairs as read from disk; left untouched so later cells can
# always go back to the original data.
df_qs = pd.read_csv('questions.csv', header=0)

# Work on a 100k-row subsample. The seed is fixed so that Restart & Run All
# draws the same rows (and therefore the same features and scores) every time.
# `drop` returns a new frame instead of mutating in place, which keeps this
# cell idempotent.
new_qs_df = df_qs.sample(100000, random_state=1).drop(columns=['id'])

In [17]:
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with NLTK's ``word_tokenize``; a token is removed
    when its lower-cased form is in NLTK's English stop-word list. Surviving
    tokens keep their original casing and are re-joined with single spaces.

    Parameters
    ----------
    text : str
        The sentence to filter.

    Returns
    -------
    str
        The filtered sentence (empty string if every token was a stop word).
    """
    # The stop-word set is cached: rebuilding it from the corpus on every call
    # is wasted work when this function is applied row by row.
    stop_words = _english_stopwords()
    return ' '.join(word for word in wt(text) if word.lower() not in stop_words)

In [18]:
# One-pass equivalent of the former chain of `.replace()` calls: none of the
# replacement strings contains a character that is itself replaced, so a single
# translation gives the same result.
_PUNCT_TABLE = str.maketrans({
    '.': '', ',': '', "'": '', '"': '', '?': '',  # dropped outright
    '-': ' ',                                     # hyphen becomes a word break
    '&': 'and',                                   # spell out the ampersand
})


def _normalize_punctuation(text):
    """Strip or replace punctuation in one question.

    Missing values (NaN/None) become an empty string; a plain ``str(x)`` would
    turn them into the literal word ``"nan"`` and leak a bogus token into every
    downstream feature.
    """
    if pd.isna(text):
        return ''
    return str(text).translate(_PUNCT_TABLE)


new_qs_df['question1'] = new_qs_df['question1'].apply(_normalize_punctuation)
new_qs_df['question2'] = new_qs_df['question2'].apply(_normalize_punctuation)

In [19]:
# Filter English stop words out of both question columns, element by element.
new_qs_df['question1'] = new_qs_df['question1'].map(remove_stopwords)
new_qs_df['question2'] = new_qs_df['question2'].map(remove_stopwords)

In [20]:
# Preview the first five cleaned rows (`head` defaults to n=5).
new_qs_df.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
15347,30634,30635,people QUORA ask questions easily findout Google,dont Quora people look answer Google,1
112052,222174,222175,time travel anyhow,time travel possible next 10 years,1
169380,334723,334724,requirements become President United States re...,requirements become president United States re...,0
165951,328009,328010,copilotsearchcom,Yellowlegcom,0
392998,768078,768079,future mental health treatment,ways mental health professions change next 20 ...,1


<h1>Feature Engineering</h1>
<h3>New Features</h3>
<ul>
<li>Total characters ✅</li>
<li>Total words ✅</li>
<li>Common word count ✅</li>
<li>Difference in sentence character length ✅</li>
<li>Difference in number of words ✅</li>
</ul>


In [21]:
def total_characters(text):
    """Return the length of ``text`` in characters (whitespace included)."""
    char_count = len(text)
    return char_count
def total_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)
def common_words(text):
    """Count distinct whitespace-separated words shared by both questions.

    ``text`` is a row-like mapping with ``'question1'`` and ``'question2'``
    string entries. Repeated words count once; matching is case-sensitive.
    """
    first_vocab = set(text['question1'].split())
    second_vocab = set(text['question2'].split())
    return len(first_vocab.intersection(second_vocab))
def chara_len_diff(text):
    """Absolute difference in token-character counts between the two questions.

    Each question (``text['question1']`` / ``text['question2']``) is split with
    ``word_tokenize`` and the lengths of its tokens are summed, so separators
    between tokens do not contribute to the count.
    """
    first_len = sum(len(token) for token in wt(text['question1']))
    second_len = sum(len(token) for token in wt(text['question2']))
    return abs(first_len - second_len)
def total_word_count_diff(text):
    """Absolute difference in word count between the two questions.

    ``text`` is a row-like mapping with ``'question1'`` and ``'question2'``
    string entries. Words are whitespace-separated tokens, the same definition
    used by ``total_words`` and ``common_words``, so the result always equals
    ``abs(total_words_q1 - total_words_q2)``. Counting with ``word_tokenize``
    instead would split residual punctuation into extra "words" and make this
    feature disagree with the per-question counts.
    """
    first_count = len(text['question1'].split())
    second_count = len(text['question2'].split())
    return abs(first_count - second_count)
# print(common_words(new_qs_df[243945]))
# print(total_words(new_qs_df['question1'][152007]),' ',total_words(new_qs_df['question2'][152007]))


In [22]:
# x=new_qs_df.sample(5)
# x['word diff cnt']=x.apply(lambda x:total_word_count_diff(x),axis=1)
# x[['question1','question2','word diff cnt']]

In [25]:
# Lower-case both question columns so later word comparisons ignore case.
for question_col in ('question1', 'question2'):
    new_qs_df[question_col] = new_qs_df[question_col].str.lower()

In [26]:
# Per-question features: computed element-wise on each text column.
new_qs_df['total_characters_q1'] = new_qs_df['question1'].apply(total_characters)
new_qs_df['total_characters_q2'] = new_qs_df['question2'].apply(total_characters)
new_qs_df['total_words_q1'] = new_qs_df['question1'].apply(total_words)
new_qs_df['total_words_q2'] = new_qs_df['question2'].apply(total_words)

# Pair features: each helper receives a whole row and reads both questions.
new_qs_df['total_common_words'] = new_qs_df.apply(common_words, axis='columns')
new_qs_df['chara_len_diff'] = new_qs_df.apply(chara_len_diff, axis='columns')
new_qs_df['word_diff_count'] = new_qs_df.apply(total_word_count_diff, axis='columns')

In [27]:
# Display the frame, which now carries the engineered feature columns, as the
# cell's output (the notebook shows a truncated head/tail view).
new_qs_df

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate,total_characters_q1,total_characters_q2,total_words_q1,total_words_q2,total_common_words,chara_len_diff,word_diff_count
15347,30634,30635,people quora ask questions easily findout google,dont quora people look answer google,1,48,36,7,6,3,11,1
112052,222174,222175,time travel anyhow,time travel possible next 10 years,1,18,34,3,6,2,13,3
169380,334723,334724,requirements become president united states re...,requirements become president united states re...,0,73,78,8,8,6,5,0
165951,328009,328010,copilotsearchcom,yellowlegcom,0,16,12,1,1,0,4,0
392998,768078,768079,future mental health treatment,ways mental health professions change next 20 ...,1,30,51,4,8,2,17,4
...,...,...,...,...,...,...,...,...,...,...,...,...
222812,439173,439174,get list gmail accounts phone march 2015,get list gmail account phone march 2015,1,40,39,7,7,6,1,0
35732,71197,71198,china help nepal,help nepal,0,16,10,3,2,2,5,1
109585,217309,217310,cold gobi desert get average temperatures comp...,cold gobi desert get average temperatures comp...,1,68,73,10,11,8,4,1
312023,612460,612461,families inter religion marriage failed convin...,done families inter religion marriage failed c...,1,57,62,7,8,6,4,1


In [28]:
def lemma(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    The text is run through the module-level ``nlp`` pipeline and the tokens'
    ``lemma_`` values are joined with single spaces.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc)

In [29]:
# Lemmatize both question columns, one question at a time.
new_qs_df['question1'] = new_qs_df['question1'].map(lemma)
new_qs_df['question2'] = new_qs_df['question2'].map(lemma)

In [None]:
# new_qs_df['question1']=new_qs_df['question1'].apply(lambda x: wt(x))
# new_qs_df['question2']=new_qs_df['question2'].apply(lambda x: wt(x))

In [30]:
# Single corpus for the vectorizer: every question1 first, then every question2
# (same row order in both halves).
ques = [*new_qs_df['question1'], *new_qs_df['question2']]

In [31]:
# Bag-of-words counts over a vocabulary capped at 3000 features, fitted on both
# question columns together so they share one vocabulary.
cv = CountVectorizer(max_features=3000)
bow_counts = cv.fit_transform(ques).toarray()

# `ques` is question1 followed by question2, so splitting the rows into two
# equal halves recovers one count matrix per question column.
q1_ar, q2_ar = np.vsplit(bow_counts, 2)


In [32]:
# Wrap each count matrix in a frame that reuses the sampled rows' index, so the
# later column-wise concat with the feature frame aligns row for row.
tdf1, tdf2 = (
    pd.DataFrame(counts, index=new_qs_df.index) for counts in (q1_ar, q2_ar)
)
# Side by side: question1 counts first, then question2 counts.
tdf = pd.concat([tdf1, tdf2], axis='columns')

In [33]:
# Keep only the label and the numeric features: the id columns and the raw
# question text are not model inputs.
final_df = new_qs_df.drop(['qid1', 'qid2', 'question1', 'question2'], axis=1)

In [None]:
# tdf.shape

In [34]:
# Append the bag-of-words columns to the label and hand-crafted features.
final_df = pd.concat((final_df, tdf), axis='columns')
final_df.shape

(100000, 6008)

In [None]:
# final_df.sample(5)

In [35]:
# Column 0 is used as the target and every remaining column as a feature.
# NOTE(review): this relies on `is_duplicate` being the first column of
# `final_df` - confirm if the column layout changes.
features = final_df.iloc[:, 1:].values
labels = final_df.iloc[:, 0].values

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=1
)

In [36]:
# Seeded like the train/test split above: without `random_state` the forest's
# bootstrap samples and feature subsets differ on every run, so the reported
# accuracy could not be reproduced.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Accuracy on the held-out 30%; as the last expression it is the cell output.
accuracy_score(y_test, y_pred)

0.7828333333333334

In [37]:

# Gradient-boosted trees with default hyper-parameters, trained and scored on
# the same split as the random forest.
xgb = XGBClassifier().fit(X_train, y_train)
y_pred = xgb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7661

In [38]:
import pickle 


In [40]:
# Persist both trained classifiers.
with open('Quora_rfmodel.pkl', 'wb') as f:
    pickle.dump(rf, f)
with open('Quora_XGBmodel.pkl', 'wb') as f:
    pickle.dump(xgb, f)

# Persist the fitted vectorizer too: the models were trained on columns derived
# from its vocabulary, so new questions can only be turned into a matching
# feature vector with this exact object.
with open('Quora_count_vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)