In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bs4 import BeautifulSoup
import shutil
import gensim
import re
import spacy
from gensim.models import Word2Vec

In [4]:
train_data = pd.read_csv('../input/quora-question-pairs/train.csv.zip')
train_data.head()

In [5]:
train_data.shape

In [6]:
# Drop rows with a missing question *before* sampling. Otherwise a null question can
# land in the sample and preprocess() turns it into the literal text "nan".
sample_data = train_data.dropna().sample(25000, random_state=2)
sample_data.head()

In [7]:
sample_data.isnull().sum()

In [8]:
train_data.isnull().sum()

In [9]:
train_data = train_data.dropna().reset_index(drop=True)
train_data.isnull().sum()

In [10]:
q1 = ''.join(train_data['question1'])
q2 = ''.join(train_data['question2'])

In [11]:
# q1 / q2 are the concatenated question1 / question2 columns; label each count with the
# column and the tag it actually refers to.
print('Number of occurrences of [/math] in question1:', q1.count('[/math]'))
print('Number of occurrences of [math] in question1:', q1.count('[math]'))
print('Number of occurrences of [/math] in question2:', q2.count('[/math]'))
print('Number of occurrences of [math] in question2:', q2.count('[math]'))

In [12]:
sample_data.loc[sample_data['is_duplicate'].eq(0)].head()  # non-duplicate pairs

In [13]:
sample_data[sample_data['is_duplicate']==1].head() #duplicate

In [14]:
print(round(sample_data.is_duplicate.value_counts()/len(sample_data)*100))
sample_data.is_duplicate.value_counts().plot(kind='bar')

#### Label '1' (about 37% of the pairs): the two questions are duplicates.
#### Label '0' (about 63% of the pairs): the two questions are not duplicates.

In [15]:
# Repeated questions

# Pool the ids from both sides of every pair into one Series.
qid = pd.Series([*sample_data['qid1'].tolist(), *sample_data['qid2'].tolist()])
print('Number of unique questions', len(np.unique(qid)))
# Boolean mask over distinct ids: True where the id occurs more than once.
x = qid.value_counts().gt(1)
print('Number of questions getting repeated', len(x[x]))

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns

# Repeated questions histogram

plt.hist(qid.value_counts().values,bins=160)
plt.yscale('log')
plt.show()

## Data Preprocessing and Cleaning

In [17]:
# Contraction -> expansion table, built once instead of on every preprocess() call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Longest keys first so "can't've" is tried before "can't". The look-arounds keep the
# match on whole words while still allowing adjacent punctuation ("won't?", "(it's)"),
# which a plain whitespace-token lookup misses.
_CONTRACTION_RE = re.compile(
    r"(?<![\w'])(?:"
    + "|".join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r")(?![\w'])"
)

# Raw string: '\W' in a normal string literal is an invalid escape sequence.
_NON_WORD_RE = re.compile(r'\W')

# Applied in order with str.replace on the lower-cased text.
_SYMBOL_REPLACEMENTS = (
    ('%', ' percent '),
    ('$', ' dollar '),
    ('₹', ' rupee '),
    ('€', ' euro '),
    ('@', ' at '),
    ('#', ''),
    ('u.s.', 'usa'),
    # Both the opening and the closing math tag are markup, not content.
    ('[/math]', ''),
    ('[math]', ''),
)


def preprocess(data):
    """Normalise one raw question into lower-case, punctuation-free text.

    Steps, in order: lower-case and strip; spell out currency/percent/at symbols and
    drop '#' and the [math] tags; expand English contractions; strip HTML markup;
    replace every non-word character with a space; collapse whitespace.

    Parameters
    ----------
    data : object
        The raw question. It is passed through ``str()``, so non-string values
        (including NaN) are converted to their string form rather than rejected.

    Returns
    -------
    str
        The cleaned question with tokens separated by single spaces.
    """
    data = str(data).lower().strip()

    # A typographic apostrophe would hide contractions from the lookup below.
    data = data.replace('’', "'")

    # Replace special characters with their string equivalents.
    for symbol, replacement in _SYMBOL_REPLACEMENTS:
        data = data.replace(symbol, replacement)

    # Decontract known words, then fall back to generic suffix expansion for anything
    # the table does not list.
    data = _CONTRACTION_RE.sub(lambda match: _CONTRACTIONS[match.group(0)], data)
    data = data.replace("'ve", " have")
    data = data.replace("n't", " not")
    data = data.replace("'re", " are")
    data = data.replace("'ll", " will")

    # Remove HTML tags. The parser is named explicitly so the result does not depend
    # on which optional parsers happen to be installed.
    data = BeautifulSoup(data, 'html.parser').get_text()

    # Remove punctuation, then collapse the runs of spaces this leaves behind so that
    # later split(" ") based word counts are not inflated by empty tokens.
    data = _NON_WORD_RE.sub(' ', data)
    return ' '.join(data.split())

In [18]:
sample_data['question1'] = sample_data['question1'].apply(preprocess)
sample_data['question2'] = sample_data['question2'].apply(preprocess)

In [19]:
sample_data.head()

## Tokenization

In [20]:
tokenized_question1 = sample_data['question1'].apply(lambda x: x.split()) #tokenizing
tokenized_question1.head()

In [21]:
tokenized_question2 = sample_data['question2'].apply(lambda x: x.split()) #tokenizing
tokenized_question2.head()

## Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmaObj = WordNetLemmatizer()

In [23]:
lemma_tokenized_q1 = tokenized_question1.apply(lambda x: [lemmaObj.lemmatize(word,pos='v') for word in x])
lemma_tokenized_q1.head()

In [24]:
lemma_tokenized_q2 = tokenized_question2.apply(lambda x: [lemmaObj.lemmatize(word,pos='v') for word in x])
lemma_tokenized_q2.head()

In [25]:
sample_data['lemma_q1'] = lemma_tokenized_q1
sample_data['lemma_q1'] = sample_data['lemma_q1'].apply(lambda x: ' '.join(x))

In [26]:
sample_data['lemma_q2'] = lemma_tokenized_q2
sample_data['lemma_q2'] = sample_data['lemma_q2'].apply(lambda x: ' '.join(x))

In [27]:
sample_data.head()

In [28]:
new_df = sample_data.drop(['question1','question2'],axis=1)

In [29]:
new_df.head()

## Adding New Features
1. q1 length
2. q2 length
3. q1_num_words
4. q2_num_words
5. common words
6. total words
7. words share - common words/total words

In [30]:
new_df['q1_len'] = new_df['lemma_q1'].str.len()
new_df['q2_len'] = new_df['lemma_q2'].str.len()

In [31]:
new_df.head()

In [32]:
new_df['q1_num_words'] = new_df['lemma_q1'].apply(lambda row: len(row.split(" ")))
new_df['q2_num_words'] = new_df['lemma_q2'].apply(lambda row: len(row.split(" ")))

In [33]:
new_df.head()

In [34]:
def common_words(row):
    """Return how many distinct tokens the row's two lemmatised questions share.

    Tokens are obtained by splitting ``row['lemma_q1']`` and ``row['lemma_q2']`` on a
    single space, then lower-casing and stripping each piece.
    """
    first = {token.lower().strip() for token in row['lemma_q1'].split(" ")}
    second = {token.lower().strip() for token in row['lemma_q2'].split(" ")}
    return len(first.intersection(second))

In [35]:
new_df['common_words'] = new_df.apply(common_words,axis=1)
new_df.head()

In [36]:
def total_words(row):
    """Return the number of distinct tokens in question 1 plus that of question 2.

    Each question is split on a single space, and the pieces are lower-cased and
    stripped before de-duplication. Tokens present in both questions count twice.
    """
    distinct_counts = [
        len({token.lower().strip() for token in row[column].split(" ")})
        for column in ('lemma_q1', 'lemma_q2')
    ]
    return sum(distinct_counts)

In [37]:
new_df['total_words'] = new_df.apply(total_words,axis=1)
new_df.head()

In [38]:
new_df['word_share'] = round(new_df['common_words']/new_df['total_words'],2)
new_df.head()

In [39]:
# Analysis of features
sns.displot(new_df['q1_len'])
print('minimum characters',new_df['q1_len'].min())
print('maximum characters',new_df['q1_len'].max())
print('average num of characters',int(new_df['q1_len'].mean()))

In [40]:
# common words
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its
# documented replacement.
for label_value, label_name in ((0, 'non duplicate'), (1, 'duplicate')):
    sns.histplot(new_df.loc[new_df['is_duplicate'] == label_value, 'common_words'],
                 kde=True, stat='density', label=label_name)
plt.legend()
plt.show()

In [41]:
# total words
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its
# documented replacement.
for label_value, label_name in ((0, 'non duplicate'), (1, 'duplicate')):
    sns.histplot(new_df.loc[new_df['is_duplicate'] == label_value, 'total_words'],
                 kde=True, stat='density', label=label_name)
plt.legend()
plt.show()

In [42]:
# word share
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its
# documented replacement.
for label_value, label_name in ((0, 'non duplicate'), (1, 'duplicate')):
    sns.histplot(new_df.loc[new_df['is_duplicate'] == label_value, 'word_share'],
                 kde=True, stat='density', label=label_name)
plt.legend()
plt.show()

## BOW - CountVectorizer

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(new_df['lemma_q1']) + list(new_df['lemma_q2'])

cv = CountVectorizer(max_features=3000,stop_words='english')
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [44]:
# Give every vector column a unique string label. The default integer labels repeat
# across the q1/q2 halves and mix int with str once joined to the named feature columns.
q1_arr_df = pd.DataFrame(q1_arr, index=new_df.index,
                         columns=['bow_q1_' + str(i) for i in range(q1_arr.shape[1])])
q2_arr_df = pd.DataFrame(q2_arr, index=new_df.index,
                         columns=['bow_q2_' + str(i) for i in range(q2_arr.shape[1])])
print(q1_arr_df.shape,q2_arr_df.shape)

In [45]:
questions_vectors_df = pd.concat([q1_arr_df,q2_arr_df],axis=1)
questions_vectors_df.shape

In [46]:
questions_vectors_df.head()

## TFIDF

In [47]:
tfidf_cv = TfidfVectorizer(max_features=3000,stop_words='english')
tf_q1_arr, tf_q2_arr = np.vsplit(tfidf_cv.fit_transform(questions).toarray(),2)

In [48]:
# Give every vector column a unique string label. With the default integer labels the
# final training frame would mix int and str column names (and repeat 0..N-1 for both
# questions); recent scikit-learn versions reject such a frame when fitting.
tf_q1_arr_df = pd.DataFrame(tf_q1_arr, index=new_df.index,
                            columns=['tfidf_q1_' + str(i) for i in range(tf_q1_arr.shape[1])])
tf_q2_arr_df = pd.DataFrame(tf_q2_arr, index=new_df.index,
                            columns=['tfidf_q2_' + str(i) for i in range(tf_q2_arr.shape[1])])
print(tf_q1_arr_df.shape,tf_q2_arr_df.shape)

In [49]:
questions_tf_vectors_df = pd.concat([tf_q1_arr_df,tf_q2_arr_df],axis=1)
questions_tf_vectors_df.shape

In [50]:
questions_tf_vectors_df.head()

## Drop Unnecessary Features and Concatenate the Data

In [51]:
final_df = new_df.drop(columns=['id','qid1','qid2','lemma_q1','lemma_q2'])
print(final_df.shape)
final_df.head()

In [52]:
tfidf_final_df = pd.concat([final_df, questions_tf_vectors_df], axis=1)
count_final_df = pd.concat([final_df, questions_vectors_df], axis=1)

In [53]:
tfidf_final_df.head()

In [54]:
# Select the target by name rather than by position, so a change in column order
# cannot silently swap a feature in as the label.
X = tfidf_final_df.drop(columns=['is_duplicate'])
y = tfidf_final_df['is_duplicate']

## Model Building

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [58]:
X.shape

In [60]:
y.shape

In [61]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

## Random Forest 

In [63]:
# Fixed seed so the cross-validation and test scores are reproducible on a re-run.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train,y_train)
# Mean 5-fold accuracy on the training split (cross_val_score refits clones of rf).
np.mean(cross_val_score(rf,X_train,y_train,scoring = 'accuracy', cv= 5))

In [64]:
rf.score(X_test,y_test)

In [None]:
y_pred = rf.predict(X_test)

In [70]:
print("Confusion Matrix:")
print(confusion_matrix(y_test,y_pred))

print("\nClassification Report")
print(classification_report(y_test,y_pred))

## Predictions

In [71]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return len(w1 & w2)

In [72]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return (len(w1) + len(w2))

In [73]:
def preprocessing(q1,q2):
    """Build the one-row feature matrix the trained model expects for a question pair.

    The row mirrors the training layout: seven hand-made features (q1 length,
    q2 length, q1 word count, q2 word count, common words, total words, word share)
    followed by the TF-IDF vector of q1 and then of q2.

    Parameters
    ----------
    q1, q2 : str
        The two raw questions to compare.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, 7 + 2 * vocabulary size), suitable for ``rf.predict``.
    """
    def _clean(question):
        # Same steps the training data went through: preprocess, whitespace-tokenise,
        # verb-lemmatise, re-join with single spaces.
        tokens = preprocess(question).split()
        return ' '.join(lemmaObj.lemmatize(token, pos='v') for token in tokens)

    q1 = _clean(q1)
    q2 = _clean(q2)

    # Compute the shared counts once; they feed both the raw features and the ratio.
    common = test_common_words(q1,q2)
    total = test_total_words(q1,q2)

    # Basic features, in the same order as the training columns.
    input_data = [
        len(q1),
        len(q2),
        len(q1.split(" ")),
        len(q2.split(" ")),
        common,
        total,
        round(common/total,2),
    ]

    # The model was fitted on TF-IDF features, so the fitted TfidfVectorizer must be
    # used here; CountVectorizer output is raw counts on a different scale.
    q1_tfidf = tfidf_cv.transform([q1]).toarray()
    q2_tfidf = tfidf_cv.transform([q2]).toarray()

    return np.hstack((np.array(input_data).reshape(1,7),q1_tfidf,q2_tfidf))
    
    
    

In [74]:
q1 = 'Where is the capital of India?'
q2 = 'How to make money from online?'
q3 = 'Which city serves as the capital of India?'
q4 = 'What is the business capital of India?'

In [75]:
rf.predict(preprocessing(q1,q4))

In [78]:
import pickle

# Context managers guarantee the files are flushed and closed once the dump finishes.
with open('model.pkl','wb') as model_file:
    pickle.dump(rf,model_file)
with open('tfidf_cv.pkl','wb') as vectorizer_file:
    pickle.dump(tfidf_cv,vectorizer_file)