In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition input folder and print the full path of every file found in it
for dirname, _, filenames in os.walk('/kaggle/input/quora-question-pairs'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Structure 

In [None]:
# Unzip the input data file
import zipfile

def extract_zip_archives(src_dir, dest_dir):
    """Extract every ``.zip`` archive found under ``src_dir`` into ``dest_dir``.

    Args:
        src_dir: Directory that is searched recursively for ``.zip`` files.
        dest_dir: Directory that receives the extracted archive members.

    Returns:
        list[str]: Paths of the archives that were extracted, in walk order.
    """
    extracted = []
    for folder, _, names in os.walk(src_dir):
        for name in names:
            if not name.endswith(".zip"):
                continue
            archive_path = os.path.join(folder, name)
            # The context manager closes the archive handle even if extraction fails
            with zipfile.ZipFile(archive_path, "r") as archive:
                archive.extractall(dest_dir)
            extracted.append(archive_path)
    return extracted


# The input folder is walked here, so this cell does not depend on variables
# left behind by any other cell.
extracted_archives = extract_zip_archives("/kaggle/input/quora-question-pairs", "/kaggle/working")

In [None]:
# Load the train set from the csv file in the working directory
df = pd.read_csv("/kaggle/working/train.csv")
# (number of rows, number of columns)
df.shape

We can see that the Quora Question Pairs training set consists of 404,290 rows and 6 columns.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Preview the first rows of the training data
df.head()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Keep only the rows in which every field is present, then renumber them from zero
complete_rows = df.notna().all(axis=1)
df = df.loc[complete_rows].reset_index(drop=True)

Let's look at a few sample question pairs:

In [None]:
# Print the first ten question pairs, one question per line, with blank lines after each pair
first_pairs = zip(df['question1'].head(10), df['question2'].head(10))
for first_question, second_question in first_pairs:
    print(first_question)
    print(second_question)
    print("\n")

## Similarities Distributions 

Plot the distribution of duplicate versus non-duplicate question pairs:

In [None]:
# Number of question pairs per value of the `is_duplicate` label
df['is_duplicate'].value_counts()

In [None]:
# Share of each label as a percentage of all rows
label_counts = df['is_duplicate'].value_counts()
label_counts.div(len(df)).mul(100)

In [None]:
import matplotlib.pyplot as plt

# Order the counts by class value rather than by frequency, and look the label and
# colour up per class, so each wedge keeps the right caption whichever class is larger.
class_counts = df['is_duplicate'].value_counts().sort_index()
class_labels = {0: "unique", 1: "duplicate"}
class_colors = {0: 'palevioletred', 1: 'dodgerblue'}

fig, ax = plt.subplots(figsize=(4, 5))
ax.set_title("Distribution of Similar Sentences")
ax.pie(class_counts,
       explode=None,
       labels=[class_labels[c] for c in class_counts.index],
       colors=[class_colors[c] for c in class_counts.index],
       autopct='%1.2f%%',   # percentage printed inside each wedge
       shadow=False,
       startangle=90,
       pctdistance=0.6)

# Equal aspect ratio so the pie is drawn as a circle
ax.axis('equal')
plt.show()

Next, let's plot how question length (in words) relates to whether a pair is a duplicate:

In [None]:
# Number of whitespace-separated words in each question
for source_col, target_col in (('question1', 'len_w_q1'), ('question2', 'len_w_q2')):
    df[target_col] = df[source_col].astype(str).str.split().str.len()

In [None]:
# FuzzyWuzzy: Fuzzy String Matching in Python
from fuzzywuzzy import fuzz

# Both fuzzy scores compare the two questions of a pair as plain strings
q1_text = df['question1'].astype(str)
q2_text = df['question2'].astype(str)

df['fuzz_ratio'] = [fuzz.ratio(a, b) for a, b in zip(q1_text, q2_text)]
df['fuzz_partial_token_set_ratio'] = [
    fuzz.partial_token_set_ratio(a, b) for a, b in zip(q1_text, q2_text)
]

In [None]:
def count_common_words(row):
    """Return how many distinct lower-cased words occur in both questions of ``row``."""
    q1_words = set(str(row['question1']).lower().split())
    q2_words = set(str(row['question2']).lower().split())
    return len(q1_words & q2_words)


# Number of words the two questions of each pair have in common
df['common_words'] = df.apply(count_common_words, axis=1)

In [None]:
# Import plotly

import plotly.express as px
# Word-count histogram of question1, split by label, with a box plot in the margin
px.histogram(
    df,
    x="len_w_q1",
    color='is_duplicate',
    marginal="box",
    title="Question1 Length Distribution",
    height=700,
)

In [None]:
# Word-count histogram of question2, split by label, with a box plot in the margin
px.histogram(
    df,
    x="len_w_q2",
    color='is_duplicate',
    marginal="box",
    title="Question2 Length Distribution",
    height=700,
)

As the box plots and histograms show, the length distributions are right-skewed: most questions are short, with a long tail of longer ones. Duplicate pairs are concentrated among the relatively short questions.

## Word Cloud

In [None]:
# Import regex
import re 

def text_cleaning(x):
    """Normalise a question string before it is used for word clouds and embeddings.

    Steps, in order:
      1. replace each run of whitespace that ends in one or more newlines with one space,
      2. replace every character that is not an ASCII letter or digit with a space,
      3. lower-case the result.

    Args:
        x: Raw question text. Non-string values are converted with ``str()`` first.

    Returns:
        str: The cleaned, lower-cased text. Characters are replaced one-for-one,
        so runs of consecutive spaces can remain.
    """
    # Raw string literals: in a plain literal '\s' is an invalid escape sequence
    questions = re.sub(r'\s+\n+', ' ', str(x))
    questions = re.sub(r'[^a-zA-Z0-9]', ' ', questions)
    return questions.lower()

In [None]:
# Import tqdm to show the small progress 
from tqdm import tqdm
# Register `progress_apply` on pandas objects so the cleaning pass shows a progress bar
tqdm.pandas()

# Clean both question columns into new columns, leaving the raw text untouched
for raw_col, cleaned_col in (('question1', 'q1_cleaned'), ('question2', 'q2_cleaned')):
    df[cleaned_col] = df[raw_col].progress_apply(text_cleaning)

In [None]:
# Show cleaned and raw text next to each other for the first rows
df[['q1_cleaned', 'question1', 'q2_cleaned', 'question2']].head()

In [None]:
from wordcloud import WordCloud,STOPWORDS

# Plain Python lists of the cleaned questions (joined into one text per word cloud below)
question1, question2 = (df[column].tolist() for column in ('q1_cleaned', 'q2_cleaned'))

In [None]:
from matplotlib import pyplot as plt

# Join every cleaned question1 into one text and build the cloud from it
q1_corpus = " ".join(question1)
q1_cloud_settings = dict(background_color="white",
                         max_words=1500,
                         stopwords=STOPWORDS,
                         random_state=42)
wordcloud = WordCloud(**q1_cloud_settings).generate(q1_corpus)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Wordcloud Question1")
plt.show()

In [None]:
# Join every cleaned question2 into one text and build the cloud from it
q2_corpus = " ".join(question2)
q2_cloud_settings = dict(background_color="white",
                         max_words=1500,
                         stopwords=STOPWORDS,
                         random_state=42)
wordcloud = WordCloud(**q2_cloud_settings).generate(q2_corpus)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Wordcloud Question2")
plt.show()

## Word2Vec modeling 

Word2Vec is a more sophisticated word embedding technique. This technique is based on the idea that words that occur in the same contexts tend to have similar meanings.   

So if two words have similar contexts, then our network is motivated to learn similar word vectors for these two words.

A simple illustration of the two Word2Vec architectures:

**CBOW model** & **skip gram model**

![](https://blog.acolyer.org/wp-content/uploads/2016/04/word2vec-cbow.png)

![](https://blog.acolyer.org/wp-content/uploads/2016/04/word2vec-skip-gram.png)

Ref : [The amazing power of word vectors](http://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)

We will use the Word2Vec model pre-trained on the **Google News corpus**.

In [None]:
# Import KeyedVectors

from gensim.models.keyedvectors import KeyedVectors
# Load the pre-trained GoogleNews vectors from the binary word2vec file;
# `model` is the word -> vector lookup that sent2vec() indexes into.
model = KeyedVectors.load_word2vec_format('/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
#  Obtain sentence vector from word vector

import gensim
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


def sent2vec(s):
    """Turn a sentence into one vector by averaging the embeddings of its words.

    The sentence is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are in the module-level ``stop_words`` list or are not purely alphabetic are
    dropped, and so are tokens that the module-level ``model`` has no vector for.

    Args:
        s: The sentence. Converted with ``str()`` before tokenising.

    Returns:
        numpy.ndarray: Mean of the word vectors that were found. If no token
        survives the filters, an all-zero vector of length ``model.vector_size``
        is returned instead of NaN.
    """
    token_list = gensim.utils.simple_preprocess(str(s))

    words_vectors = []
    for w in token_list:
        if w in stop_words or not w.isalpha():
            continue
        try:
            words_vectors.append(model[w])
        except KeyError:
            # Word is not in the embedding vocabulary: leave it out of the average
            continue

    if not words_vectors:
        # Nothing to average; avoid the 0/0 division that would produce NaN
        return np.zeros(model.vector_size)

    # Sum the word vectors and divide by how many there are (each has one entry per embedding dimension)
    words_vectors = np.array(words_vectors)
    return words_vectors.sum(axis=0) / words_vectors.shape[0]

In [None]:
# Silence NumPy floating-point warnings from here on (this is a process-wide setting)
np.seterr(all='ignore', divide='ignore')

# One 300-column row per question1, filled in row by row
n_questions = df.shape[0]
question1_vectors = np.zeros((n_questions, 300))

cleaned_q1 = df['q1_cleaned'].values
for row, text in enumerate(tqdm(cleaned_q1)):
    question1_vectors[row, :] = sent2vec(text)

In [None]:
# Shape first, then NumPy's abbreviated view of the matrix
print(question1_vectors.shape, question1_vectors, sep='\n')

In [None]:
# One 300-column row per question2, filled in row by row
n_questions = df.shape[0]
question2_vectors = np.zeros((n_questions, 300))

cleaned_q2 = df['q2_cleaned'].values
for row, text in enumerate(tqdm(cleaned_q2)):
    question2_vectors[row, :] = sent2vec(text)

In [None]:
# Shape first, then NumPy's abbreviated view of the matrix
print(question2_vectors.shape, question2_vectors, sep='\n')

We now have a sentence vector for every question1 and question2, so we can compute the distances between the two vectors of each pair.

In [None]:
from scipy.spatial.distance import cosine, jaccard, euclidean

In [None]:
# Replace NaN / inf entries once per matrix instead of rebuilding both
# full-size arrays again for every metric.
q1_finite = np.nan_to_num(question1_vectors)
q2_finite = np.nan_to_num(question2_vectors)

# NOTE(review): SciPy documents `jaccard` for boolean vectors; confirm that applying
# it to dense float embeddings gives a feature that is actually wanted.
distance_metrics = (
    ('cosine_distance', cosine),
    ('jaccard_distance', jaccard),
    ('euclidean_distance', euclidean),
)
for column, metric in distance_metrics:
    # One distance per row: question1 vector against the question2 vector of the same pair
    df[column] = [metric(u, v) for u, v in zip(q1_finite, q2_finite)]

# The cleaned copies are as large as the originals; free them once the features exist
del q1_finite, q2_finite

In [None]:
# Preview the first ten rows, which now include the engineered feature columns
df.head(10)

In [None]:
# Count missing values per column again, now that the feature columns have been added
df.isnull().sum()

## Xgboost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

In [None]:
# Drop the raw question text. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the columns are already gone.
df = df.drop(columns=['question1', 'question2'], errors='ignore')

In [None]:
# Drop the cleaned question text. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the columns are already gone.
df = df.drop(columns=['q1_cleaned', 'q2_cleaned'], errors='ignore')

In [None]:
# Preview the frame after the question text columns have been dropped
df.head()

In [None]:
# Row and question identifiers are bookkeeping values, not properties of the question
# text, so they are kept out of the feature matrix along with the label itself.
NON_FEATURE_COLUMNS = ['is_duplicate', 'id', 'qid1', 'qid2']
X = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

# Single-column frame holding the label; the training cell flattens it with .values.ravel()
y = df[['is_duplicate']]

# Hold out 30% of the pairs for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# (rows, feature columns) of the feature matrix
X.shape

In [None]:
# Shape of the label frame
y.shape

In [None]:
import xgboost as xgb

# NOTE: this rebinds `model`, which up to this cell held the word2vec KeyedVectors
# that sent2vec() looks words up in; re-running the embedding cells after this one
# requires loading the vectors again first.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=80,
    max_depth=50,
    # `eta` is XGBoost's alias for `learning_rate`, so the rate is given once,
    # under one name, to keep the effective value unambiguous.
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=.7,
    gamma=0,
    reg_alpha=4,
    # Row and column subsampling are random; state the seed explicitly
    random_state=0,
)
# fit() expects a 1-D label array, hence the ravel of the single-column frame
model = model.fit(X_train, y_train.values.ravel())

In [None]:
# Score the classifier on the held-out split
prediction = model.predict(X_test)

cm = confusion_matrix(y_test, prediction)
accuracy = accuracy_score(y_test, prediction)
report = classification_report(y_test, prediction)

# Confusion matrix, overall accuracy, then per-class precision / recall / F1
print(cm, f'Accuracy {accuracy}', report, sep='\n')