In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import re

## Dataset

In [2]:
train_data = pd.read_csv('../input/quora-question-pairs/train.csv.zip')
train_data.head()

In [3]:
train_data.shape

## Randomly Sample 50,000 Rows

In [4]:
sample_data = train_data.sample(50000, random_state=2)
sample_data.head()

In [5]:
sample_data.isnull().sum()

In [6]:
# `sample_data` was drawn before this cell, so cleaning only `train_data` would leave
# any rows with missing questions in the sample that every later cell works on
# (`preprocess` would turn such a value into the literal string "nan").
# Drop incomplete rows from both frames.
train_data = train_data.dropna().reset_index(drop=True)
sample_data = sample_data.dropna()
sample_data.isnull().sum()

## Data Preprocessing

In [7]:
# Contraction -> expansion lookup, built once instead of on every `preprocess` call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Punctuation that may cling to either end of a whitespace-separated token.
# The apostrophe is deliberately left out: it is part of the contractions themselves.
_EDGE_PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

# Raw string: '\W' in a normal string literal is an invalid escape sequence.
_NON_WORD = re.compile(r'\W')


def preprocess(data):
    """Normalise one question text for embedding.

    Steps, in order: coerce to ``str``, lower-case and strip; spell out a few
    symbols (``%``, ``$``, ``₹``, ``€``, ``@``) and drop ``#`` and ``[math]``;
    expand English contractions; strip HTML markup; replace every remaining
    non-word character with a single space and strip the ends.

    Parameters
    ----------
    data : object
        The text to clean. Anything that is not a string is passed through
        ``str()`` first (so a missing value such as NaN becomes ``"nan"``).

    Returns
    -------
    str
        The cleaned text. Runs of spaces left by the final punctuation
        replacement are not collapsed.
    """
    data = str(data).lower().strip()

    # Replace special characters with their string equivalents.
    # Every replacement is padded on both sides so it never fuses with a neighbouring
    # word ("50%off" -> "50 percent off"); surplus spaces vanish in the split/join below.
    data = data.replace('%', ' percent ')
    data = data.replace('$', ' dollar ')
    data = data.replace('₹', ' rupee ')
    data = data.replace('€', ' euro ')
    data = data.replace('@', ' at ')
    data = data.replace('#', '')
    data = data.replace('u.s.', 'usa')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    data = data.replace('[math]', '')

    # Decontracting words.
    # Look the token up without the punctuation attached to its edges, so that
    # "won't?" or "(i'm" are expanded just like "won't" and "i'm".
    data_decontracted = []
    for word in data.split():
        core = word.strip(_EDGE_PUNCTUATION)
        expansion = _CONTRACTIONS.get(core)
        if expansion is not None:
            word = word.replace(core, expansion, 1)
        data_decontracted.append(word)

    data = ' '.join(data_decontracted)

    # Generic fallbacks for contractions that are not in the lookup table.
    data = data.replace("'ve", " have")
    data = data.replace("n't", " not")
    data = data.replace("'re", " are")
    data = data.replace("'ll", " will")

    # Removing HTML tags. Naming the parser avoids bs4's "no parser was explicitly
    # specified" warning and keeps the result independent of which optional
    # parsers happen to be installed.
    data = BeautifulSoup(data, 'html.parser').get_text()

    # Remove punctuations
    return _NON_WORD.sub(' ', data).strip()

In [8]:
sample_data['question1'] = sample_data['question1'].apply(preprocess)
sample_data['question2'] = sample_data['question2'].apply(preprocess)

In [9]:
sample_data.head()

In [10]:
sample_data.drop(['id','qid1','qid2','is_duplicate'],axis=1,inplace=True)

In [11]:
sample_data.head()

In [12]:
# questions = list(sample_data['question1'])+list(sample_data['question2'])
# len(questions)

In [13]:
# Independent working frame with a fresh 0..n-1 index.
new_df = sample_data.reset_index(drop=True).copy()
new_df.head(n=5)

## Tokenization

In [14]:
tokenized_question1 = new_df['question1'].apply(lambda x: x.split()) #tokenizing
tokenized_question1.head()

In [15]:
tokenized_question2 = new_df['question2'].apply(lambda x: x.split()) #tokenizing
tokenized_question2.head()

## Lemmatization

In [16]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus and the Open Multilingual WordNet package via NLTK.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Single lemmatizer instance shared by the cells below.
lemmaObj = WordNetLemmatizer()

In [17]:
lemma_tokenized_q1 = tokenized_question1.apply(lambda x: [lemmaObj.lemmatize(word,pos='v') for word in x])
lemma_tokenized_q1.head()

In [18]:
lemma_tokenized_q2 = tokenized_question2.apply(lambda x: [lemmaObj.lemmatize(word,pos='v') for word in x])
lemma_tokenized_q2.head()

In [19]:
new_df['lemma_q1'] = lemma_tokenized_q1
new_df['lemma_q1'] = new_df['lemma_q1'].apply(lambda x: ' '.join(x))

In [20]:
new_df['lemma_q2'] = lemma_tokenized_q2
new_df['lemma_q2'] = new_df['lemma_q2'].apply(lambda x: ' '.join(x))

In [21]:
new_df.head()

In [22]:
# All lemmatized questions: every lemma_q1 entry followed by every lemma_q2 entry.
lemma_questions = new_df['lemma_q1'].tolist() + new_df['lemma_q2'].tolist()
len(lemma_questions)

In [81]:
# De-duplicate while keeping first-occurrence order.
# Iteration order of a set of strings changes between interpreter runs (string hash
# randomisation), which would make the order of `unique_questions` - and therefore
# the row order of anything encoded from it - differ from run to run.
# `dict.fromkeys` removes duplicates deterministically.
unique_questions = list(dict.fromkeys(lemma_questions))
len(unique_questions)

## Model Building

In [23]:
pip install sentence-transformers

In [24]:
from sentence_transformers import SentenceTransformer

# Load the pretrained all-MiniLM-L6-v2 sentence encoder and embed every unique
# question; `embeddings` holds one vector per entry of `unique_questions`.
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences=unique_questions)
print(embeddings)

In [56]:
def similarity_scorer(question, top_k=10):
    """Print the stored questions most similar to ``question``.

    Relies on the notebook-level names ``preprocess``, ``lemmaObj``, ``model``,
    ``embeddings`` and ``unique_questions``.

    Parameters
    ----------
    question : str or list/tuple of str
        The query text. A list/tuple is joined with spaces into one query.
    top_k : int, default 10
        How many of the best matches to print.

    Returns
    -------
    None
        Each match is printed as ``<question> : <cosine similarity>``,
        best match first.
    """
    # Join list/tuple input explicitly instead of letting `preprocess` stringify
    # the container (which would feed the list repr's brackets and quotes through
    # the text cleaning).
    if isinstance(question, (list, tuple)):
        question = ' '.join(str(part) for part in question)

    # Apply the same cleaning and verb lemmatization that produced the encoded
    # corpus, so the query and the stored questions are compared in the same form.
    cleaned = preprocess(question)
    cleaned = ' '.join(lemmaObj.lemmatize(word, pos='v') for word in cleaned.split())

    query_vector = model.encode(cleaned)
    scores = cosine_similarity([query_vector], embeddings)[0]

    # Stable sort on the negated scores: highest similarity first, ties keep
    # their original corpus order.
    top_k = max(int(top_k), 0)
    best_rows = np.argsort(-scores, kind='stable')[:top_k]

    for row in best_rows:
        # `embeddings` was computed from `unique_questions`, so row indices must be
        # resolved against that list; `lemma_questions` still contains duplicates
        # and is ordered differently.
        print(unique_questions[row], ":", scores[row])

In [57]:
question11 = ['How to earn money from online.']

In [58]:
similarity_scorer(question11)

In [32]:
import pickle

# Context managers make sure every file is flushed and closed, even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('embeddings.pkl', 'wb') as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

# `embeddings` was encoded from `unique_questions`; save the texts alongside the
# vectors so a row index can be mapped back to its question in another process.
with open('unique_questions.pkl', 'wb') as questions_file:
    pickle.dump(unique_questions, questions_file)