In [1]:
import time
import os
import itertools
import numpy as np
import pandas as pd
from IPython.display import clear_output

import pickle
def save_obj(obj:object,name:str):
    ext = '.pickle'
    with open(name + ext, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name:str)->object:
    ext = '.pickle'
    with open(name + ext, 'rb') as handle:
        return pickle.load(handle)

In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [57]:
train_path = "data/plagiarism/quora-question-pairs/train.csv"
df = pd.read_csv(train_path)

In [35]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [6]:
# encode the questions in batches of 100
batch_size = 50
q1_embs, q2_embs = [], []
length = len(df)
t1 = time.perf_counter()
for i in range(0, length, batch_size):
    batch = df.iloc[i:i+batch_size]
    print("Processing batch {}/{}".format(i//batch_size+1, (length//batch_size)+1))
    q1_embs.append(model.encode(batch['question1'].values.tolist()))
    q2_embs.append(model.encode(batch['question2'].values.tolist()))
    clear_output()
t2 = time.perf_counter()
print("Encoding time: {}".format(t2-t1))

Encoding time: 13701.7672002


In [11]:
# save the embeddings
save_obj(q1_embs, "data/plagiarism/quora-question-pairs/resluts/q1_embs")
save_obj(q2_embs, "data/plagiarism/quora-question-pairs/resluts/q2_embs")

In [46]:
q1 = []
q2 = []
for i in range(len(q1_embs)):
    q1.extend(q1_embs[i])
    q2.extend(q2_embs[i])
q1 = np.array(list(map( lambda x: x.reshape(1,-1) ,q1)))
q2 = np.array(list(map( lambda x: x.reshape(1,-1) ,q2)))

# test data

In [3]:
df_test = pd.read_csv("data/plagiarism/quora-question-pairs/test.csv")
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [4]:
# encode the questions in batches of 100
batch_size = 50
q1_embs, q2_embs = [], []
length = len(df_test)
t1 = time.perf_counter()
for i in range(0, length, batch_size):
    batch = df_test.iloc[i:i+batch_size]
    print("Processing batch {}/{}".format(i//batch_size+1, (length//batch_size)+1))
    q1_embs.append(model.encode(batch['question1'].values.tolist()))
    q2_embs.append(model.encode(batch['question2'].values.tolist()))
    clear_output()
t2 = time.perf_counter()
print("Encoding time: {}".format(t2-t1))

Processing batch 5611/46916


**280500 question is enough as it is very time consuming**

In [None]:
# save the embeddings
save_obj(q1_embs, "data/plagiarism/quora-question-pairs/resluts/q1_embs")
save_obj(q2_embs, "data/plagiarism/quora-question-pairs/resluts/q2_embs")