Import Some Necessary Library

In [1]:
import requests
from io import StringIO
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

Import Text data on Which We will find Similarities

In [2]:
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
# create dataframe
data = pd.read_csv(StringIO(res.text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [3]:
data.shape

(4500, 5)

In [4]:
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [5]:
len(sentences)

4500

In [6]:
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)  # merge them
len(set(sentences))  # together we have around 4.5K unique sentences

4802

In [8]:
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

In [10]:
# each of these dataset have the same structure, so we loop through each creating our sentences data
for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list 
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

In [11]:
len(set(sentences))

14505

In [12]:
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [13]:
len(sentences)

14504

BERT Model for Making Embedding of Our Txt data

In [15]:

# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(14504, 768)

In [16]:
sentence_embeddings.shape

(14504, 768)

In [17]:
dim = sentence_embeddings.shape[1]
dim

768

Utilizing IndexFlatL2 for similarity Calculation

In [18]:
index = faiss.IndexFlatL2(dim)

In [19]:
index.is_trained # no need to train index

True

In [20]:
index.add(sentence_embeddings)

In [21]:
index.ntotal

14504

In [22]:
k = 4 # find nearest 4 vector 
xq = model.encode(["boys are playing outdoors"])

In [25]:
%%time
D, Idx = index.search(xq, k)  # search
print(Idx)

[[13272  3284  6170 10141]]
CPU times: total: 15.6 ms
Wall time: 7.27 ms


In [26]:
for i in Idx[0].tolist():
    print(sentences[i])

The young boys are playing outdoors and the man is smiling nearby
The kids are playing outdoors near a man with a smile
A group of boys in a yard is playing and a man is standing in the background
Two little boys are playing on a playground.


In [31]:
# we have 4 vectors to return (k) - so we initialize a zero array to hold them
vecs = np.zeros((k, dim))
# then iterate through each ID from Idx and add the reconstructed vector to our zero-array
for i, val in enumerate(Idx[0].tolist()):
    vecs[i, :] = index.reconstruct(val)

In [32]:
vecs.shape

(4, 768)