In [None]:
!pip install faiss
!pip install faiss-cpu
!pip install sentence_transformers
!pip install --user -U nltk

In [None]:
import csv
import os
from datetime import datetime
from io import StringIO

import faiss
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
# Fetch the NLTK resources this notebook uses: tokenizer data for word_tokenize and
# the English stopword list.
nltk.download("punkt")
# Recent NLTK releases load the tokenizer from "punkt_tab" instead of "punkt";
# downloading both keeps word_tokenize working whichever version pip installed.
nltk.download("punkt_tab")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download the corpus CSV (header: id,label,concept) from Google Drive.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
res = requests.get(
    "https://drive.google.com/uc?export=download&id=1OMHe8kBowS6NsjdxEADkB2wHhbaAaad4",
    timeout=60,
)
# Stop here on an HTTP error rather than parsing an error page as CSV in the next cell.
res.raise_for_status()

text = res.text
text[:100]

'id,label,concept\n0,accident,accident\n1,death,death\n2,disease,disease\n3,pneumonium,pneumonium\n4,cance'

In [None]:
# Parse the downloaded CSV text into a DataFrame and preview the first rows.
data = pd.read_csv(StringIO(text), sep=',')
data.head()

Unnamed: 0,id,label,concept
0,0,accident,accident
1,1,death,death
2,2,disease,disease
3,3,pneumonium,pneumonium
4,4,cancer,cancer


In [None]:
# Pull the 'concept' column out as a plain Python list; these strings are what gets
# encoded and indexed further down.
sentences = data['concept'].tolist()
sentences[:10]

['accident',
 'death',
 'disease',
 'pneumonium',
 'cancer',
 'heart attack',
 'illnes',
 'stroke',
 'complication',
 'infection']

In [None]:
# Values of the 'id' column, in DataFrame row order.
# NOTE(review): this list is not referenced again in the visible cells -- confirm whether
# it is still needed.
sentences_id = data['id'].tolist()
sentences_id[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
# Keep only string entries (non-string values such as missing cells are dropped),
# strip embedded newlines, then de-duplicate.
#
# dict.fromkeys keeps first-seen order, so each sentence lands at the same list position
# on every run. The previous list(set(...)) produced a different order in every
# interpreter session (str hashes are randomised), which made the row ids returned by
# the index built from this list irreproducible.
#
# Newlines are removed *before* de-duplicating so that entries differing only by a
# newline collapse into one instead of surviving as duplicates.
sentences = list(dict.fromkeys(
    sentence.replace('\n', '') for sentence in sentences if type(sentence) is str
))

In [None]:
# Save the cleaned corpus, one sentence per line; line i of the file is sentences[i].
# The encoding is explicit because the platform default may be unable to encode
# non-ASCII characters present in the sentences.
with open('backup_sentences.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(sentences))

In [None]:
# Load the pretrained sentence-embedding model and encode every corpus sentence.
# The trailing expression displays the (rows, dims) shape of the resulting matrix.
model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

(91212, 768)

In [None]:
# Number of encoded sentences (rows of the embedding matrix).
sentence_embeddings.shape[0]

91212

In [None]:
# Write the first 256 embedding rows to a sample .npy file.
# open() does not create missing parent folders, so make sure the directory exists first.
os.makedirs('./sim_sentences', exist_ok=True)
with open('./sim_sentences/embeddings_X.npy', 'wb') as fp:
    np.save(fp, sentence_embeddings[0:256])

In [None]:
# Persist the embedding matrix as consecutive chunks of `split` rows, one .npy file each
# (embeddings_0.npy, embeddings_1.npy, ...), printing the row range written to each file.
split = 256
n_rows = sentence_embeddings.shape[0]

# open() does not create missing parent folders, so make sure the directory exists first.
os.makedirs('./sim_sentences', exist_ok=True)

# file_count stays an int throughout. The old `file_count < 0` zero-padding branch could
# never fire, so file names were always unpadded; that naming is kept unchanged here.
file_count = 0
for i in range(0, n_rows, split):
    # Clamp the final chunk to the real row count. The previous clamp used n_rows + 1,
    # so the printed end of the last range overshot the data actually written.
    end = min(i + split, n_rows)
    with open(f'./sim_sentences/embeddings_{file_count}.npy', 'wb') as fp:
        np.save(fp, sentence_embeddings[i:end, :])
    print(f"embeddings_{file_count}.npy | {i} -> {end}")
    file_count += 1

embeddings_0.npy | 0 -> 256
embeddings_1.npy | 256 -> 512
embeddings_2.npy | 512 -> 768
embeddings_3.npy | 768 -> 1024
embeddings_4.npy | 1024 -> 1280
embeddings_5.npy | 1280 -> 1536
embeddings_6.npy | 1536 -> 1792
embeddings_7.npy | 1792 -> 2048
embeddings_8.npy | 2048 -> 2304
embeddings_9.npy | 2304 -> 2560
embeddings_10.npy | 2560 -> 2816
embeddings_11.npy | 2816 -> 3072
embeddings_12.npy | 3072 -> 3328
embeddings_13.npy | 3328 -> 3584
embeddings_14.npy | 3584 -> 3840
embeddings_15.npy | 3840 -> 4096
embeddings_16.npy | 4096 -> 4352
embeddings_17.npy | 4352 -> 4608
embeddings_18.npy | 4608 -> 4864
embeddings_19.npy | 4864 -> 5120
embeddings_20.npy | 5120 -> 5376
embeddings_21.npy | 5376 -> 5632
embeddings_22.npy | 5632 -> 5888
embeddings_23.npy | 5888 -> 6144
embeddings_24.npy | 6144 -> 6400
embeddings_25.npy | 6400 -> 6656
embeddings_26.npy | 6656 -> 6912
embeddings_27.npy | 6912 -> 7168
embeddings_28.npy | 7168 -> 7424
embeddings_29.npy | 7424 -> 7680
embeddings_30.npy | 7680 -> 7

In [None]:
# Embedding dimensionality (number of columns); used below to size the FAISS indexes.
d = sentence_embeddings.shape[1]
d

768

In [None]:
# nlist is passed to IndexIVFFlat in the next cell as its number of cells (inverted lists);
# the flat L2 index created here is handed to it as the quantizer.
nlist = 50
quantizer = faiss.IndexFlatL2(d)

In [None]:
# Build the IVF index over d-dimensional vectors from the quantizer and nlist defined earlier.
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [None]:
# Check the trained flag; it is False here because train() has not been called yet.
index.is_trained

False

In [None]:
# Train the index on the full embedding matrix, then confirm the trained flag is now set.
index.train(sentence_embeddings)
index.is_trained

True

In [None]:
# Add every embedding to the index; ntotal reports how many vectors it now stores.
index.add(sentence_embeddings)
index.ntotal

91212

In [None]:
# Set up one ad-hoc query: k is the number of neighbours requested and xq is the
# encoded query sentence (a one-row matrix, since a single-item list is encoded).
k = 10
xq = model.encode(["O Brasil ser/u00e1 beneficiado com a privatiza"])
# nprobe sets how many of the index's nlist cells are scanned per query (see FAISS IVF docs).
index.nprobe = 10

In [None]:
%%time
# Run the k-nearest-neighbour search; D receives the distances and I the matching row ids,
# one row per query vector.
D, I = index.search(xq, k)  # search
print(I)

[[24122 75470 87452 25139 29079 14408 45060 25980 86168 72598]]
CPU times: user 28.1 ms, sys: 3 µs, total: 28.1 ms
Wall time: 27.7 ms


In [None]:
# Pair each returned row id with the corpus sentence stored at that position.
[f'{row_id}: {sentences[row_id]}' for row_id in I[0]]

['24122: p brasiliensi',
 '75470: leishmanium braziliensi',
 '87452: l braziliensi',
 '25139: antus austerity movement in portugal',
 '29079: fungu thielaviopsi basicola',
 '14408: order in guinea bissau',
 '45060: soyouwanna know about term limit',
 '25980: country that introduce a ubi',
 '86168: revolu o do cravo 25 de abril',
 '72598: ipecacuanha']

In [None]:
# Remember the row id of the closest match for the first (only) query and echo it.
top_hits = I[0]
mapp = top_hits[0]
print(top_hits[0])

24122


In [None]:
# Build the index's id -> vector direct map, which the reconstruct() call in the next
# cell needs to look up a stored vector by id (see FAISS docs).
index.make_direct_map()

In [None]:
# Fetch the stored vector for the top match and show its first 100 components.
index.reconstruct(int(mapp))[:100]

array([-0.6178972 ,  0.2739888 ,  0.95570797,  0.29745072,  1.0419716 ,
       -0.36132774, -0.65958786, -0.17318268,  0.2676007 , -0.6550233 ,
        0.09897888,  0.697901  ,  0.4475108 ,  0.2226116 ,  1.3593298 ,
        0.21485245, -0.8364165 ,  0.6573222 , -0.5078756 , -0.3931266 ,
       -0.72877806,  0.84977466,  0.46568248, -0.4432108 ,  0.18579258,
       -0.41736174,  0.5435748 , -1.3089697 , -0.33596385,  0.02303894,
        0.10066048, -0.03405455, -0.00854224,  0.25538886, -0.156918  ,
        0.1346851 , -0.65522367, -0.1580202 ,  0.12009104, -0.6642633 ,
        0.55352855,  0.10845656,  1.2956092 ,  0.47388133,  0.08815172,
       -0.3254678 ,  0.65858364, -0.09248141, -0.41042158, -1.0980366 ,
       -0.5551447 , -0.15574197,  0.21591763,  0.6692919 , -0.6626479 ,
        0.22872925,  0.5465387 , -0.44054088, -0.5476281 , -0.9144721 ,
        0.61702687, -0.35405967,  0.22393756,  0.9222913 ,  0.14387825,
        0.03612142, -0.13033427,  0.1845754 ,  0.22886299,  0.40

In [None]:
# Download the query CSV (header: id,label,concept) from Google Drive.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
resx = requests.get(
    "https://drive.google.com/uc?export=download&id=1k_jawxvglg7odjV75fJY2L5O8rt0g0Ho",
    timeout=60,
)
# Stop here on an HTTP error rather than parsing an error page as CSV in the next cell.
resx.raise_for_status()

textx = resx.text
textx[:100]

'id,label,concept\n74356,O Brasil ser/u00e1 beneficiado com a privatiza/u00e7/u00e3o das administra/u0'

In [None]:
# Parse the downloaded query CSV text into a DataFrame and preview the first rows.
datad = pd.read_csv(StringIO(textx), sep=',')
datad.head()

Unnamed: 0,id,label,concept
0,74356,O Brasil ser/u00e1 beneficiado com a privatiza...,O Brasil ser/u00e1 beneficiado com a privatiza...
1,74357,Sensitive social and political topics should b...,Sensitive social and political topics should b...
2,74358,Organ donation,Organ donation
3,74359,search bitter //(maybe// ) truth than in lie t...,search bitter //(maybe// ) truth than in lie t...
4,74360,Option /One/:/nAgree and then honor the agreem...,Option /One/:/nAgree and then honor the agreem...


In [None]:
# Query sentences ('concept' column) that will be matched against the indexed corpus;
# the trailing expression shows how many there are.
sentence_b = datad['concept'].tolist()
len(sentence_b)

3126

In [None]:
# Peek at the first few query sentences.
sentence_b[:5]

['O Brasil ser/u00e1 beneficiado com a privatiza/u00e7/u00e3o das administra/u00e7/u00f5es de seus principais portos?',
 'Sensitive social and political topics should be discussed in school.',
 'Organ donation',
 'search bitter //(maybe// ) truth than in lie to make and more',
 'Option /One/:/nAgree and then honor the agreement . Become police .']

In [None]:
# 'id' column of the query rows, in the same row order as sentence_b.
sentence_b_id = datad['id'].tolist()
len(sentence_b_id)

3126

In [None]:
# Peek at the first few query ids.
sentence_b_id[:5]

[74356, 74357, 74358, 74359, 74360]

In [None]:
def _token_cosine(tokens_a, tokens_b, stop_words):
    """Cosine similarity of two token lists treated as binary bags of words.

    Tokens contained in ``stop_words`` are discarded first. For binary vectors the
    dot product equals the size of the intersection and each norm is the square root
    of the set size, so the score is ``len(A & B) / sqrt(len(A) * len(B))``.

    Returns 0.0 when either side has no tokens left after stopword removal.
    """
    set_a = {w for w in tokens_a if w not in stop_words}
    set_b = {w for w in tokens_b if w not in stop_words}
    denom = (len(set_a) * len(set_b)) ** 0.5
    if denom == 0:
        return 0.0
    return len(set_a & set_b) / denom


# For every query sentence: find its nearest corpus sentence through the FAISS index,
# score the pair with a token-overlap cosine, and record pairs scoring above `maxq`
# in resultSim.txt.
maxq = 0.4   # minimum token cosine for a pair to be recorded
k = 4        # neighbours requested per query; only the closest one is scored

# Built once as a set: the stopword list does not change between iterations and set
# membership avoids a linear scan for every token.
stop_words = set(stopwords.words('english'))

# Most recently recorded match (echoed again after the loop). The similarity lives in
# `max_sim` rather than `max` so the builtin max() stays usable in later cells.
max_sim = 0.00
max_sen = "random"
max_input = "random input"
max_per = "rand"

# A single handle opened in 'w' mode truncates any previous results and is guaranteed to
# be flushed and closed when the block exits. csv.writer quotes fields that contain
# commas or quotes, so sentences with commas no longer break the column layout.
with open("resultSim.txt", 'w', newline='', encoding='utf-8') as fpx:
    writer = csv.writer(fpx, lineterminator='\n')
    writer.writerow(['id', 'input', 'matched', 'sim_index', 'similarity'])

    # Iterate over every query row. The old range(len(set(sentence_b))) bound was the
    # number of *unique* sentences, which silently skipped the trailing rows whenever
    # the list contained duplicates.
    for x, Y in enumerate(sentence_b):
        if not isinstance(Y, str):
            # Non-string cells (e.g. missing values) cannot be encoded or tokenized.
            continue

        xq = model.encode([Y])
        D, I = index.search(xq, k)  # search
        nearest = I[0][0]
        if nearest < 0:
            # FAISS pads missing results with -1, and a negative value would silently
            # index `sentences` from the end.
            continue

        X = sentences[nearest]
        cosine = _token_cosine(word_tokenize(X), word_tokenize(Y), stop_words)
        if cosine > maxq:
            per = cosine * 100
            max_sim = "{:.2f}".format(cosine)
            max_sen = X
            max_input = Y
            max_per = "{:.2f}".format(per)
            # NOTE(review): the 'id' column receives the row position `x`, as before;
            # confirm whether sentence_b_id[x] was the intended value.
            writer.writerow([x, Y, X, max_sim, max_per + "%"])
            print("\n---------------------New Match Record---------------------\ninput\tmatched\tsim_index\tsimilarity","\n",max_input,"\t",max_sen,"\t",max_sim,"\t",max_per,"%")

print("---------------------Final Match Record---------------------\ninput\tmatched\tsim_index\tsimilarity","\n",max_input,"\t",max_sen,"\t",max_sim,"\t",max_per,"%")