In [None]:
!pip install --upgrade pip
!pip install tensorflow
!pip install tensorflow-hub
!pip install faiss-cpu

In [1]:
!pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.11.0-cp310-cp310-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.11.0
  Using cached tensorflow_intel-2.11.0-cp310-cp310-win_amd64.whl (266.3 MB)
Installing collected packages: tensorflow-intel, tensorflow
Successfully installed tensorflow-2.11.0 tensorflow-intel-2.11.0




In [3]:
import tensorflow_hub as hub
import numpy as np
import os

In [4]:
# Fetch the pre-trained Universal Sentence Encoder (version 4) from TF Hub.
# The loaded object is called directly on a list of strings in the cells below.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(USE_MODEL_URL)


## Use case 1: Compare a given sentence to a list of sentences

In [31]:
# Classify a sentence as a greeting by comparing it with known greetings.
#
# The encoder returns one 512-dimensional vector per input sentence, so the
# greeting list below becomes an 11 x 512 matrix. The embeddings are treated as
# normalized here (TODO confirm), which lets the inner product act as a
# similarity score; a score above 0.8 is taken to mean "probably similar".

# Known greeting phrases that play the role of the reference index.
greets = [
    "What's up?",
    "It is a pleasure to meet you.",
    "How do you do?",
    "Top of the morning to you!",
    "Hi",
    "How are you doing?",
    "Hello",
    "Greetings!",
    "Hi, How is it going?",
    "Hi, nice to meet you.",
    "Nice to meet you.",
]
greet_matrix = encoder(greets)

# Sentence to classify, embedded as a single-row batch.
test_text = "Hello, How are you doing"
test_embed = encoder([test_text])

# Row 0 holds the similarity of the test sentence to every known greeting.
similarity_matrix = np.inner(test_embed, greet_matrix)
print(similarity_matrix)

if not similarity_matrix.max() > 0.8:
    print("it is not a greetings")
else:
    print("it is a greetings")
    # Column of the best-scoring greeting for the (single) test row.
    idx_max = similarity_matrix.argmax(axis=1)
    print(f"The most similar pharse is {greets[idx_max[0]]}")

[[0.6636699  0.37653524 0.402884   0.22991467 0.48928195 0.9062629
  0.51853967 0.42576593 0.7364635  0.5398003  0.41996193]]
it is a greetings
The most similar pharse is How are you doing?


## Use case 2: Filter out duplicate sentences

In [32]:
sentences = [
    "How old are you",
    "What is your age",
    "I love to watch Television",
    "I am wearing a wrist watch",
    "I am wearing a wrist watch"
]

# Embed the sentences in this cell: the only other assignment of `vectors` is
# in a later cell, so without this line the cell fails with NameError when the
# notebook is run top to bottom on a fresh kernel.
vectors = encoder(sentences)

# Inner product of the embedding matrix with itself: entry [i, j] is the
# similarity score between sentence i and sentence j.
similarity_matrix = np.inner(vectors, vectors)
print(similarity_matrix)
print(similarity_matrix.shape)

def redundant_sent_idx(sim_matrix, threshold=0.8):
    """Return the indexes of sentences that duplicate an earlier sentence.

    Rows are scanned in order. For every row that has not itself been flagged,
    each later column whose similarity exceeds `threshold` is flagged as a
    duplicate. The matches found for each scanned row are printed.

    Args:
        sim_matrix: square pairwise-similarity matrix (any array-like).
        threshold: similarity above which two sentences count as duplicates.

    Returns:
        List of flagged indexes in the order they were first found. Each index
        appears once, even when it matches several retained sentences.
    """
    sim_matrix = np.asarray(sim_matrix)
    dup_idx = []
    flagged = set()
    for i in range(sim_matrix.shape[0]):
        # A sentence already marked as a duplicate is not used as a reference.
        if i in flagged:
            continue
        # Only look to the right of the diagonal; offsets are relative to i + 1.
        offsets = np.where(sim_matrix[i][i + 1:] > threshold)[0]
        matches = [int(t) + i + 1 for t in offsets]
        print(matches)
        for j in matches:
            if j not in flagged:
                flagged.add(j)
                dup_idx.append(j)
    return dup_idx
# Positions of the sentences flagged as duplicates of an earlier sentence.
dup_indexes = redundant_sent_idx(similarity_matrix)

# Remove the flagged positions; the remaining sentences keep their order.
sentence_array = np.array(sentences)
unique_messages = np.delete(sentence_array, dup_indexes)
unique_messages


[[1.         0.8015871  0.07059961 0.09362547 0.09362547]
 [0.8015871  0.9999999  0.10191173 0.17093717 0.17093718]
 [0.07059961 0.10191173 1.         0.37186104 0.3718611 ]
 [0.09362547 0.17093717 0.37186104 0.9999999  1.        ]
 [0.09362547 0.17093718 0.3718611  1.         1.        ]]
(5, 5)
[1]
[]
[4]


array(['How old are you', 'I love to watch Television',
       'I am wearing a wrist watch'], dtype='<U26')

## Use case 3: Semantic search

In [33]:
# Corpus to make searchable. The whole list is passed to the encoder in one
# call.
sentences = [
    "How old are you",
    "What is your age",
    "I love to watch Television",
    "I am wearing a wrist watch",
    "I am wearing a wrist watch",
]

# One embedding row per sentence, in the same order as `sentences`.
vectors = encoder(sentences)

# Show the embedding of each sentence.
print(vectors)

tf.Tensor(
[[-0.06045129 -0.00204539  0.02656927 ...  0.00764414 -0.02669661
   0.05110301]
 [-0.08415683 -0.08687922  0.03446118 ... -0.01439385 -0.04546221
   0.03639965]
 [ 0.0816019  -0.01570279 -0.05659246 ... -0.07133697  0.11040761
  -0.00710947]
 [-0.00369538  0.03064634 -0.05556112 ...  0.01751422  0.03164959
  -0.05139377]
 [-0.00369538  0.03064634 -0.05556112 ...  0.01751422  0.03164959
  -0.05139377]], shape=(5, 512), dtype=float32)


In [48]:
import faiss
import time

# Build a flat L2-distance FAISS index over the sentence embeddings.
# FAISS works on float32 NumPy matrices, so convert the encoder output
# explicitly rather than handing it the framework tensor.
embedding_matrix = np.asarray(vectors, dtype="float32")
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Persist the index to disk and load it back.
faiss.write_index(index, 'search_index')
index = faiss.read_index('search_index')

In [49]:
def search(query, k):
    """Return up to `k` indexed sentences closest to `query`.

    The query is embedded with the global `encoder`, looked up in the global
    FAISS `index`, and the returned row ids are mapped back to `sentences`.
    The raw FAISS result and the elapsed time are printed.

    Args:
        query: text to search for.
        k: number of nearest neighbours to request from the index.

    Returns:
        List of matching sentences in the order FAISS returns them. It is
        shorter than `k` when the index holds fewer than `k` vectors.
    """
    t = time.time()
    # FAISS works on float32 NumPy arrays; convert the encoder output explicitly.
    query_vector = np.asarray(encoder([query]), dtype="float32")
    # Tuple of (distances, ids) of the nearest neighbours for the single query.
    top_k = index.search(query_vector, k)
    print(top_k)
    print('totaltime: {}'.format(time.time() - t))
    # FAISS pads missing neighbours with id -1. Skip those, otherwise
    # sentences[-1] would silently return the last sentence as a "match".
    return [sentences[_id] for _id in top_k[1].tolist()[0] if _id >= 0]

search('age', 2)

(array([[1.0904288, 1.2235966]], dtype=float32), array([[1, 0]], dtype=int64))
totaltime: 0.005011796951293945


['What is your age', 'How old are you']