## Disclaimer

In [None]:
# This notebook is based on the work here:
# https://medium.com/loopio-tech/how-to-use-faiss-to-build-your-first-similarity-search-bf0f708aa772
# It replicates that article for the purpose of education and learning.
#
# Written as comments rather than a bare string literal: a string that is the
# last expression of a cell gets echoed back as an escaped repr in the output.

## Dependencies

In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

'\nThis notebook is based on the work here: https://medium.com/loopio-tech/how-to-use-faiss-to-build-your-first-similarity-search-bf0f708aa772\nReplicating it for the purpose of education and learning\n'

### Step 1: Create a dataframe with the existing text and categories

In [5]:
# Toy corpus: every row pairs a piece of text with its category label.
data = [
    ['Where are your headquarters located?', 'location'],
    ['Throw my cellphone in the water', 'random'],
    ['Network Access Control?', 'networking'],
    ['Address', 'location'],
]
df = pd.DataFrame(data, columns=['text', 'category'])
display(df)

Unnamed: 0,text,category
0,Where are your headquarters located?,location
1,Throw my cellphone in the water,random
2,Network Access Control?,networking
3,Address,location


### Step 2: Create vectors from the text

In [8]:
# `text` stays a Series for later reference, but the encoder is handed a plain
# list of strings: that is the documented input type for `encode`, and it does
# not rely on the dataframe having a default 0..n-1 index.
text = df['text']
encoder = SentenceTransformer("all-MiniLM-L6-v2")
vectors = encoder.encode(text.tolist())

### Step 3: Build a FAISS index from the vectors

In [9]:
# Scale every row of `vectors` to unit length. The call returns nothing and
# modifies the array it is given, so `vectors` itself is normalized from here on.
faiss.normalize_L2(vectors)

# A flat (exhaustive) L2 index sized to the embedding width, filled with the
# normalized corpus vectors. Row i of `vectors` becomes id i in the index.
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
index.add(vectors)

### Step 4: Create a search vector

In [11]:
# Embed the query with the same encoder that produced the corpus vectors.
search_text = 'where is your office?'
search_vector = encoder.encode(search_text)

# FAISS searches a 2-D batch of queries, so lift the single embedding to shape
# (1, dim). The explicit copy keeps `search_vector` untouched when the batch is
# normalized in place below.
_vector = search_vector[np.newaxis, :].copy()
faiss.normalize_L2(_vector)

### Step 5: Search

In [12]:
# Ask for as many neighbours as there are vectors in the index, i.e. rank the
# whole corpus against the query.
k = index.ntotal

# `distances` and `ann` each hold one row per query (a single row here);
# `ann` contains the ids of the matched corpus vectors.
distances, ann = index.search(_vector, k)

### Step 6: Collect the search results (FAISS already returns them sorted by ascending distance)

In [14]:
# Only one query was searched, so row 0 of each output array is the full answer.
results = pd.DataFrame(
    dict(
        distances=distances[0],
        ann=ann[0],
    )
)
display(results)

Unnamed: 0,distances,ann
0,0.575452,0
1,1.183322,3
2,1.56556,1
3,1.759201,2


### Step 7: Get category for the search text

In [16]:
# Attach the original text and category to each hit by matching the FAISS id
# in `ann` against the index of `df`.
merge = results.merge(df, left_on='ann', right_index=True)
display(merge)

Unnamed: 0,distances,ann,text,category
0,0.575452,0,Where are your headquarters located?,location
1,1.183322,3,Address,location
2,1.56556,1,Throw my cellphone in the water,random
3,1.759201,2,Network Access Control?,networking


In [17]:
# The category of the closest match becomes the category of the search text.
labels = df['category']

# ann[0][0] is the id of the best hit for the (only) query. FAISS ids are
# insertion positions, so look the label up positionally with .iloc instead of
# by index label; this stays correct even if `df` does not have a 0..n-1 index.
category = labels.iloc[ann[0][0]]

In [22]:
# Report the predicted category together with the query that was actually
# searched, rather than a hard-coded copy of it that can drift out of sync.
print(f"This will give us the category of **{category}** for our search text “{search_text}”")

This will give us the category of **location** for our search text “Where is your office?”
