In [2]:
import pandas as pd

# Show up to 100 characters per column before pandas truncates the cell text,
# so the sample sentences stay readable when a frame is displayed.
pd.options.display.max_colwidth = 100

In [3]:
# Read the sample sentences from the CSV next to this notebook,
# then display (rows, columns) as a quick sanity check of the load.
df = pd.read_csv(filepath_or_buffer="sample_text.csv")
df.shape

(8, 2)

### Create Source embeddings for the text column

In [4]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model; the same `encoder` object is
# reused later to embed the search query so both live in one vector space.
encoder = SentenceTransformer("all-mpnet-base-v2")

# Hand the encoder a plain Python list of strings instead of a pandas Series:
# a list is always indexed by position, so the call does not depend on the
# DataFrame having a default 0..n-1 index.
vectors = encoder.encode(df["text"].tolist())

# (number of sentences, embedding dimension)
vectors.shape

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(8, 768)

In [5]:
# Width of one embedding row; the FAISS index must be created with this size.
dim = vectors.shape[-1]

### Build a FAISS Index for vectors


In [6]:
import faiss

# Create an empty flat L2 index sized for `dim`-dimensional vectors.
# Nothing is stored yet; vectors are added in a later cell.
index = faiss.IndexFlatL2(dim)
index


<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001D27BBB28B0> >

### Normalize the source vectors (on unit-length vectors, L2 distance ranks results the same way as cosine similarity) and add them to the index

In [7]:
# Work on a float32 copy so `vectors` keeps the raw encoder output
# (astype returns a new array; faiss.normalize_L2 rescales rows in place).
unit_vectors = vectors.astype("float32")

# Scale every row to unit length, as the heading above describes. For unit
# vectors ||a - b||^2 = 2 - 2*cos(a, b), so L2 ranking matches cosine ranking.
faiss.normalize_L2(unit_vectors)

# Empty the index first so re-running this cell does not store the same rows twice.
index.reset()
index.add(unit_vectors)  # Adding vectors to the faiss index

assert index.ntotal == len(unit_vectors), "index should hold exactly one entry per source row"

### Encode the search text using the same encoder and normalize the output vector

In [8]:
search_query = "I want to buy a polo t-shirt"

# Embed the query with the same model that produced the indexed vectors.
# normalize_embeddings=True asks the encoder for a unit-length vector, which is
# the normalization step the heading above calls for.
vec = encoder.encode(search_query, normalize_embeddings=True)
vec.shape

(768,)

In [9]:
import numpy as np

# index.search only accepts a 2-dimensional array (one row per query),
# so turn the single query embedding into a one-row matrix.
svec = np.reshape(np.array(vec), (1, -1))
svec.shape

(1, 768)

In [12]:
# Ask the index for the two stored vectors closest to the query.
# The search returns a pair of arrays: distances first, then the matching ids.
top_k = 2
distances, neighbor_ids = index.search(svec, top_k)
distances, neighbor_ids

(array([[1.3844836, 1.4039094]], dtype=float32), array([[3, 2]], dtype=int64))