In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)

In [2]:
df = pd.read_csv("chunks.csv")
df.shape

(155, 1)

In [3]:
df

Unnamed: 0,text
0,"English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\n..."
1,My Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts...
2,Chat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC...
3,Business\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\...
4,Trending Topics\n\nBusiness News LiveBudget 2025 News LiveSat Kartar Shopping Share PriceWorld B...
...,...
150,Quick Links\n\nAbout Us Contact Us Advisory Alert Advertise with Us SupportDisclaimer Privacy Po...
151,Copyright © Network18 Media & Investments Limited. All rights reserved. Reproduction of news art...
152,without express written permission of moneycontrol.com is prohibited.
153,You got 30 Day’s Trial of\n\nGrab it Now\n\nAd-Free Experience\n\nTop Articles from Financial Ti...


### Convert text to documents

In [14]:
from langchain.docstore.document import Document

documents = [Document(page_content=chunk) for chunk in df.text]

### SentenceTransformer
The SentenceTransformer class provides an easy way to load pre-trained models to convert sentences into fixed-size vector embeddings.

In [18]:
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

# Initialize your sentence transformer model
encoder = SentenceTransformer("all-mpnet-base-v2")

# Generate embeddings for the documents
embeddings = [encoder.encode(doc.page_content) for doc in documents]

# Create embeddings for documents
faiss_index = FAISS.from_documents(documents, embeddings)


AttributeError: 'list' object has no attribute 'embed_documents'

In [5]:
vectors.shape

(155, 768)

In [6]:
dim = vectors.shape[1]
dim

768

### Index Creation and Adding Sentences
- A FAISS index is created to store sentence embeddings using L2 distance as the similarity measure.
- Precomputed sentence embeddings are added to the index for efficient searching.

In [7]:
import faiss
index = faiss.IndexFlatL2(dim)
index.add(vectors)

### Query Embedding and Reshape Query Vector
- The input query sentence is converted into a vector embedding.
- The query vector is reshaped into the correct shape for FAISS to process.

In [8]:
query = "According to the bank's filing to the stock exchanges, for how long Chakrabarti has been re-appointed?"
vec = encoder.encode(query)

import numpy as np
svec = np.array(vec).reshape(1,-1)
svec.shape

(1, 768)

### Search
FAISS performs a nearest neighbor search to find the 2 closest sentences to the query, based on the L2 distance.

In [9]:
distances, I = index.search(svec, k=2)
distances, I

(array([[0.11929663, 0.33960286]], dtype=float32),
 array([[8, 5]], dtype=int64))

In [10]:
row_indices = I.tolist()[0]
df.loc[row_indices]

Unnamed: 0,text
8,"According to the bank's filing to the stock exchanges, Chakrabarti has been re-appointed for a p..."
5,"Chakrabarti has been appointed for a period of five years from December 14, 2023 to December 13,..."


### Store the FAISS index locally

In [11]:
import pickle
file_path="vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(index, f)

In [12]:
!pip install pickle5

Collecting pickle5
  Using cached pickle5-0.0.11.tar.gz (132 kB)
Using legacy 'setup.py install' for pickle5, since package 'wheel' is not installed.
Installing collected packages: pickle5
    Running setup.py install for pickle5: started
    Running setup.py install for pickle5: finished with status 'error'


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\91967\Desktop\env + jupyter\myenv\Scripts\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\91967\\AppData\\Local\\Temp\\pip-install-p60cbx0b\\pickle5_7998c63cce6045098c815d48697ba999\\setup.py'"'"'; __file__='"'"'C:\\Users\\91967\\AppData\\Local\\Temp\\pip-install-p60cbx0b\\pickle5_7998c63cce6045098c815d48697ba999\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' install --record 'C:\Users\91967\AppData\Local\Temp\pip-record-jhicejky\install-record.txt' --single-version-externally-managed --compile --install-headers 'C:\Users\91967\Desktop\env + jupyter\myenv\include\site\python3.10\pickle5'
         cwd: C:\Users\91967\AppData\Local\Temp\pip-install-