In [5]:
import os
from dotenv import load_dotenv
from langchain_community.retrievers import PineconeHybridSearchRetriever
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Pinecone credential taken from the 'api_key' environment variable (None when unset).
api_key = os.environ.get('api_key')

In [6]:
from pinecone import ServerlessSpec, Pinecone
## Name of the Pinecone index used throughout this notebook
index_name = 'hybrid-search-langchain-pinecone'

## Pinecone client built from `api_key`
pc = Pinecone(api_key=api_key)

## Create the index only when no index with this name is listed yet
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=384,        ## dimension of dense vector
        metric='dotproduct',  ## sparse values supported only for dotproduct
        spec=serverless_spec,
    )

In [7]:
## Handle to the index named by `index_name`; the bare name below displays it as cell output
index = pc.Index(name=index_name)
index

<pinecone.db_data.index.Index at 0x767d49f35d90>

In [10]:
## Vector embeddings (dense side of the hybrid search)
## os.environ accepts only strings: assigning the result of os.getenv() directly
## raises TypeError when HF_TOKEN is not configured, so export it only when present.
hf_token = os.getenv('HF_TOKEN')
if hf_token is not None:
    os.environ['HF_TOKEN'] = hf_token
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
from pinecone_text.sparse import BM25Encoder

## Sparse encoder obtained from the library's `default()` factory.
## Called on the class itself so no extra throwaway BM25Encoder instance is constructed.
bm25_encoder = BM25Encoder.default()
bm25_encoder

[nltk_data] Downloading package stopwords to /home/keshav-
[nltk_data]     sharma/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x767bfde9bda0>

In [12]:
## Small demo corpus
sentences = [
    'In 2023, I visited paris',
    'in 2024, i visited new zealand',
    'in 2025 i am going to visit uttar pradesh',
]

## File that holds the fitted BM25 parameters
bm25_values_path = 'bm25_values.json'

## Fit the BM25 encoder on the corpus
bm25_encoder.fit(sentences)

## Persist the fitted values to a JSON file
bm25_encoder.dump(bm25_values_path)

## Rebind the name to a fresh encoder object loaded from that file
bm25_encoder = BM25Encoder().load(bm25_values_path)

100%|██████████| 3/3 [00:00<00:00, 80.78it/s]


In [13]:
## Hybrid retriever wired to the dense embeddings, the sparse BM25 encoder and the Pinecone index
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [14]:
## Index the same `sentences` list the BM25 encoder was fitted on, instead of a
## copy-pasted literal, so the fitted corpus and the indexed corpus cannot drift apart.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:02<00:00,  2.08s/it]


In [15]:
## Run a sample query through the retriever; the result is shown as the cell output
query = 'what city did i visit last'
retriever.invoke(query)

[Document(metadata={'score': 0.145863548}, page_content='in 2025 i am going to visit uttar pradesh'),
 Document(metadata={'score': 0.23627761}, page_content='In 2023, I visited paris'),
 Document(metadata={'score': 0.207164511}, page_content='in 2024, i visited new zealand')]