In [1]:
# Install/upgrade the Pinecone client, pinecone-text (provides the BM25 sparse
# encoder used below) and pinecone-notebooks; %pip targets the kernel's environment.
# NOTE(review): later cells also import langchain_community, langchain_huggingface
# and dotenv, which are not installed here — confirm the environment provides them.
%pip install --upgrade --quiet pinecone pinecone-text pinecone-notebooks

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Read the Pinecone API key from the PINECONE_API_KEY environment variable so the
# real secret never has to be written into (and committed with) the notebook.
# The placeholder fallback only keeps the name defined; Pinecone calls will not
# authenticate until a real key is exported.
api_key = os.environ.get("PINECONE_API_KEY", "your_api_key")

In [3]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [4]:
import os
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index this notebook works against.
index_name = "hybrid-search-langchain-pinecone"

# Pinecone client authenticated with the key defined in an earlier cell.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet, so the cell can be re-run.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,        # dimension of the dense vectors
        metric="dotproduct",  # sparse values are supported only for dotproduct
        spec=serverless_spec,
    )

In [5]:
# Obtain a handle to the index; the bare name on the last line displays it.
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x201ced50320>

In [7]:
# Dense vector embedding setup (the sparse encoder is configured in a later cell).
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings

# Load variables from a local .env file, if one exists, into the environment.
load_dotenv()

# Only re-export HF_TOKEN when it is actually set: os.environ values must be
# strings, so assigning the None returned by os.getenv() for a missing variable
# raises TypeError and would abort this cell.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

# Dense embedding model; its output size must match the index dimension (384).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


model.safetensors:  23%|##3       | 21.0M/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Display the embeddings object to inspect its configuration.
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
from pinecone_text.sparse import BM25Encoder

# default() is a static factory that returns a ready-made encoder, so call it on
# the class instead of constructing a throwaway BM25Encoder() instance first.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x2018cc3ec00>

In [10]:
# Small demo corpus used to fit the sparse encoder.
sentences = [
    "The cat jumped over the fence.",
    "She found a hidden note under the table.",
    "The sky turned orange at sunset.",
]

# File that holds the fitted encoder values.
bm25_params_path = "bm25_values.json"

# Fit the encoder on the corpus, then write the fitted values to the JSON file.
bm25_encoder.fit(sentences)
bm25_encoder.dump(bm25_params_path)

# Rebuild the encoder from the values that were just saved.
bm25_encoder = BM25Encoder().load(bm25_params_path)

  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Combine the dense embeddings, the BM25 sparse encoder and the Pinecone index
# into a single hybrid-search retriever.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [12]:
# Display the retriever to confirm which embeddings, sparse encoder and index it holds.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000002018BD99E50>, index=<pinecone.data.index.Index object at 0x00000201CED50320>)

In [14]:
# Index the same corpus the BM25 encoder was fitted on. Reusing `sentences`
# (defined in the fitting cell) instead of repeating the literal keeps the
# fitted corpus and the indexed corpus from drifting apart.
retriever.add_texts(sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# Run a natural-language query through the hybrid retriever; the result is
# displayed as a list of Document objects carrying a 'score' in their metadata.
retriever.invoke("What color did the sky turn at sunset?")

[Document(metadata={'score': 0.560034871}, page_content='The sky turned orange at sunset.'),
 Document(metadata={'score': 0.0373490602}, page_content='She found a hidden note under the table.'),
 Document(metadata={'score': 0.0180952363}, page_content='The cat jumped over the fence.')]