In [1]:
! pip install --upgrade --quiet pinecone-client pinecone-text pinecone-notebooks

In [2]:
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv; PINECONE_API_KEY is read from
# the environment when the Pinecone client is created further down.
load_dotenv()
from langchain_community.retrievers import PineconeHybridSearchRetriever


In [4]:
from pinecone import Pinecone, ServerlessSpec

index_name = 'hybrid-search-langchain-pinecone'

# The API key is taken from the environment rather than hardcoded.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# list_indexes() yields index descriptions, not plain name strings, so testing
# `index_name not in pc.list_indexes()` is always true and create_index would
# be attempted (and fail) on every re-run once the index exists. Compare
# against .names() so this cell is safe to run repeatedly.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # matches the 384-dimensional all-MiniLM-L6-v2 embeddings used below
        metric='dotproduct',  # metric Pinecone requires for sparse-dense (hybrid) queries
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [5]:
# Obtain a handle to the index by name; the retriever below is built on it.
index=pc.Index(index_name)

In [6]:
# Display the index handle's repr.
index

<pinecone.data.index.Index at 0x1ecf51fd6f0>

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Dense embedding model handed to the hybrid retriever later in the notebook.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
# Display the embedding wrapper's repr (shows the underlying model configuration).
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [21]:
from pinecone_text.sparse import BM25Encoder

# default() is a factory on the class itself; call it directly instead of
# constructing a throwaway BM25Encoder() instance just to reach it.
# NOTE(review): this encoder is re-fitted on `sentences` in a later cell —
# confirm whether loading the default parameters here is still needed.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x1ecb11bada0>

In [22]:
# Small demo corpus: used both to fit the BM25 encoder and as the texts added
# to the retriever.
sentences = [
    "Paris is renowned for its art, fashion, and iconic landmarks like the Eiffel Tower.",
    "Germany is known for its rich history, technological innovations, and vibrant cultural festivals.",
    "Boho style is characterized by its free-spirited, eclectic mix of patterns, textures, and colors.",
]


In [23]:
# Fit the BM25 encoder on the demo corpus.
bm25_encoder.fit(sentences)

# Write the fitted values to disk, then read them back into a fresh encoder.
bm25_encoder.dump("bm25_values.json")
bm25_encoder = BM25Encoder().load("bm25_values.json")

100%|██████████| 3/3 [00:00<00:00, 125.99it/s]


In [24]:
# Hybrid retriever: dense embeddings plus the sparse BM25 encoder, backed by
# the Pinecone index.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    index=index,
    sparse_encoder=bm25_encoder,
)

In [25]:
# Add the demo sentences to the retriever so they can be queried.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:03<00:00,  3.41s/it]


In [27]:
# Query the hybrid retriever.
# NOTE(review): in the recorded output the Germany sentence ranks above the
# Paris sentence even though only the latter mentions art — worth examining
# before relying on the ranking.
retriever.invoke('which country is known for its art?')

[Document(page_content='Germany is known for its rich history, technological innovations, and vibrant cultural festivals.'),
 Document(page_content='Paris is renowned for its art, fashion, and iconic landmarks like the Eiffel Tower.'),
 Document(page_content='Boho style is characterized by its free-spirited, eclectic mix of patterns, textures, and colors.')]