In [24]:
from dotenv import load_dotenv
import os
load_dotenv()
pinecone_api_key = os.getenv('PINECONE_API')


In [25]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [28]:
import os
from pinecone import Pinecone, ServerlessSpec
#index name should be small otherwise error
index_name = "hybrid-search-langchain-using-pinecone"
#initializing pinecone clinet
pc = Pinecone(api_key=pinecone_api_key, environment='us-east-1-aws')


In [33]:
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=384,  # dimension of dense vector
        metric="dotproduct",  # for sparse value creation we use dotproduct
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

Index 'hybrid-search-langchain-using-pinecone' created.


In [34]:
index = pc.Index(index_name)

In [35]:

index

<pinecone.data.index.Index at 0x7f6674538b80>

In [36]:
#create embedding for dense matrix
os.environ['HF_TOKEN'] = os.getenv("HF")
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

In [37]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [38]:
# create embedding for sparse matrix
#BM25Encoder uses TFIDF encoder
from pinecone_text.sparse import BM25Encoder
bm25_encoder = BM25Encoder().default()
bm25_encoder


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7f6674538760>

In [39]:
sentences  =[
    "where are you now",
    "i am at the highest peak of the world",
    "I am in Nepal"
]
#apply tfidf values on these sentences
bm25_encoder.fit(sentences)

#store in json file
bm25_encoder.dump("bm25_values.json")
bm25_encoder = BM25Encoder().load("bm25_values.json")

100%|██████████| 3/3 [00:00<00:00, 5216.80it/s]


In [40]:
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7f66787444c0>

In [41]:
#create retriever to combine both
retriever = PineconeHybridSearchRetriever(embeddings=embeddings,sparse_encoder=bm25_encoder,index=index_name)

In [42]:
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x7f66787444c0>, index='hybrid-search-langchain-using-pinecone')

In [44]:
retriever.add_texts(["foo", "bar", "world", "hello"])



  0%|          | 0/1 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'upsert'