In [1]:
!pip install rank_bm25



In [16]:

import os
from langchain.document_loaders.pdf import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.chat_models import ChatOllama
from langchain.document_loaders.pdf import OnlinePDFLoader
from langchain_community.document_loaders import PyPDFLoader

Prior to constructing the retriever, we need to generate chunks from our original document.

## Making chunks
We’ll load a PDF document (either from the URL shown below or, as done here, from a local copy) and generate chunks using the RecursiveCharacterTextSplitter module.

In [None]:
# # for online pdfs
# url="https://s2.q4cdn.com/299287126/files/doc_financials/2023/q4/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf"

# loader = OnlinePDFLoader(url)

In [7]:
# Loader for the local copy of the PDF.
loader = PyPDFLoader(file_path="data/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf")


In [8]:
# Read the PDF through the loader configured above.
documents = loader.load()

# Split into chunks of at most 1000 characters with a 50-character overlap;
# `len` makes the splitter measure chunk size in characters.
document_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
)
chunks = document_splitter.split_documents(documents)

# Dense embedding model served through Ollama.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Plain-text view of the chunks, for the `from_texts` retriever constructors.
list_chunks = [chunk.page_content for chunk in chunks]

The next cell builds the two retrievers: a dense FAISS retriever over the chunk embeddings and a sparse BM25 retriever over the chunk text. They are later combined into an ensemble_retriever; when creating the ensemble retriever, we have the option to specify the weight for each retriever.

In [9]:
# Sparse keyword retriever: BM25 over the raw chunk text, returning 3 hits.
bm25_retriever = BM25Retriever.from_texts(list_chunks)
bm25_retriever.k = 3

# Dense retriever: FAISS index over the chunk embeddings, returning 3 hits.
faiss_vectorstore = FAISS.from_texts(list_chunks, embeddings)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 3})

## Query

We’ve configured k=3 to retrieve the top 3 documents relevant to the query. Therefore, both the dense and sparse retrievers fetch 3 documents each. The EnsembleRetriever queries multiple retrievers at runtime and combines their separate ranked lists of results using an algorithm like Reciprocal Rank Fusion, scoring documents based on their ranks across the individual lists. This allows it to leverage different retrieval algorithms without needing to create a unified index or store documents in a single structure.

In [10]:
# Question used to compare the different retrievers.
query = (
    "As of December 31st 2023, what is the leased square footage "
    "for physical stores in North America?"
)

In [11]:
# Combine the sparse and dense retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

# `invoke` is the Runnable entry point for retrievers; it replaces the
# deprecated `get_relevant_documents`, which emits a deprecation warning.
retrieved_keywords = bm25_retriever.invoke(query)
retrieved_vectors = faiss_retriever.invoke(query)
retrieved_hybrid = ensemble_retriever.invoke(query)

  warn_deprecated(


In [12]:
# Documents returned by the BM25 (keyword) retriever for `query`.
retrieved_keywords

[Document(page_content='Table of Contents\nItem 2. Properties\nAs of December 31, 2023, we operated the following facilities (in thousands):\nDescription of UseLeased Square\nFootage (1) Owned Square Footage Location\nOffice space 29,655 9,222North America\nOffice space 24,528 1,802International\nPhysical stores (2) 22,871 707North America\nPhysical stores (2) 255 —International\nFulfillment, data centers, and other 413,017 25,630North America\nFulfillment, data centers, and other 173,765 14,802International\nTotal 664,091 52,163\n ___________________\n(1)For leased properties, represents the total leased space excluding sub-leased space.\n(2)This includes 600 North America and 28 International stores as of December 31, 2023.\nSegmentLeased Square Footage\n(1)Owned Square Footage\n(1)\nNorth America 424,145 15,438\nInternational 165,329 7,931\nAWS 20,434 17,770\nTotal 609,908 41,139\n ___________________'),
 Document(page_content='Table of Contents\nForeign Exchange Risk\nDuring 2023, 

In [13]:
# Documents returned by the FAISS (dense vector) retriever for `query`.
retrieved_vectors

[Document(page_content='Table of Contents\nItem 2. Properties\nAs of December 31, 2023, we operated the following facilities (in thousands):\nDescription of UseLeased Square\nFootage (1) Owned Square Footage Location\nOffice space 29,655 9,222North America\nOffice space 24,528 1,802International\nPhysical stores (2) 22,871 707North America\nPhysical stores (2) 255 —International\nFulfillment, data centers, and other 413,017 25,630North America\nFulfillment, data centers, and other 173,765 14,802International\nTotal 664,091 52,163\n ___________________\n(1)For leased properties, represents the total leased space excluding sub-leased space.\n(2)This includes 600 North America and 28 International stores as of December 31, 2023.\nSegmentLeased Square Footage\n(1)Owned Square Footage\n(1)\nNorth America 424,145 15,438\nInternational 165,329 7,931\nAWS 20,434 17,770\nTotal 609,908 41,139\n ___________________'),
 Document(page_content='Year Ended December 31,\n2021 2022 2023\nMarketable equ

In [14]:
# Documents returned by the ensemble (BM25 + FAISS) retriever for `query`.
retrieved_hybrid

[Document(page_content='Table of Contents\nItem 2. Properties\nAs of December 31, 2023, we operated the following facilities (in thousands):\nDescription of UseLeased Square\nFootage (1) Owned Square Footage Location\nOffice space 29,655 9,222North America\nOffice space 24,528 1,802International\nPhysical stores (2) 22,871 707North America\nPhysical stores (2) 255 —International\nFulfillment, data centers, and other 413,017 25,630North America\nFulfillment, data centers, and other 173,765 14,802International\nTotal 664,091 52,163\n ___________________\n(1)For leased properties, represents the total leased space excluding sub-leased space.\n(2)This includes 600 North America and 28 International stores as of December 31, 2023.\nSegmentLeased Square Footage\n(1)Owned Square Footage\n(1)\nNorth America 424,145 15,438\nInternational 165,329 7,931\nAWS 20,434 17,770\nTotal 609,908 41,139\n ___________________'),
 Document(page_content='Table of Contents\nForeign Exchange Risk\nDuring 2023, 

## Creating chains using retrieved vectors

In [25]:
# Chat model served through Ollama; used by every chain below.
llm = ChatOllama(model="llama3")

In [26]:
# Send a trivial prompt to check that the model responds.
llm.invoke('hello')

AIMessage(content="Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?", response_metadata={'model': 'llama3', 'created_at': '2024-08-14T17:11:13.832954391Z', 'message': {'role': 'assistant', 'content': ''}, 'done': True, 'total_duration': 16744887866, 'load_duration': 9622514170, 'prompt_eval_count': 11, 'prompt_eval_duration': 1253175000, 'eval_count': 26, 'eval_duration': 5729230000}, id='run-6a1db481-1453-4f75-a4a2-501ba9343550-0')

In [27]:
# Prompt that restricts the model to the retrieved context.
prompt = ChatPromptTemplate.from_template(
    """Answer the question based only on the context provided.

Context: {context}

Question: {question}"""
)


def _build_rag_chain(retriever):
    """Return a chain that answers ``{"question": ...}`` inputs using ``retriever``.

    The chain passes the question to ``retriever`` and stores the result under
    ``context``, fills ``prompt``, calls ``llm`` and parses the reply into a
    plain string.
    """
    return (
        RunnablePassthrough.assign(context=(lambda x: x["question"]) | retriever)
        | prompt
        | llm
        | StrOutputParser()
    )


# One chain per retrieval strategy, so their answers can be compared.
chain_ensemble = _build_rag_chain(ensemble_retriever)
chain_faiss = _build_rag_chain(faiss_retriever)
chain_bm25 = _build_rag_chain(bm25_retriever)


In [28]:

# Answer the query using the ensemble (hybrid) retriever as context source.
chain_ensemble.invoke({"question": query})


'Based on the provided context, as of December 31st 2023, the leased square footage for physical stores in North America is 22,871.'

In [29]:
# Answer the query using only the FAISS (dense) retriever as context source.
chain_faiss.invoke({"question": query})

'According to the provided table, as of December 31st, 2023, the leased square footage for physical stores in North America is 22,871 thousand.'

In [30]:
# Answer the query using only the BM25 (sparse) retriever as context source.
chain_bm25.invoke({"question": query})

'Based on the provided context, the answer is:\n\n22,871 thousands'

## Qdrant use

Using Qdrant Vector Database

We’ll utilize the LangChain framework and the Qdrant vector database to build the hybrid RAG application. The initial step involves creating a collection in Qdrant.

The collection below is configured with the dot-product (“Dot”) distance metric, and its vector size must match the dimensionality of the embedding model. We first create the collection and then upload the embedded dense vectors into it.

In [39]:
import time
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm.auto import tqdm
from qdrant_client.http.models import VectorParams
from qdrant_client.http.models import PointStruct

In [44]:
# Name of the Qdrant collection used in the cells below.
# NOTE(review): the name mentions "pinecone" although the collection is created in Qdrant — consider renaming.
index_name = "langchain-pinecone-hybrid-search-index"

In [45]:
# Connect to the Qdrant server running on localhost.
client = QdrantClient("http://localhost:6333")

# Collect the names of the collections the server already holds.
existing_collections = client.get_collections().collections
existing_collection_names = [existing.name for existing in existing_collections]

In [46]:
# Create the collection only when the server does not already have it.
if index_name not in existing_collection_names:
    # NOTE(review): size=768 presumably matches the embedding model's output
    # dimension — confirm if the embedding model changes.
    dense_vector_params = VectorParams(size=768, distance="Dot")
    client.create_collection(
        collection_name=index_name,
        vectors_config=dense_vector_params,
    )

In [47]:
# Raw text of every chunk, in chunk order.
texts = [chunk.page_content for chunk in chunks]

# One dense embedding per chunk text.
vectors = embeddings.embed_documents(texts)

# Pair each text with its embedding: the list position becomes the point id
# and the text is stored in the payload under "text".
points = [
    PointStruct(id=point_id, vector=embedding, payload={"text": chunk_text})
    for point_id, (chunk_text, embedding) in enumerate(zip(texts, vectors))
]

In [49]:
# Insert (or overwrite, since ids are reused) the points in the Qdrant collection.
client.upsert(collection_name=index_name, points=points)

# Wrap the collection with LangChain's Qdrant vector store. The points keep
# their text under the "text" payload key, so the wrapper must be told to read
# that key instead of its default "page_content"; otherwise the documents it
# returns would have no content.
vector_store = Qdrant(
    client=client,
    collection_name=index_name,
    embeddings=embeddings,
    content_payload_key="text",
)

We additionally generate sparse vectors, here with FastEmbedSparse and Qdrant's BM42 sparse model. By combining these two types of vectors, we initialize the hybrid search retriever as demonstrated below:

In [None]:
!pip install fastembed


In [57]:
from langchain_qdrant import FastEmbedSparse, RetrievalMode

In [60]:
# Sparse embedding model used for the sparse side of the hybrid search.
sparse_embeddings = FastEmbedSparse(
    model_name="Qdrant/bm42-all-minilm-l6-v2-attentions",
)

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

stopwords.txt:   0%|          | 0.00/936 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

#### Hybrid Vector Search
To perform a hybrid search using dense and sparse vectors with score fusion:

- The `retrieval_mode` parameter should be set to `RetrievalMode.HYBRID`.
- A dense embeddings value should be provided to the `embedding` parameter.
- An implementation of the `SparseEmbeddings` interface using any sparse embeddings provider has to be provided as the value of the `sparse_embedding` parameter.

In [61]:
from langchain_qdrant import QdrantVectorStore

In [69]:
# Rebuild LangChain documents from the text stored in each point's payload.
# NOTE: this rebinds `documents`, replacing the pages loaded from the PDF earlier.
documents = [Document(page_content=pt.payload["text"]) for pt in points]

In [70]:
# Index the documents in an in-memory Qdrant collection with both dense and
# sparse embeddings, configured for hybrid retrieval.
qdrant = QdrantVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    collection_name="my_documents",
    location=":memory:",
)

In [71]:
# Search the hybrid store with the shared `query` instead of repeating the literal.
# Despite its name, `retriever_qdrant` holds the search results, not a retriever.
retriever_qdrant = qdrant.similarity_search(query)

In [72]:
# Documents returned by the hybrid Qdrant similarity search.
retriever_qdrant

[Document(page_content='Table of Contents\nItem 2. Properties\nAs of December 31, 2023, we operated the following facilities (in thousands):\nDescription of UseLeased Square\nFootage (1) Owned Square Footage Location\nOffice space 29,655 9,222North America\nOffice space 24,528 1,802International\nPhysical stores (2) 22,871 707North America\nPhysical stores (2) 255 —International\nFulfillment, data centers, and other 413,017 25,630North America\nFulfillment, data centers, and other 173,765 14,802International\nTotal 664,091 52,163\n ___________________\n(1)For leased properties, represents the total leased space excluding sub-leased space.\n(2)This includes 600 North America and 28 International stores as of December 31, 2023.\nSegmentLeased Square Footage\n(1)Owned Square Footage\n(1)\nNorth America 424,145 15,438\nInternational 165,329 7,931\nAWS 20,434 17,770\nTotal 609,908 41,139\n ___________________', metadata={'_id': '8eb8152c210d481bac363d7f6bb998ea', '_collection_name': 'my_doc

### Qdrant Sparse vector
Qdrant is an open-source, high-performance vector search engine/database.

QdrantSparseVectorRetriever uses sparse vectors introduced in Qdrant v1.7.0 for document retrieval.

In [73]:
# Names of the sparse-only collection and of its single sparse vector field.
collection_name = "sparse_collection"
vector_name = "sparse_vector"

# In-memory Qdrant instance.
# NOTE: this rebinds `client`, replacing the localhost client created earlier.
client = QdrantClient(location=":memory:")

# The collection has no dense vectors, only one sparse vector whose index is kept in memory.
sparse_index = models.SparseIndexParams(on_disk=False)
client.create_collection(
    collection_name,
    vectors_config={},
    sparse_vectors_config={
        vector_name: models.SparseVectorParams(index=sparse_index),
    },
)

True

In [74]:
from langchain_community.retrievers import (
    QdrantSparseVectorRetriever,
)
from langchain_core.documents import Document

In [75]:
import random


def demo_encoder(_: str) -> tuple[list[int], list[float]]:
    """Return a placeholder sparse vector as ``(indices, values)``.

    The input text is ignored. The indices are always 0..99 (all 100 values
    are sampled and then sorted) and the 100 values are drawn uniformly from
    [0.1, 1.0], so two calls generally return different values.
    """
    dimension = 100
    indices = random.sample(range(dimension), dimension)
    indices.sort()
    values = []
    for _position in range(dimension):
        values.append(random.uniform(0.1, 1.0))
    return indices, values


# Sparse-vector retriever over the in-memory collection, encoding text with `demo_encoder`.
retriever = QdrantSparseVectorRetriever(
    sparse_encoder=demo_encoder,
    sparse_vector_name=vector_name,
    collection_name=collection_name,
    client=client,
)

In [None]:
# Add the documents to the sparse collection through the retriever.
retriever.add_documents(documents)

In [77]:
# Query the sparse retriever with the shared `query` instead of repeating the literal.
# `demo_encoder` ignores its input and returns random values, so these hits are
# not expected to be relevant to the question.
retriever.invoke(query)

[Document(page_content='operation of its stores including its fulfillment network, Amazon’s acquisitions, and certain aspects of AWS’s offering of cloud services. We strongly dispute\nthese claims and intend to defend ourselves vigorously in these investigations. Similarly, we face investigations under a growing patchwork of laws and\nregulations governing the collection, use, and disclosure of data, the interpretation of which continues to evolve, leading to uncertainty about how regulators\nwill view our privacy practices. In addition, regulators and lawmakers are increasingly focused on controlling additional aspects of the operations of\ntechnology companies and companies they have characterized to be online “gatekeepers” through the application of existing regulations and laws and the\nadoption of new regulations and laws, which increases our compliance costs and limits the operation of our business. Unfavorable regulations, laws, decisions,', metadata={'_id': '3949b8a249ec416bbc2

#### Create a get relevant document function

Let’s define a `get_relevant_documents` function that queries the Qdrant vector store, and proceed.

In [82]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents.base import Document

def get_relevant_documents(query, qdrant: QdrantVectorStore, k: int = 4):
    """Return the top ``k`` documents that ``qdrant`` finds for ``query``.

    Args:
        query: Search text passed to the vector store.
        qdrant: Vector store to search. The query is embedded by the store
            itself, using whatever retrieval mode it was configured with.
        k: Maximum number of results to request. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        A new list of ``Document`` objects carrying each hit's
        ``page_content`` and ``metadata``.
    """
    hits = qdrant.similarity_search(query=query, k=k)

    # Copy each hit into a fresh Document so callers get their own list of objects.
    return [
        Document(page_content=hit.page_content, metadata=hit.metadata)
        for hit in hits
    ]


In [83]:
# Run the helper on the shared query against the hybrid store and display the hits.
final_result = get_relevant_documents(query, qdrant)
final_result

[Document(page_content='Table of Contents\nItem 2. Properties\nAs of December 31, 2023, we operated the following facilities (in thousands):\nDescription of UseLeased Square\nFootage (1) Owned Square Footage Location\nOffice space 29,655 9,222North America\nOffice space 24,528 1,802International\nPhysical stores (2) 22,871 707North America\nPhysical stores (2) 255 —International\nFulfillment, data centers, and other 413,017 25,630North America\nFulfillment, data centers, and other 173,765 14,802International\nTotal 664,091 52,163\n ___________________\n(1)For leased properties, represents the total leased space excluding sub-leased space.\n(2)This includes 600 North America and 28 International stores as of December 31, 2023.\nSegmentLeased Square Footage\n(1)Owned Square Footage\n(1)\nNorth America 424,145 15,438\nInternational 165,329 7,931\nAWS 20,434 17,770\nTotal 609,908 41,139\n ___________________', metadata={'_id': '8eb8152c210d481bac363d7f6bb998ea', '_collection_name': 'my_doc

### Creating chain and final response

In [84]:
# Retrieve context for the question the chain is invoked with, rather than
# reusing a result list computed earlier for one fixed query.
chain_new = (
    RunnablePassthrough.assign(
        context=lambda x: get_relevant_documents(x["question"], qdrant)
    )
    | prompt
    | llm
    | StrOutputParser()
)
chain_new.invoke({"question": query})

'Based on the provided context, the answer to the question is:\n\nAs of December 31st, 2023, the leased square footage for physical stores in North America is 22,871.'