Combine ParentdocumentRetriever with Reranking: Rerank retrieved child chunks #21966
-
Checked other resources
Commit to Help
Example Code:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.retrievers import MultiVectorRetriever
# PG Vector von Kheiri
from langchain_community.vectorstores.pgvector import PGVector
# Neues PG Vector von Langchain
#from langchain_postgres import PGVector
#from langchain_postgres.vectorstores import PGVector
import os
import shutil
import json
from langchain.load import dumps, loads
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
def build_parent_document_retriever_store(directory_path, embeddings, collection_name, connection, llm):
    """Build and persist a ParentDocumentRetriever backed by PGVector.

    Reads and chunks the documents under ``directory_path``, wipes any
    previously persisted parent-document store for ``collection_name``,
    then indexes child chunks into PGVector while storing the parent
    chunks in a local key/value docstore.

    Args:
        directory_path: Folder containing the source documents.
        embeddings: Embedding function used by the PGVector store.
        collection_name: Name of the PGVector collection (also used to
            derive the on-disk docstore path).
        connection: PGVector connection string.
        llm: Passed through to the preprocessing step.

    Returns:
        A ``ParentDocumentRetriever`` with all documents already added.
    """
    # read_preprocess_and_chunk_data returns LangChain Documents
    # (processed with unstructured-io) — see its definition elsewhere.
    documents = read_preprocess_and_chunk_data(directory_path=directory_path, llm=llm)
    parent_retriever_file_store = f"vectorstore/parent_document_retriever/parent_doc_store_{collection_name}"
    # Start from an empty docstore directory so stale parent documents
    # from a previous build cannot leak into the new collection.
    # shutil.rmtree handles files, symlinks and subdirectories in one call.
    if os.path.exists(parent_retriever_file_store):
        print(f"Clearing existing files in {parent_retriever_file_store}")
        shutil.rmtree(parent_retriever_file_store)
    # Recreate the (now empty) directory, as the original code kept it.
    os.makedirs(parent_retriever_file_store, exist_ok=True)
    fs = LocalFileStore(parent_retriever_file_store)
    store = create_kv_docstore(fs)
    # Parent chunks are what we return to the caller; child chunks are
    # what actually gets embedded and searched.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=cfg.PARENT_CHUNK_SIZE)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=cfg.CHILD_CHUNK_SIZE)
    vectorstore = PGVector(
        embedding_function=embeddings,
        collection_name=collection_name,
        connection_string=connection,
        use_jsonb=True,
        # Drop any existing vectors for this collection before indexing.
        pre_delete_collection=True,
    )
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    retriever.add_documents(documents=documents)
    return retriever
# This is how I can load the created and saved ParentDocumentRetriever
def rebuild_parent_document_retriever(embeddings, COLLECTION_NAME, CONNECTION_STRING):
"""Recreate Retriever Object to be reused."""
# only do what's needed to recreate the retriever
# no need to actually load or split docs
parent_retriever_file_store = f"vectorstore/parent_document_retriever/parent_doc_store_{COLLECTION_NAME}"
vectorstore = load_PG_vectorstore(embeddings, COLLECTION_NAME, CONNECTION_STRING)
fs = LocalFileStore(parent_retriever_file_store)
store = create_kv_docstore(fs)
big_chunks_retriever = MultiVectorRetriever(
vectorstore=vectorstore,
docstore=store,
)
big_chunks_retriever.search_kwargs['k'] = cfg.VECTOR_COUNT
return big_chunks_retriever DescriptionHi, I want to combine ParentDocument-Retrieval with Reranking (e.g. ColBERT). But I am not sure how I can implement this. This would be how the chunking works:
So how can I use this reranking method after retrieving the child chunks and before resolving them to their parent chunks? So again: child documents should be searched and reranked, and the reranked child documents should then be used to fetch their parent documents.

System Info

conda list langchain — packages in environment at /Users/mweissenba001/anaconda3:
Name Version Build Channel
langchain 0.1.17 pypi_0 pypi |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 4 replies
-
To implement reranking of child chunks before referencing their parent chunks, you can follow these steps:
This approach ensures that you first rerank the child chunks and then organize them by their parent chunks, avoiding the issue of exceeding the token limit of your reranking model.
|
Beta Was this translation helpful? Give feedback.
-
You can implement your own version of ParentDocumentRetriever:
from typing import List
from langchain.retrievers import ParentDocumentRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain.retrievers.multi_vector import SearchType
class ParentDocumentReranker(ParentDocumentRetriever):
def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
) -> List[Document]:
"""Get documents relevant to a query.
Args:
query: String to find relevant documents for
run_manager: The callbacks handler to use
Returns:
List of relevant documents
"""
if self.search_type == SearchType.mmr:
sub_docs = self.vectorstore.max_marginal_relevance_search(
query, **self.search_kwargs
)
else:
sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
# Reranking logic here with sub_docs (child documents)
...
# We do this to maintain the order of the ids that are returned
ids = []
for d in sub_docs:
if self.id_key in d.metadata and d.metadata[self.id_key] not in ids:
ids.append(d.metadata[self.id_key])
docs = self.docstore.mget(ids)
return [d for d in docs if d is not None] In this code I commented the section where you need to create your reranking logic using sub documents. Let me know if you have any questions ! |
Beta Was this translation helpful? Give feedback.
Hi, thanks for your suggestion! I was able to resolve it by modifying the MultiVectorRetriever (multi_vector.py). I added the function "get_matching_reranked_docs", which returns the reranked results in the right order as LangChain Documents with a "score" key in the metadata. In "_get_relevant_documents" I implemented the reranking changes.
Here is the full code that works for me: