In [1]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import BM25Retriever
from langchain.schema import Document
import os

# Provide the OpenAI credential through the process environment.
# NOTE(review): 'your code' looks like a placeholder - substitute a real key before running.
os.environ['OPENAI_API_KEY'] = 'your code'

# Build the embedding model, the Chroma store that embeds with it, and the chat model.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(embedding_function=embeddings)
llm = ChatOpenAI(temperature=0)

# Sample corpus: each (text, source label) pair becomes one Document.
documents = [
    Document(page_content=text, metadata={"source": origin})
    for text, origin in (
        ("AI and its impact on society", "tech_journal"),
        ("Climate change mitigation strategies", "science_mag"),
        ("Advancements in quantum computing", "research_paper"),
    )
]

# Index the same corpus twice: once in the vector store for similarity search,
# once in a BM25 retriever for keyword search.
vectorstore.add_documents(documents)
bm25_retriever = BM25Retriever.from_documents(documents)

def rag_fusion(query, k=10, rrf_constant=60):
    """Retrieve documents for several phrasings of ``query`` and fuse the rankings.

    Every query variation is run against both the module-level ``vectorstore``
    (similarity search) and ``bm25_retriever`` (keyword search). Each of those
    calls yields its own ranked list, and the lists are merged with reciprocal
    rank fusion (RRF): a document earns ``1 / (rrf_constant + rank)`` for each
    list it appears in, where ``rank`` is its 1-based position within *that*
    list. Documents are identified by their ``page_content``.

    Args:
        query: The user's question.
        k: Maximum number of results requested from each retriever call and
            maximum number of fused results returned.
        rrf_constant: Smoothing constant added to the rank in the RRF formula.
            Larger values flatten the difference between top and lower ranks.

    Returns:
        Up to ``k`` documents, highest fused score first. Documents with equal
        scores keep the order in which they were first retrieved.
    """
    query_variations = [
        query,
        f"In the context of {query}, what are the key points?",
        f"Explain {query} in simple terms",
    ]

    # Keep every retriever call as a separate ranking: RRF needs the rank a
    # document holds inside each individual list, not its position in one
    # long concatenation of all results.
    ranked_lists = []
    for q in query_variations:
        ranked_lists.append(vectorstore.similarity_search(q, k=k))
        ranked_lists.append(bm25_retriever.get_relevant_documents(q)[:k])

    first_seen = {}  # page_content -> first Document retrieved with that content
    rrf_scores = {}  # page_content -> accumulated RRF score
    for ranked in ranked_lists:
        credited = set()
        for rank, doc in enumerate(ranked, start=1):
            content = doc.page_content
            first_seen.setdefault(content, doc)
            # A ranking credits a given document once, at its best position,
            # even if the underlying store returned duplicate content.
            if content in credited:
                continue
            credited.add(content)
            rrf_scores[content] = rrf_scores.get(content, 0.0) + 1.0 / (rrf_constant + rank)

    # sorted() is stable and dicts preserve insertion order, so ties fall back
    # to first-retrieved order.
    ordered = sorted(first_seen, key=rrf_scores.__getitem__, reverse=True)
    return [first_seen[content] for content in ordered[:k]]

# Run the fused retrieval for one sample question.
query = "What are the latest technological advancements?"
fusion_results = rag_fusion(query)

# Report each hit with its source label, leaving a blank line between entries.
for doc in fusion_results:
    print(f"Retrieved document: {doc.page_content}")
    print(f"Source: {doc.metadata['source']}", end="\n\n")

  warn_deprecated(
  warn_deprecated(
Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3
  warn_deprecated(
Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3
Number of requested results 10 is greater than number of elements in index 3, updating n_results = 3


Retrieved document: Advancements in quantum computing
Source: research_paper

Retrieved document: AI and its impact on society
Source: tech_journal

Retrieved document: Climate change mitigation strategies
Source: science_mag

