## Setup

In [None]:
%pip install loguru
%pip install qdrant-client

#### Libraries

In [None]:
import asyncio
import json
from collections import defaultdict
from pathlib import Path
from statistics import mean
from time import perf_counter
from typing import Any
from uuid import uuid4

import numpy as np
from datasets import Dataset, load_dataset

from loguru import logger
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from sklearn.cluster import KMeans
from tiktoken import Encoding, encoding_for_model, get_encoding


from qdrant_client import AsyncQdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## 1. Create Qdrant collection and retriever

#### Initialize Clients

In [14]:
# Qdrant Client
path: Path = Path("qdrant_client")
qdrant_client: AsyncQdrantClient = AsyncQdrantClient(path=path)

RuntimeError: Storage folder qdrant_client is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.

In [3]:
# OpenAI Client
openai_client: AsyncOpenAI = AsyncOpenAI()

#### Create collection

In [4]:
# Embeddings specs
embedding_model: str = "text-embedding-3-small"
dimension: int = 1536
collection_name: str = "speculative_rag"

In [5]:
# Get existing collections
current_collections: models.CollectionsResponse = await qdrant_client.get_collections()

# Create collection
if collection_name not in [col.name for col in current_collections.collections]:
    logger.info("Collection {col} doesn't exist. Creating...", col=collection_name)
    await qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=dimension, distance=models.Distance.DOT
        ),
    )
    logger.info("Collection {col} created!", col=collection_name)
else:
    logger.info(
        "Collection {col} already exists, skipping creation.", col=collection_name
    )

[32m2025-09-11 18:21:57.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mCollection speculative_rag already exists, skipping creation.[0m


#### Load dataset

In [6]:
# Load dataset
dataset: Dataset = load_dataset(
    path="jamescalam/ai-arxiv2-semantic-chunks", split="train"
)
print(json.dumps(dataset[0], indent=4))

{
    "id": "2401.04088#0",
    "title": "Mixtral of Experts",
    "content": "4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00c3\u00a9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00c3\u00a9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00c3\u00a9e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Eve

In [7]:
# Using only 50k rows
rows_to_keep: int = 50_000

# Easier to handle as pandas df
records: list[dict[str, Any]] = (
    dataset.to_pandas().iloc[:rows_to_keep].to_dict(orient="records")
)

In [8]:
records[0]

{'id': '2401.04088#0',
 'title': 'Mixtral of Experts',
 'content': '4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, LÃ©lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, ThÃ©ophile Gervet, Thibaut Lavril, Thomas Wang, TimothÃ©e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Even though each token only sees two experts

#### Upload information to Qdrant (run only once!)

In [9]:
# Auxiliar functions to prepare the Points
async def create_point(
    client: AsyncOpenAI,
    example: dict[str, Any],
    model: str,
    encoding_name: str,
    max_context_len: int,
) -> models.PointStruct:
    """Creates a Point that contains the payload and the vector."""

    encoding: Encoding = get_encoding(encoding_name=encoding_name)

    embedding_result: Any = await client.embeddings.create(
        input=encoding.encode(text=example.get("content"), disallowed_special=())[
            :max_context_len
        ],
        model=model,
    )
    vector: list[float] = embedding_result.data[0].embedding

    return models.PointStruct(
        id=str(uuid4()),
        vector=vector,
        payload=dict(
            chunk_id=example.get("id"),
            arxiv_id=example.get("arxiv_id"),
            title=example.get("title"),
            content=example.get("content"),
            prechunk_id=example.get("prechunk_id"),
            postchunk_id=example.get("postchunk_id"),
            references=example.get("references").tolist(),
        ),
    )


async def process_batch(
    client: AsyncOpenAI,
    batch: list[dict[str, Any]],
    model: str,
    encoding_name: str,
    max_context_len: int,
) -> list[models.PointStruct]:
    """Processes a batch of examples to create PointStructs."""
    return await asyncio.gather(
        *[
            create_point(
                client=client,
                example=example,
                model=model,
                encoding_name=encoding_name,
                max_context_len=max_context_len,
            )
            for example in batch
        ]
    )


In [10]:
# batch_size: int = 512
# max_context_len: int = 8192
# encoding_name: str = "cl100k_base"
# total_batches: int = len(records) // batch_size
# all_points: list[models.PointStruct | None] = []

# _now: float = perf_counter()
# for i in tqdm(range(0, len(records), batch_size), total=total_batches, desc="Points"):
#     batch: list[dict[str, Any]] = records[i : i + batch_size]
#     points: list[models.PointStruct] = await process_batch(
#         client=openai_client,
#         batch=batch,
#         model=embedding_model,
#         encoding_name=encoding_name,
#         max_context_len=max_context_len,
#     )
#     all_points.extend(points)
# logger.info("Generated all Points in {secs:.4f} seconds.", secs=perf_counter() - _now)

In [11]:
# Upsert Points
# await qdrant_client.upsert(collection_name=collection_name, points=points)

#### testing vector search

In [12]:
query: str = "Mixture of Experts"
query_vector: Any = await openai_client.embeddings.create(
    input=query, model=embedding_model
)
query_vector: list[float] = query_vector.data[0].embedding
out: list[models.ScoredPoint] = await qdrant_client.search(
    collection_name=collection_name, query_vector=query_vector, with_vectors=True
)

  out: list[models.ScoredPoint] = await qdrant_client.search(


In [13]:
print(f"Id: {out[0].id}")
print(f"Score: {out[0].score:.3}")
print(f"Title: {out[0].payload.get('title')} [{out[0].payload.get('arxiv_id')}]")
print(f"Chunk: {out[0].payload.get('content')[:1000]} ...")
print(f"Vector: {out[0].vector[:5]} ... ")

IndexError: list index out of range

## 2. Speculative RAG

#### Multi-Perspective Sampling

In [None]:
def multi_perspective_sampling(
    k: int, retrieved_points: list[models.ScoredPoint], seed: int = 1399
) -> list[list[str]]:
    # Generate clusters
    logger.info("Finding {k} clusters.", k=k)
    algo: Any = KMeans(n_clusters=k, random_state=seed)
    _vectors = [point.vector for point in retrieved_points]
    clusters: list[int] = algo.fit_predict(X=_vectors)

    # Unique clusters
    unique_clusters: set[int] = set(clusters)

    # Create a dictionary with the members of each cluster
    cluster_dict: defaultdict[int, list[int | None]] = defaultdict(list)
    for index, cluster in enumerate(clusters):
        cluster_dict[cluster].append(index)
    logger.info("Clusters distribution: {dist}", dist=dict(cluster_dict))

    # M subsets
    m: int = min(len(indices) for indices in cluster_dict.values())
    logger.info("{m} document subsets will be created.", m=m)

    # Generate m unique subsets without replacement
    np.random.seed(seed=seed)
    subsets: list[list[str]] = []

    for _ in range(m):
        subset: list[int] = []
        for cluster in unique_clusters:
            chosen_element: int = np.random.choice(cluster_dict[cluster])
            subset.append(chosen_element)
            cluster_dict[cluster].remove(chosen_element)
        subset_documents = [
            retrieved_points[idx].payload.get("content") for idx in subset
        ]
        subsets.append(subset_documents)

    return subsets

In [None]:
# Testing
k: int = 2
seed: int = 1399
now: float = perf_counter()
sampled_docs: list[list[str]] = multi_perspective_sampling(
    k=k, retrieved_points=out, seed=seed
)
logger.info(
    "Multi perspective sampling done in {s:.4f} seconds.", s=perf_counter() - now
)

[32m2024-08-27 17:31:55.087[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m5[0m - [1mFinding 2 clusters.[0m
[32m2024-08-27 17:31:55.118[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m17[0m - [1mClusters distribution: {1: [0, 2, 5, 6, 7], 0: [1, 3, 4, 8, 9]}[0m
[32m2024-08-27 17:31:55.119[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m21[0m - [1m5 document subsets will be created.[0m
[32m2024-08-27 17:31:55.120[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mMulti perspective sampling done in 0.0327 seconds.[0m


In [None]:
sampled_docs

[['Zero-infinity: Breaking the gpu memory wall for extreme scale deep learning. In Proceedings of the Inter- national Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1â 14, 2021. Merity, S., Xiong, C., Bradbury, J., and Socher, R. arXiv preprint Pointer sentinel mixture models. arXiv:1609.07843, 2016. Pagecachemangagement. //code.google.com/archive/p/ pagecache-mangagement/source/default/ source, 2008. Narayan, A., Chami, I., Orr, L., and RÂ´e, C. Can foun- arXiv preprint dation models wrangle your data? arXiv:2205.09911, 2022. Ren, J., Rajbhandari, S., Aminabadi, R. Y., Ruwase, O., Yang, S., Zhang, M., Li, D., and He, Y.',
  'naturalâ choice for the intervention g. Specifically, for each layer â , we intervene on the subspace spanned by â â s top 10 causal basis vectorsâ weâ ll call this the â principal subspaceâ â us- ing a recently proposed method called resampling ablation (Chan et al., 2022). # 5. Applications # 5.1. Extending Overthinking the Trut

#### Rag Drafting

In [None]:
rag_drafting_prompt: str = """Response to the instruction. Also provide rationale for your response.
## Instruction: {instruction}

## Evidence: {evidence}"""


class RagDraftingResponse(BaseModel):
    rationale: str = Field(description="Response rationale.")
    response: str = Field(description="Response to the instruction.")


async def rag_drafting_generator(
    client: AsyncOpenAI,
    model_name: str,
    instruction: str,
    evidence: str,
    **kwargs,
) -> tuple[RagDraftingResponse, float]:
    completion: Any = await client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": rag_drafting_prompt.format(
                    instruction=instruction, evidence=evidence
                ),
            }
        ],
        response_format=RagDraftingResponse,
        temperature=0.0,
        logprobs=True,
        max_tokens=512,
        **kwargs,
    )
    return (
        completion.choices[0].message.parsed,
        np.exp(mean(token.logprob for token in completion.choices[0].logprobs.content)),
    )

In [None]:
# Testing
m_drafter: str = "gpt-4o-mini-2024-07-18"
instruction: str = "What is MoE?"

now: float = perf_counter()
rag_drafts: list[tuple[RagDraftingResponse, float]] = await asyncio.gather(
    *[
        rag_drafting_generator(
            client=openai_client,
            model_name=m_drafter,
            instruction=instruction,
            evidence="\n".join(
                [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
            ),
        )
        for subset in sampled_docs
    ]
)
logger.info("RAG Drafting done in {s:.4f} seconds.", s=perf_counter() - now)
rag_drafts

[32m2024-08-27 17:31:59.481[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mRAG Drafting done in 4.3440 seconds.[0m


[(RagDraftingResponse(rationale='Mixture of Experts (MoE) is a machine learning architecture that utilizes multiple expert models to improve performance on specific tasks. It allows for dynamic selection of experts based on the input, which can lead to more efficient use of resources and better performance in large-scale models. The evidence provided discusses various aspects of deep learning and model performance, which are relevant to understanding the context in which MoE operates, particularly in high-performance computing and large language models.', response='MoE, or Mixture of Experts, is a machine learning architecture that employs multiple expert models to enhance performance on specific tasks. In this framework, only a subset of experts is activated for each input, allowing for efficient computation and improved model performance, especially in large-scale deep learning applications.'),
  0.6259698619426202),
 (RagDraftingResponse(rationale='MoE, or Mixture of Experts, is a m

#### Generalist RAG Verifier

In [None]:
rag_verifier_prompt: str = """## Instruction: {instruction}

## Response: {response} 

## Rationale: {rationale}

Is the rationale good enough to support the answer? (Yes or No)"""


async def rag_verifier_generator(
    client: AsyncOpenAI,
    model_name: str,
    instruction: str,
    evidence: str,
    response: str,
    rationale: str,
    **kwargs,
) -> tuple[Any, float]:
    encoder: Encoding = encoding_for_model(model_name=model_name)
    completion: Any = await client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": rag_verifier_prompt.format(
                    instruction=instruction,
                    evidence=evidence,
                    response=response,
                    rationale=rationale,
                ),
            }
        ],
        temperature=0.0,
        logprobs=True,
        max_tokens=2,
        **kwargs,
    )
    response: str = completion.choices[0].message.content
    cond: bool = encoder.encode(text=response.lower()) == encoder.encode(text="yes")
    p_yes: float = (
        np.exp(mean(token.logprob for token in completion.choices[0].logprobs.content))
        if cond
        else 0.0
    )  # Naive

    return (response, p_yes)

In [None]:
# Testing
m_verifier: str = "gpt-4o-2024-08-06"
instruction: str = "What is MoE?"

now: float = perf_counter()
rag_verifications: list[tuple[str, float]] = await asyncio.gather(
    *[
        rag_verifier_generator(
            client=openai_client,
            model_name=m_verifier,
            instruction=instruction,
            evidence="\n".join(
                [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
            ),
            response=rag_drafting_response.response,
            rationale=rag_drafting_response.rationale,
        )
        for subset, (rag_drafting_response, _) in zip(sampled_docs, rag_drafts)
    ]
)
logger.info("RAG Drafting done in {s:.4f} seconds.", s=perf_counter() - now)
rag_verifications

[32m2024-08-27 17:32:00.469[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mRAG Drafting done in 0.9678 seconds.[0m


[('Yes', 0.9999925349918634),
 ('Yes', 0.99999861435166),
 ('Yes', 0.9999989719621285),
 ('Yes', 0.9999989719621285),
 ('Yes', 0.9999965878943213)]

#### Final Response

In [None]:
best_answer: int = np.argmax(
    p_draft * p_self for (_, p_draft), (_, p_self) in zip(rag_drafts, rag_verifications)
)
print(f"Response:\n ------ \n{rag_drafts[best_answer][0].response}")

Response:
 ------ 
MoE, or Mixture of Experts, is a machine learning architecture that employs multiple expert models to enhance performance on specific tasks. In this framework, only a subset of experts is activated for each input, allowing for efficient computation and improved model performance, especially in large-scale deep learning applications.


## 3. "end-to-end" Code

#### Speculative Rag

In [None]:
async def speculative_rag(
    query: str,
    embedding_model: str,
    collection_name: str,
    k: int,
    seed: int,
    client: AsyncOpenAI,
    qdrant_client: AsyncQdrantClient,
    m_drafter: str,
    m_verifier: str,
) -> str:
    _start = perf_counter()

    # Generate query vector embedding
    logger.info("Generating query vector...")
    _now: float = perf_counter()
    query_vector: Any = await client.embeddings.create(
        input=query, model=embedding_model
    )
    query_vector: list[float] = query_vector.data[0].embedding
    logger.info("Query vector generated in {s:.4f} seconds.", s=perf_counter() - _now)

    # Fetching relevant documents
    logger.info("Fetching relevant documents...")
    _now: float = perf_counter()
    out: list[models.ScoredPoint] = await qdrant_client.search(
        collection_name=collection_name, query_vector=query_vector, with_vectors=True
    )
    logger.info("Documents retrieved in {s:.4f} seconds.", s=perf_counter() - _now)

    # Multi Perspective Sampling
    logger.info("Doing Multi Perspective Sampling...")
    _now: float = perf_counter()
    sampled_docs: list[list[str]] = multi_perspective_sampling(
        k=k, retrieved_points=out, seed=seed
    )
    logger.info(
        "Multi Perspective Sampling done in {s:.4f} seconds.", s=perf_counter() - _now
    )

    # RAG Drafting
    logger.info("Doing RAG Drafting...")
    _now: float = perf_counter()
    rag_drafts: list[tuple[RagDraftingResponse, float]] = await asyncio.gather(
        *[
            rag_drafting_generator(
                client=client,
                model_name=m_drafter,
                instruction=query,
                evidence="\n".join(
                    [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
                ),
            )
            for subset in sampled_docs
        ]
    )
    logger.info("RAG Drafting done in {s:.4f} seconds.", s=perf_counter() - _now)

    # RAG Verifier
    logger.info("Doing RAG Verification...")
    _now: float = perf_counter()
    rag_verifications: list[tuple[str, float]] = await asyncio.gather(
        *[
            rag_verifier_generator(
                client=client,
                model_name=m_verifier,
                instruction=query,
                evidence="\n".join(
                    [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
                ),
                response=rag_drafting_response.response,
                rationale=rag_drafting_response.rationale,
            )
            for subset, (rag_drafting_response, _) in zip(sampled_docs, rag_drafts)
        ]
    )
    logger.info("RAG Verification done in {s:.4f} seconds.", s=perf_counter() - _now)

    best_answer: int = np.argmax(
        p_draft * p_self
        for (_, p_draft), (_, p_self) in zip(rag_drafts, rag_verifications)
    )
    logger.info("Entire process done in {s:.4f} seconds.", s=perf_counter() - _start)
    print(f"\nQuestion:\n ------ \n{query}\n\n")
    print(f"Response:\n ------ \n{rag_drafts[best_answer][0].response}")
    return rag_drafts[best_answer][0].response

In [None]:
final_answer: str = await speculative_rag(
    query="What is Query2doc?",
    embedding_model=embedding_model,
    collection_name=collection_name,
    k=k,
    seed=seed,
    client=openai_client,
    qdrant_client=qdrant_client,
    m_drafter=m_drafter,
    m_verifier=m_verifier,
)

[32m2024-08-27 17:32:00.489[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m15[0m - [1mGenerating query vector...[0m
[32m2024-08-27 17:32:00.828[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m21[0m - [1mQuery vector generated in 0.3382 seconds.[0m
[32m2024-08-27 17:32:00.829[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m24[0m - [1mFetching relevant documents...[0m
[32m2024-08-27 17:32:00.833[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m29[0m - [1mDocuments retrieved in 0.0037 seconds.[0m
[32m2024-08-27 17:32:00.835[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m32[0m - [1mDoing Multi Perspective Sampling...[0m
[32m2024-08-27 17:32:00.836[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m5[0m - [1mFinding 2 clusters.[0m
[32m2024-08-27 17:32:00.853[0m | [1mINFO    [0m | [36m__main__[0m:[36mmult


Question:
 ------ 
What is Query2doc?


Response:
 ------ 
Query2doc is a method designed to enhance information retrieval by leveraging large language models (LLMs) for query expansion. It operates by prompting LLMs with few-shot examples to generate pseudo-documents that are then integrated with existing sparse or dense retrieval systems. The goal is to augment the original queries with these generated documents, thereby improving the retrieval performance. Empirical evaluations have shown that Query2doc consistently leads to improvements across various retrieval models and datasets, despite some limitations regarding efficiency and latency due to the nature of LLM inference.


#### Base RAG

In [None]:
async def base_rag(
    query: str,
    embedding_model: str,
    collection_name: str,
    client: AsyncOpenAI,
    qdrant_client: AsyncQdrantClient,
    generation_model: str,
) -> str:
    _start = perf_counter()

    # Generate query vector embedding
    logger.info("Generating query vector...")
    _now: float = perf_counter()
    query_vector: Any = await client.embeddings.create(
        input=query, model=embedding_model
    )
    query_vector: list[float] = query_vector.data[0].embedding
    logger.info("Query vector generated in {s:.4f} seconds.", s=perf_counter() - _now)

    # Fetching relevant documents
    logger.info("Fetching relevant documents...")
    _now: float = perf_counter()
    out: list[models.ScoredPoint] = await qdrant_client.search(
        collection_name=collection_name, query_vector=query_vector, with_vectors=True
    )
    logger.info("Documents retrieved in {s:.4f} seconds.", s=perf_counter() - _now)

    # Base RAG
    logger.info("Generating response...")
    prompt: str = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Evidence: {evidence} 

    ### Instruction: {instruction}

    ### Response:"""

    completion: Any = await client.chat.completions.create(
        model=generation_model,
        messages=[
            {
                "role": "system",
                "content": prompt.format(
                    instruction=query,
                    evidence="\n".join(
                        [
                            f"[{idx}] {point.payload.get('content')}"
                            for idx, point in enumerate(out, start=1)
                        ]
                    ),
                ),
            }
        ],
        temperature=0.0,
        logprobs=True,
    )
    response: str = completion.choices[0].message.content
    logger.info("Response generated in {s:.4f} seconds.", s=perf_counter() - _now)

    logger.info("Entire process done in {s:.4f} seconds.", s=perf_counter() - _start)
    print(f"\nQuestion:\n ------ \n{query}\n\n")
    print(f"Response:\n ------ \n{response}")
    return response

In [None]:
final_answer: str = await base_rag(
    query="What is Query2doc?",
    embedding_model=embedding_model,
    collection_name=collection_name,
    client=openai_client,
    qdrant_client=qdrant_client,
    generation_model=m_verifier,
)

[32m2024-08-27 17:32:07.345[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m12[0m - [1mGenerating query vector...[0m
[32m2024-08-27 17:32:07.534[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m18[0m - [1mQuery vector generated in 0.1888 seconds.[0m
[32m2024-08-27 17:32:07.534[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m21[0m - [1mFetching relevant documents...[0m
[32m2024-08-27 17:32:07.535[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m26[0m - [1mDocuments retrieved in 0.0007 seconds.[0m
[32m2024-08-27 17:32:07.536[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m29[0m - [1mGenerating response...[0m
[32m2024-08-27 17:32:10.391[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m58[0m - [1mResponse generated in 2.8561 seconds.[0m
[32m2024-08-27 17:32:10.391[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m60[0m - [1mEntire process done 


Question:
 ------ 
What is Query2doc?


Response:
 ------ 
Query2doc is a query expansion approach designed to enhance both sparse and dense retrieval systems. It involves generating pseudo-documents by prompting large language models (LLMs) with few-shot examples. These pseudo-documents are then used to expand the original query, providing additional context and information that can help in disambiguating the query and guiding retrieval systems. The method leverages the knowledge memorization capabilities of LLMs, which are trained on extensive web-scale text corpora. Query2doc has been shown to improve the performance of retrieval models like BM25 and state-of-the-art dense retrievers on various datasets, including MS-MARCO and TREC DL, without requiring model fine-tuning. However, it is noted that the method can be slower due to the need for LLM inference and increased query terms, and it may generate factual errors in some cases.
