## Setup

#### Libraries

In [13]:
    # from google.colab import userdata
    # import os

    # os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [3]:
%pip install loguru

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3


In [4]:
%pip install qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.15.1-py3-none-any.whl (337 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.15.1


In [10]:
import asyncio
import json
from collections import defaultdict
from pathlib import Path
from statistics import mean
from time import perf_counter
from typing import Any
from uuid import uuid4

import numpy as np
from datasets import Dataset, load_dataset
from loguru import logger
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from sklearn.cluster import KMeans
from tiktoken import Encoding, encoding_for_model, get_encoding

from qdrant_client import AsyncQdrantClient, models

#later
import tqdm

## 1. Create Qdrant collection and retriever

#### Initialize Clients

In [11]:
# Qdrant Client
path: Path = Path("qdrant_client")
qdrant_client: AsyncQdrantClient = AsyncQdrantClient(path=path)

In [14]:
# OpenAI Client
openai_client: AsyncOpenAI = AsyncOpenAI()

#### Create collection

In [15]:
# Embeddings specs
embedding_model: str = "text-embedding-3-small"
dimension: int = 1536
collection_name: str = "speculative_rag"

In [16]:
# Get existing collections
current_collections: models.CollectionsResponse = await qdrant_client.get_collections()

# Create collection
if collection_name not in [col.name for col in current_collections.collections]:
    logger.info("Collection {col} doesn't exist. Creating...", col=collection_name)
    await qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=dimension, distance=models.Distance.DOT
        ),
    )
    logger.info("Collection {col} created!", col=collection_name)
else:
    logger.info(
        "Collection {col} already exists, skipping creation.", col=collection_name
    )

[32m2025-09-24 15:45:26.394[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 1>[0m:[36m6[0m - [1mCollection speculative_rag doesn't exist. Creating...[0m
[32m2025-09-24 15:45:26.412[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 1>[0m:[36m13[0m - [1mCollection speculative_rag created![0m


#### Load dataset

In [17]:
# Load dataset
dataset: Dataset = load_dataset(
    path="jamescalam/ai-arxiv2-semantic-chunks", split="train"
)
print(json.dumps(dataset[0], indent=4))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.jsonl:   0%|          | 0.00/253M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/209760 [00:00<?, ? examples/s]

{
    "id": "2401.04088#0",
    "title": "Mixtral of Experts",
    "content": "4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00c3\u00a9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00c3\u00a9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00c3\u00a9e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Eve

In [20]:
# Using only 50k rows
rows_to_keep: int = 2

# Easier to handle as pandas df
records: list[dict[str, Any]] = (
    dataset.to_pandas().iloc[:rows_to_keep].to_dict(orient="records")
)

In [21]:
records[0]

{'id': '2401.04088#0',
 'title': 'Mixtral of Experts',
 'content': '4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, LÃ©lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, ThÃ©ophile Gervet, Thibaut Lavril, Thomas Wang, TimothÃ©e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Even though each token only sees two experts

#### Upload information to Qdrant (run only once!)

In [22]:
# Auxiliar functions to prepare the Points
async def create_point(
    client: AsyncOpenAI,
    example: dict[str, Any],
    model: str,
    encoding_name: str,
    max_context_len: int,
) -> models.PointStruct:
    """Creates a Point that contains the payload and the vector."""

    encoding: Encoding = get_encoding(encoding_name=encoding_name)

    embedding_result: Any = await client.embeddings.create(
        input=encoding.encode(text=example.get("content"), disallowed_special=())[
            :max_context_len
        ],
        model=model,
    )
    vector: list[float] = embedding_result.data[0].embedding

    return models.PointStruct(
        id=str(uuid4()),
        vector=vector,
        payload=dict(
            chunk_id=example.get("id"),
            arxiv_id=example.get("arxiv_id"),
            title=example.get("title"),
            content=example.get("content"),
            prechunk_id=example.get("prechunk_id"),
            postchunk_id=example.get("postchunk_id"),
            references=example.get("references").tolist(),
        ),
    )


async def process_batch(
    client: AsyncOpenAI,
    batch: list[dict[str, Any]],
    model: str,
    encoding_name: str,
    max_context_len: int,
) -> list[models.PointStruct]:
    """Processes a batch of examples to create PointStructs."""
    return await asyncio.gather(
        *[
            create_point(
                client=client,
                example=example,
                model=model,
                encoding_name=encoding_name,
                max_context_len=max_context_len,
            )
            for example in batch
        ]
    )


In [23]:
batch_size: int = 512
max_context_len: int = 8192
encoding_name: str = "cl100k_base"
total_batches: int = len(records) // batch_size
all_points: list[models.PointStruct | None] = []

_now: float = perf_counter()
for i in tqdm.tqdm(range(0, len(records), batch_size), total=total_batches, desc="Points"):
    batch: list[dict[str, Any]] = records[i : i + batch_size]
    points: list[models.PointStruct] = await process_batch(
        client=openai_client,
        batch=batch,
        model=embedding_model,
        encoding_name=encoding_name,
        max_context_len=max_context_len,
    )
    all_points.extend(points)
logger.info("Generated all Points in {secs:.4f} seconds.", secs=perf_counter() - _now)

Points: 1it [00:04,  4.14s/it]
[32m2025-09-24 15:46:03.374[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m18[0m - [1mGenerated all Points in 4.1455 seconds.[0m


In [29]:
# Upsert Points
await qdrant_client.upsert(collection_name=collection_name, points=all_points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

#### testing vector search

In [30]:
query: str = "Mixture of Experts"
query_vector: Any = await openai_client.embeddings.create(
    input=query, model=embedding_model
)
query_vector: list[float] = query_vector.data[0].embedding
out: list[models.ScoredPoint] = await qdrant_client.search(
    collection_name=collection_name, query_vector=query_vector, with_vectors=True
)

  out: list[models.ScoredPoint] = await qdrant_client.search(


In [31]:
print(f"Id: {out[0].id}")
print(f"Score: {out[0].score:.3}")
print(f"Title: {out[0].payload.get('title')} [{out[0].payload.get('arxiv_id')}]")
print(f"Chunk: {out[0].payload.get('content')[:1000]} ...")
print(f"Vector: {out[0].vector[:5]} ... ")

Id: 8c63a784-8e85-4d87-9f4e-8ed93eb2a0e4
Score: 0.556
Title: Mixtral of Experts [2401.04088]
Chunk: Code: https://github.com/mistralai/mistral-src Webpage: https://mistral.ai/news/mixtral-of-experts/ # Introduction In this paper, we present Mixtral 8x7B, a sparse mixture of experts model (SMoE) with open weights, licensed under Apache 2.0. Mixtral outperforms Llama 2 70B and GPT-3.5 on most benchmarks. As it only uses a subset of its parameters for every token, Mixtral allows faster inference speed at low batch-sizes, and higher throughput at large batch-sizes. Mixtral is a sparse mixture-of-experts network. It is a decoder-only model where the feedforward block picks from a set of 8 distinct groups of parameters. At every layer, for every token, a router network chooses two of these groups (the â ...
Vector: [-0.0048737297765910625, 0.006692185997962952, 0.017368929460644722, -0.03131488338112831, -0.007293880917131901] ... 


## 2. Speculative RAG

#### Multi-Perspective Sampling

In [32]:
def multi_perspective_sampling(
    k: int, retrieved_points: list[models.ScoredPoint], seed: int = 1399
) -> list[list[str]]:
    # Generate clusters
    logger.info("Finding {k} clusters.", k=k)
    algo: Any = KMeans(n_clusters=k, random_state=seed)
    _vectors = [point.vector for point in retrieved_points]
    clusters: list[int] = algo.fit_predict(X=_vectors)

    # Unique clusters
    unique_clusters: set[int] = set(clusters)

    # Create a dictionary with the members of each cluster
    cluster_dict: defaultdict[int, list[int | None]] = defaultdict(list)
    for index, cluster in enumerate(clusters):
        cluster_dict[cluster].append(index)
    logger.info("Clusters distribution: {dist}", dist=dict(cluster_dict))

    # M subsets
    m: int = min(len(indices) for indices in cluster_dict.values())
    logger.info("{m} document subsets will be created.", m=m)

    # Generate m unique subsets without replacement
    np.random.seed(seed=seed)
    subsets: list[list[str]] = []

    for _ in range(m):
        subset: list[int] = []
        for cluster in unique_clusters:
            chosen_element: int = np.random.choice(cluster_dict[cluster])
            subset.append(chosen_element)
            cluster_dict[cluster].remove(chosen_element)
        subset_documents = [
            retrieved_points[idx].payload.get("content") for idx in subset
        ]
        subsets.append(subset_documents)

    return subsets

In [33]:
# Testing
k: int = 2
seed: int = 1399
now: float = perf_counter()
sampled_docs: list[list[str]] = multi_perspective_sampling(
    k=k, retrieved_points=out, seed=seed
)
logger.info(
    "Multi perspective sampling done in {s:.4f} seconds.", s=perf_counter() - now
)

[32m2025-09-24 15:49:23.572[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m5[0m - [1mFinding 2 clusters.[0m
[32m2025-09-24 15:49:23.650[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m17[0m - [1mClusters distribution: {np.int32(0): [0], np.int32(1): [1]}[0m
[32m2025-09-24 15:49:23.651[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m21[0m - [1m1 document subsets will be created.[0m
[32m2025-09-24 15:49:23.653[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m8[0m - [1mMulti perspective sampling done in 0.0807 seconds.[0m


In [34]:
sampled_docs

[['Code: https://github.com/mistralai/mistral-src Webpage: https://mistral.ai/news/mixtral-of-experts/ # Introduction In this paper, we present Mixtral 8x7B, a sparse mixture of experts model (SMoE) with open weights, licensed under Apache 2.0. Mixtral outperforms Llama 2 70B and GPT-3.5 on most benchmarks. As it only uses a subset of its parameters for every token, Mixtral allows faster inference speed at low batch-sizes, and higher throughput at large batch-sizes. Mixtral is a sparse mixture-of-experts network. It is a decoder-only model where the feedforward block picks from a set of 8 distinct groups of parameters. At every layer, for every token, a router network chooses two of these groups (the â',
  '4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, G

#### Rag Drafting

In [35]:
rag_drafting_prompt: str = """Response to the instruction. Also provide rationale for your response.
## Instruction: {instruction}

## Evidence: {evidence}"""


class RagDraftingResponse(BaseModel):
    rationale: str = Field(description="Response rationale.")
    response: str = Field(description="Response to the instruction.")


async def rag_drafting_generator(
    client: AsyncOpenAI,
    model_name: str,
    instruction: str,
    evidence: str,
    **kwargs,
) -> tuple[RagDraftingResponse, float]:
    completion: Any = await client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": rag_drafting_prompt.format(
                    instruction=instruction, evidence=evidence
                ),
            }
        ],
        response_format=RagDraftingResponse,
        temperature=0.0,
        logprobs=True,
        max_tokens=512,
        **kwargs,
    )
    return (
        completion.choices[0].message.parsed,
        np.exp(mean(token.logprob for token in completion.choices[0].logprobs.content)),
    )

In [37]:
# Testing
m_drafter: str = "gpt-4o-mini-2024-07-18"
instruction: str = "What is MoE?"

now: float = perf_counter()
rag_drafts: list[tuple[RagDraftingResponse, float]] = await asyncio.gather(
    *[
        rag_drafting_generator(
            client=openai_client,
            model_name=m_drafter,
            instruction=instruction,
            evidence="\n".join(
                [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
            ),
        )
        for subset in sampled_docs
    ]
)
logger.info("RAG Drafting done in {s:.4f} seconds.", s=perf_counter() - now)
rag_drafts

[32m2025-09-24 15:51:17.591[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m19[0m - [1mRAG Drafting done in 6.5073 seconds.[0m


[(RagDraftingResponse(rationale='The response provides a clear definition of MoE (Mixture of Experts) by explaining its structure and functionality, particularly in the context of the Mixtral model. It highlights how MoE operates by using a subset of parameters for each token, which enhances efficiency and performance. This aligns with the instruction to explain what MoE is, while also incorporating relevant details from the provided evidence.', response='MoE, or Mixture of Experts, is a machine learning architecture that utilizes a subset of its parameters for processing each input token. In the context of the Mixtral model, which is a Sparse Mixture of Experts (SMoE), the architecture consists of multiple feedforward blocks (or experts) at each layer. For every token, a router network selects two of these experts to process the input, allowing the model to leverage a larger number of parameters (47 billion) while only activating a smaller subset (13 billion) during inference. This de

#### Generalist RAG Verifier

In [38]:
rag_verifier_prompt: str = """## Instruction: {instruction}

## Response: {response}

## Rationale: {rationale}

Is the rationale good enough to support the answer? (Yes or No)"""


async def rag_verifier_generator(
    client: AsyncOpenAI,
    model_name: str,
    instruction: str,
    evidence: str,
    response: str,
    rationale: str,
    **kwargs,
) -> tuple[Any, float]:
    encoder: Encoding = encoding_for_model(model_name=model_name)
    completion: Any = await client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": rag_verifier_prompt.format(
                    instruction=instruction,
                    evidence=evidence,
                    response=response,
                    rationale=rationale,
                ),
            }
        ],
        temperature=0.0,
        logprobs=True,
        max_tokens=2,
        **kwargs,
    )
    response: str = completion.choices[0].message.content
    cond: bool = encoder.encode(text=response.lower()) == encoder.encode(text="yes")
    p_yes: float = (
        np.exp(mean(token.logprob for token in completion.choices[0].logprobs.content))
        if cond
        else 0.0
    )  # Naive

    return (response, p_yes)

In [39]:
# Testing
m_verifier: str = "gpt-4o-2024-08-06"
instruction: str = "What is MoE?"

now: float = perf_counter()
rag_verifications: list[tuple[str, float]] = await asyncio.gather(
    *[
        rag_verifier_generator(
            client=openai_client,
            model_name=m_verifier,
            instruction=instruction,
            evidence="\n".join(
                [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
            ),
            response=rag_drafting_response.response,
            rationale=rag_drafting_response.rationale,
        )
        for subset, (rag_drafting_response, _) in zip(sampled_docs, rag_drafts)
    ]
)
logger.info("RAG Drafting done in {s:.4f} seconds.", s=perf_counter() - now)
rag_verifications

[32m2025-09-24 15:52:15.689[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m21[0m - [1mRAG Drafting done in 2.7521 seconds.[0m


[('Yes', np.float64(0.9999797803764172))]

#### Final Response

In [40]:
best_answer: int = np.argmax(
    p_draft * p_self for (_, p_draft), (_, p_self) in zip(rag_drafts, rag_verifications)
)
print(f"Response:\n ------ \n{rag_drafts[best_answer][0].response}")

Response:
 ------ 
MoE, or Mixture of Experts, is a machine learning architecture that utilizes a subset of its parameters for processing each input token. In the context of the Mixtral model, which is a Sparse Mixture of Experts (SMoE), the architecture consists of multiple feedforward blocks (or experts) at each layer. For every token, a router network selects two of these experts to process the input, allowing the model to leverage a larger number of parameters (47 billion) while only activating a smaller subset (13 billion) during inference. This design enables faster processing speeds and improved performance on various benchmarks.


## 3. "end-to-end" Code

#### Speculative Rag

In [41]:
async def speculative_rag(
    query: str,
    embedding_model: str,
    collection_name: str,
    k: int,
    seed: int,
    client: AsyncOpenAI,
    qdrant_client: AsyncQdrantClient,
    m_drafter: str,
    m_verifier: str,
) -> str:
    _start = perf_counter()

    # Generate query vector embedding
    logger.info("Generating query vector...")
    _now: float = perf_counter()
    query_vector: Any = await client.embeddings.create(
        input=query, model=embedding_model
    )
    query_vector: list[float] = query_vector.data[0].embedding
    logger.info("Query vector generated in {s:.4f} seconds.", s=perf_counter() - _now)

    # Fetching relevant documents
    logger.info("Fetching relevant documents...")
    _now: float = perf_counter()
    out: list[models.ScoredPoint] = await qdrant_client.search(
        collection_name=collection_name, query_vector=query_vector, with_vectors=True
    )
    logger.info("Documents retrieved in {s:.4f} seconds.", s=perf_counter() - _now)

    # Multi Perspective Sampling
    logger.info("Doing Multi Perspective Sampling...")
    _now: float = perf_counter()
    sampled_docs: list[list[str]] = multi_perspective_sampling(
        k=k, retrieved_points=out, seed=seed
    )
    logger.info(
        "Multi Perspective Sampling done in {s:.4f} seconds.", s=perf_counter() - _now
    )

    # RAG Drafting
    logger.info("Doing RAG Drafting...")
    _now: float = perf_counter()
    rag_drafts: list[tuple[RagDraftingResponse, float]] = await asyncio.gather(
        *[
            rag_drafting_generator(
                client=client,
                model_name=m_drafter,
                instruction=query,
                evidence="\n".join(
                    [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
                ),
            )
            for subset in sampled_docs
        ]
    )
    logger.info("RAG Drafting done in {s:.4f} seconds.", s=perf_counter() - _now)

    # RAG Verifier
    logger.info("Doing RAG Verification...")
    _now: float = perf_counter()
    rag_verifications: list[tuple[str, float]] = await asyncio.gather(
        *[
            rag_verifier_generator(
                client=client,
                model_name=m_verifier,
                instruction=query,
                evidence="\n".join(
                    [f"[{idx}] {doc}" for idx, doc in enumerate(subset, start=1)]
                ),
                response=rag_drafting_response.response,
                rationale=rag_drafting_response.rationale,
            )
            for subset, (rag_drafting_response, _) in zip(sampled_docs, rag_drafts)
        ]
    )
    logger.info("RAG Verification done in {s:.4f} seconds.", s=perf_counter() - _now)

    best_answer: int = np.argmax(
        p_draft * p_self
        for (_, p_draft), (_, p_self) in zip(rag_drafts, rag_verifications)
    )
    logger.info("Entire process done in {s:.4f} seconds.", s=perf_counter() - _start)
    print(f"\nQuestion:\n ------ \n{query}\n\n")
    print(f"Response:\n ------ \n{rag_drafts[best_answer][0].response}")
    return rag_drafts[best_answer][0].response

In [42]:
final_answer: str = await speculative_rag(
    query="What is Query2doc?",
    embedding_model=embedding_model,
    collection_name=collection_name,
    k=k,
    seed=seed,
    client=openai_client,
    qdrant_client=qdrant_client,
    m_drafter=m_drafter,
    m_verifier=m_verifier,
)

[32m2025-09-24 15:54:08.099[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m15[0m - [1mGenerating query vector...[0m
[32m2025-09-24 15:54:08.559[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m21[0m - [1mQuery vector generated in 0.4590 seconds.[0m
[32m2025-09-24 15:54:08.560[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m24[0m - [1mFetching relevant documents...[0m
  out: list[models.ScoredPoint] = await qdrant_client.search(
[32m2025-09-24 15:54:08.564[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m29[0m - [1mDocuments retrieved in 0.0026 seconds.[0m
[32m2025-09-24 15:54:08.565[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeculative_rag[0m:[36m32[0m - [1mDoing Multi Perspective Sampling...[0m
[32m2025-09-24 15:54:08.565[0m | [1mINFO    [0m | [36m__main__[0m:[36mmulti_perspective_sampling[0m:[36m5[0m - [1mFinding 2 clusters.[0m
[32m2025-09-24 15:


Question:
 ------ 
What is Query2doc?


Response:
 ------ 
The provided evidence does not contain any information about Query2doc. It primarily discusses the Mixtral model, which is a Sparse Mixture of Experts language model. If you need information about Query2doc, please provide additional context or sources.


#### Base RAG

In [43]:
async def base_rag(
    query: str,
    embedding_model: str,
    collection_name: str,
    client: AsyncOpenAI,
    qdrant_client: AsyncQdrantClient,
    generation_model: str,
) -> str:
    _start = perf_counter()

    # Generate query vector embedding
    logger.info("Generating query vector...")
    _now: float = perf_counter()
    query_vector: Any = await client.embeddings.create(
        input=query, model=embedding_model
    )
    query_vector: list[float] = query_vector.data[0].embedding
    logger.info("Query vector generated in {s:.4f} seconds.", s=perf_counter() - _now)

    # Fetching relevant documents
    logger.info("Fetching relevant documents...")
    _now: float = perf_counter()
    out: list[models.ScoredPoint] = await qdrant_client.search(
        collection_name=collection_name, query_vector=query_vector, with_vectors=True
    )
    logger.info("Documents retrieved in {s:.4f} seconds.", s=perf_counter() - _now)

    # Base RAG
    logger.info("Generating response...")
    prompt: str = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Evidence: {evidence}

    ### Instruction: {instruction}

    ### Response:"""

    completion: Any = await client.chat.completions.create(
        model=generation_model,
        messages=[
            {
                "role": "system",
                "content": prompt.format(
                    instruction=query,
                    evidence="\n".join(
                        [
                            f"[{idx}] {point.payload.get('content')}"
                            for idx, point in enumerate(out, start=1)
                        ]
                    ),
                ),
            }
        ],
        temperature=0.0,
        logprobs=True,
    )
    response: str = completion.choices[0].message.content
    logger.info("Response generated in {s:.4f} seconds.", s=perf_counter() - _now)

    logger.info("Entire process done in {s:.4f} seconds.", s=perf_counter() - _start)
    print(f"\nQuestion:\n ------ \n{query}\n\n")
    print(f"Response:\n ------ \n{response}")
    return response

In [44]:
final_answer: str = await base_rag(
    query="What is Query2doc?",
    embedding_model=embedding_model,
    collection_name=collection_name,
    client=openai_client,
    qdrant_client=qdrant_client,
    generation_model=m_verifier,
)

[32m2025-09-24 15:54:40.710[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m12[0m - [1mGenerating query vector...[0m
[32m2025-09-24 15:54:41.151[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m18[0m - [1mQuery vector generated in 0.4389 seconds.[0m
[32m2025-09-24 15:54:41.152[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m21[0m - [1mFetching relevant documents...[0m
  out: list[models.ScoredPoint] = await qdrant_client.search(
[32m2025-09-24 15:54:41.155[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m26[0m - [1mDocuments retrieved in 0.0012 seconds.[0m
[32m2025-09-24 15:54:41.156[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m29[0m - [1mGenerating response...[0m
[32m2025-09-24 15:54:46.312[0m | [1mINFO    [0m | [36m__main__[0m:[36mbase_rag[0m:[36m58[0m - [1mResponse generated in 5.1588 seconds.[0m
[32m2025-09-24 15:54:46.313[0m | [1mINFO    [0m | [36m__main_


Question:
 ------ 
What is Query2doc?


Response:
 ------ 
Query2doc is not explicitly mentioned in the provided evidence. However, based on the context of the evidence, Query2doc could potentially refer to a system or method related to processing or understanding queries in the context of language models or machine learning. It might involve converting queries into document-like representations or using a model to generate responses to queries. Without specific information from the evidence, this is a speculative interpretation.
