In [None]:
from qdrant_client import QdrantClient, models
import requests
import uuid
import json
import tiktoken
import hashlib


# === Configuration ===
# Service endpoints (both point at the local machine).
QDRANT_URL: str = "http://localhost:6333"
EMBEDDING_SVC_URL: str = "http://127.0.0.1:8000/embed"

# Name of the Qdrant collection the points are upserted into.
COLLECTION_NAME: str = "multihop_rag_sample"
# Not referenced by the visible cells. Presumably the dimensionality of the
# embedding vectors / collection — TODO confirm against the embedding service.
VECTOR_SIZE: int = 384

# Chunking parameters, expressed in tokenizer tokens.
MAX_TOKENS: int = 256
CHUNK_OVERLAP: int = 32

# Model name used to look up the tiktoken encoding.
LLM: str = "gpt-4o"


# === UTILS ===
def chunk_text_by_tokens(
    tokenizer: "tiktoken.core.Encoding", text: str, max_tokens: int = 256
):
    """
    Split ``text`` into consecutive, non-overlapping chunks of at most
    ``max_tokens`` tokens each.

    Args:
        tokenizer: Object providing ``encode(str) -> tokens`` and
            ``decode(tokens) -> str``; only these two methods are used.
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of decoded chunk strings in document order. Empty when the
        text encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive. Without this check a
            negative value would silently produce no chunks at all.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")

    tokens = tokenizer.encode(text)
    # NOTE(review): windows are cut on token boundaries, which may fall inside
    # a multi-byte character — confirm how the tokenizer's decode handles that.
    return [
        tokenizer.decode(tokens[start : start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def chunk_text_with_overlap(
    tokenizer: "tiktoken.core.Encoding",
    text: str,
    max_tokens=256,
    overlap=32
):
    """
    Split ``text`` into token windows of at most ``max_tokens`` tokens, where
    each window starts ``max_tokens - overlap`` tokens after the previous one
    (so consecutive windows share ``overlap`` tokens).

    Args:
        tokenizer: Object providing ``encode(str) -> tokens`` and
            ``decode(tokens) -> str``; only these two methods are used.
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens``.

    Returns:
        A list of decoded chunk strings in document order. Empty when the
        text encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")
    # overlap >= max_tokens would give a zero or negative stride, i.e. a loop
    # that never reaches the end; a negative overlap would skip tokens.
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    tokens = tokenizer.encode(text)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        end = start + max_tokens
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window already reaches the last token. Another window
            # would only repeat a suffix of it, so stop here.
            break
    return chunks


def generate_chunk_id(url: str, chunk_text: str) -> str:
    """
    Derive a stable, UUID-formatted point ID from a URL and a chunk of text.

    The URL and chunk text are concatenated (no separator), UTF-8 encoded and
    hashed with SHA-1; the first 16 bytes of the digest are rendered as a
    UUID string. The same (url, chunk_text) pair therefore always maps to the
    same ID. No UUID version/variant bits are set on the result.
    """
    key = url + chunk_text
    hex_digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
    # 32 hex characters == the first 16 bytes of the digest.
    return str(uuid.UUID(hex=hex_digest[:32]))


# Encoding that tiktoken associates with the configured LLM name. The
# ingestion cell passes it to the chunker and uses it to count chunk tokens.
# NOTE(review): this is the LLM's tokenizer, not necessarily the embedding
# model's — confirm that MAX_TOKENS-sized chunks fit the embedding model.
tokenizer = tiktoken.encoding_for_model(LLM)
# Client for the Qdrant instance at QDRANT_URL.
qdrant_client = QdrantClient(url=QDRANT_URL)

In [2]:
# 1. Get Corpus Data
# Later cells read each article's "body" and "url" keys and copy the remaining
# keys into the point payload. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open("./multihop_rag/corpus_sample.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

In [26]:
# (connect, read) timeout in seconds for the embedding service, so a stalled
# service fails the cell instead of blocking the kernel indefinitely.
EMBEDDING_TIMEOUT_S = (5, 120)

# Rebuilt from scratch on every run, so re-executing the cell is idempotent.
points_to_insert = []

for article in articles:
    body = article.get("body", "")
    if not body:
        # Articles without text have nothing to embed.
        continue

    chunks = chunk_text_with_overlap(
        tokenizer=tokenizer, text=body, max_tokens=MAX_TOKENS, overlap=CHUNK_OVERLAP
    )

    # One request per article: all of its chunks are embedded together.
    resp = requests.post(
        EMBEDDING_SVC_URL, json={"texts": chunks}, timeout=EMBEDDING_TIMEOUT_S
    )
    resp.raise_for_status()
    embeddings = resp.json()["embeddings"]  # list of vectors

    # zip() below would silently drop chunks on a count mismatch; fail loudly.
    if len(embeddings) != len(chunks):
        raise ValueError(
            f"Embedding service returned {len(embeddings)} vectors "
            f"for {len(chunks)} chunks (url={article.get('url')!r})"
        )

    # Article-level metadata (everything except the full body) is the same
    # for every chunk, so build it once per article.
    article_metadata = {k: v for k, v in article.items() if k != "body"}

    for chunk_idx, (chunk_text, emb) in enumerate(zip(chunks, embeddings)):
        payload_doc = {
            **article_metadata,
            "chunk_id": chunk_idx,  # position of the chunk within its article
            "num_tokens": len(tokenizer.encode(chunk_text)),
            "body_chunk": chunk_text,
        }

        # The ID is derived from URL + chunk text, so identical content maps
        # to the same point ID across runs.
        points_to_insert.append(
            models.PointStruct(
                id=generate_chunk_id(article["url"], chunk_text),
                vector=emb,
                payload=payload_doc,
            )
        )

In [28]:
# Send all prepared points in a single upsert request. The call is the cell's
# last expression, so its UpdateResult is displayed as the cell output.
# NOTE(review): the collection is not created in the visible cells; it
# presumably already exists on the Qdrant instance — confirm.
qdrant_client.upsert(
    collection_name=COLLECTION_NAME,
    points=points_to_insert,
)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)