In [7]:
SEMANTIC_CHUNKING_PROMPT = """
# Role and Objective
You are a **Document Chunking Agent**.  
Your mission is to split the provided specbook content into a sequence of **independent chunks**.  
Each chunk must be self-contained: it can be read and understood without relying on any other chunk from the same document.

# Instructions
## Chunking Rules
1. **Preserve wording** – copy the original text *exactly, word-for-word* inside each chunk.  
2. **Semantic independence** – cut at natural boundaries (sections, headings, paragraphs, topic shifts) so that every chunk stands on its own and does not reference, rely on, or require context from any other chunk.  
3. **Granularity** – aim for coherent units (≈1–3 paragraphs, or 150–400 words) unless the document’s structure suggests another logical size.  
4. **No overlap** – a sentence must appear in one chunk only. Do not duplicate text across chunks.  
5. **Order** – maintain the original order of appearance throughout the chunk list.

## Metadata Requirements
For every chunk you create, add:
- **chunk_id** – integer starting at 1, in original order.  
- **title** – the most representative heading/descriptor for that chunk (use an exact heading if present; otherwise craft a concise, descriptive title).  
- **tags** – a list of 3-8 lowercase keywords or short phrases that capture the main concepts, entities, or topics in that chunk.  
- **content** – the exact excerpt from the document (no alterations, no added explanations).

## Reasoning Steps
1. Scan the entire document to identify natural breakpoints (headings, substantial topic changes, visual section cues).  
2. Decide the cut positions so that each resulting chunk is semantically self-sufficient.  
3. For each chunk:
   - Extract the text verbatim.  
   - Derive a clear, specific **title**.  
   - Select meaningful **tags** that help users quickly grasp the chunk’s focus.  
4. Assemble the final structured output strictly following the format below.
"""

In [11]:
from __future__ import annotations

import asyncio
from typing import Dict, List, Tuple

import numpy as np
import pickle
from tqdm.asyncio import tqdm_asyncio
from tqdm import tqdm

from pydantic import BaseModel
from spec.cache import cache
from spec.config import async_client
from spec.utils.vector_store import VectorStore, Chunk
from spec.utils.llm import LLM

class Specbook(BaseModel):
    specbook_number: str
    content: str

class SpecbookChunk(BaseModel):
    content: str
    tag: list[str]

class SpecbookChunks(BaseModel):
    chunks: List[SpecbookChunk]

In [None]:
async def semantic_chunking(specbooks: list[Specbook], model: str) -> dict[str, list[SpecbookChunk]]:
    sem = asyncio.Semaphore(200)

    async def _one(sb: Specbook) -> tuple[str, list[SpecbookChunk]]:
        async with sem:
            messages = [
                {"role": "system", "content": SEMANTIC_CHUNKING_PROMPT},
                {"role": "user",   "content": sb.content},
            ]
            completion = await async_client.beta.chat.completions.parse(
                model=model,
                messages=messages,
                response_format=SpecbookChunks,
            )
            
            chunks = completion.choices[0].message.parsed
            return sb.specbook_number, chunks.chunks

    tasks = [_one(sb) for sb in specbooks]
    results = await tqdm_asyncio.gather(*tasks, total=len(tasks), desc="Chunking specbooks")
    return dict(results)


async def gen_question(doc_text: str, model: str) -> str:
    QUESTION_PROMPT = f"Create **one** question that can only be answered correctly using the following document:\n\n{doc_text}"

    resp = await async_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": QUESTION_PROMPT}],
    )
    return resp.choices[0].message.content


async def build_eval_set(specbooks: list[Specbook], model: str) -> dict[str, str]:
    async def _one(sb: Specbook) -> tuple[str, str]:
        q = await gen_question(sb.content, model)
        return sb.specbook_number, q

    tasks  = [_one(sb) for sb in specbooks]

    results = await tqdm_asyncio.gather(*tasks, total=len(tasks), desc="Generating questions")
    return dict(results)


def run_search(vs, query: str, top_k: int):
    results = vs.search(query, top_k=top_k)
    return [
        (r["metadata"].get("specbook_number"), r["score"])
        for r in results
    ]


def score_dataset(vs, eval_set: Dict[str, str], top_k: int) -> Tuple[float, float, float, float]:
    total_qs = len(eval_set)
    correct_at_k = 0
    reciprocal_ranks: List[float] = []
    average_precisions: List[float] = []

    for sb_num, question in tqdm(eval_set.items(), desc="Evaluating"):
        retrieved = run_search(vs, question, top_k=top_k)
        retrieved_sb = [r[0] for r in retrieved]

        # Recall / Precision
        if sb_num in retrieved_sb:
            correct_at_k += 1
            rank = retrieved_sb.index(sb_num) + 1 # Get index of the correct specbook
            reciprocal_ranks.append(1 / rank)
        else:
            reciprocal_ranks.append(0)

        # AP cho MAP
        precisions = [
            1 / (i + 1) for i, rsb in enumerate(retrieved_sb) if rsb == sb_num
        ]
        average_precisions.append(np.mean(precisions) if precisions else 0)

    recall_at_k    = correct_at_k / total_qs
    precision_at_k = recall_at_k
    mrr            = float(np.mean(reciprocal_ranks))
    map_score      = float(np.mean(average_precisions))

    return recall_at_k, precision_at_k, mrr, map_score


specbooks: list[Specbook] = list(cache.specbooks.values())[:5]
TOP_K = 10
SEMANTIC_CHUNKING_MODEL = "gpt-4o"
EVALUATION_MODEL = "gpt-4o"
VECTOR_STORE_PICKLE_PATH = "spec_chunks.pkl"

GENERATE_CHUNK = False

vs = VectorStore(pickle_path=VECTOR_STORE_PICKLE_PATH, llm=LLM())

if GENERATE_CHUNK:
    print(">> Generating semantic chunks …")
    spec_to_chunks = await semantic_chunking(specbooks, SEMANTIC_CHUNKING_MODEL)

    print(f">> Loaded {len(specbooks)} specbooks")

    for num, chunks in spec_to_chunks.items():
        for c in chunks:
            vs.add_chunk(
                Chunk(
                    content=c.content,
                    metadata={"specbook_number": num, "tag": c.tag},
                )
            )
    vs.save_index()
    print(">> Vector store saved to spec_chunks.pkl")

print(">> Building evaluation set …")
eval_set = await build_eval_set(specbooks, EVALUATION_MODEL)

print(">> Running retrieval & scoring …")
recall, precision, mrr, map_ = score_dataset(vs, eval_set, TOP_K)

print(f"Recall@{TOP_K}:    {recall:.4f}")
print(f"Precision@{TOP_K}: {precision:.4f}")
print(f"MRR:               {mrr:.4f}")
print(f"MAP:               {map_:.4f}")

3072
{3072: 'text-embedding-3-large', 1536: 'text-embedding-3-small'}
[32mINFO[0m - Loaded VectorStore from 'spec_chunks.pkl'.
[32mINFO[0m - Loaded VectorStore from 'spec_chunks.pkl'.
>> Building evaluation set …


Generating questions:   0%|          | 0/5 [00:00<?, ?it/s]

Generating questions: 100%|██████████| 5/5 [00:01<00:00,  3.31it/s]


>> Running retrieval & scoring …


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[32mINFO[0m - Search completed for query: '**Question:**  
What material is specified for the FRONT_FOG_COVER_LH/RH component described in Specbook VFDSXXBEX1180, and what percentage of filler does it contain?'. Found 2 results.


Evaluating:  20%|██        | 1/5 [00:00<00:01,  2.44it/s]

[32mINFO[0m - Search completed for query: '**Question:**  
According to the component specification described in document VFDSXVEEP9014, what is the maximum background illumination area width of the "ENGINE START STOP" switch, and what is the specified tolerance for this dimension?  

'. Found 0 results.


Evaluating:  40%|████      | 2/5 [00:00<00:00,  4.18it/s]

[32mINFO[0m - Search completed for query: '**Question:**  

What material is specified for the **BRACE_ROD_END_FR_HOOD** component in Specbook VFDSXNDCL1609, and under which manufacturing process is it produced?  '. Found 2 results.


Evaluating:  60%|██████    | 3/5 [00:00<00:00,  4.95it/s]

[32mINFO[0m - Search completed for query: '**Question:**  
What is the tensile strength requirement for Metal Plate AA 10S as specified in the document, and what is the material standard it adheres to?  

**Correct Answer:**  
The tensile strength requirement for Metal Plate AA 10S is **≥ 215 MPa**, and it adheres to the material standard **C1100 (JIS H3100)**.'. Found 0 results.


Evaluating:  80%|████████  | 4/5 [00:00<00:00,  5.53it/s]

[32mINFO[0m - Search completed for query: '**Question:**  

What is the required minimum stall torque for the motor in the component REGULATOR_ASSY_FR_DOOR_WINDOW_LEFT_ANTI_PINCH_GLASS, as specified in SpecBook VFDSXNDCL1592?  

'. Found 2 results.


Evaluating: 100%|██████████| 5/5 [00:00<00:00,  5.21it/s]

Recall@10:    0.6000
Precision@10: 0.6000
MRR:               0.6000
MAP:               0.4500





In [19]:
chunks = vs.search(query_text="What is the tensile strength requirement for Metal Plate AA 10S as specified in the document, and what is the material standard it adheres to?", top_k=10, threshold=0.3)

for c in chunks:
    print(c)

[32mINFO[0m - Search completed for query: 'What is the tensile strength requirement for Metal Plate AA 10S as specified in the document, and what is the material standard it adheres to?'. Found 10 results.
{'faiss_id': np.int64(1017027768588551390), 'content': 'Specification Summary.\n\nTechnical requirements summary  \n1. Refer to DVP for Targets and acceptance criteria.\n\n- Material: Refer to drawing. Recycling & recoverability according to UN Reg No 133. Prohibition & Restriction of Substances according to VFDST00007601. Material validation test.  \n- Functional / Performance: Refer to drawing.  \n- Part quality: Refer to drawing. Tolerances according to VFDST0000013.  \n- Strength: Screw joint residual torque requirement for Plastic / Plastic (Mounting, dismounting test VFDST00009000). WET ROAD DRIVING (Wading/Fording)- VFDST00009202. VEHICLE WASH ABILITY- VFDST00009203.  \n- Durability / Fatigue: Climatic Change according to VFDST00046100. Vibration according to VFDST00062600. 