In [1]:
from __future__ import annotations

import asyncio
from typing import Dict, List, Tuple

import numpy as np
import pickle
from tqdm.asyncio import tqdm_asyncio
from tqdm import tqdm

from pydantic import BaseModel, Field
from spec.cache import cache
from spec.models import Specbook
from spec.config import async_client
from spec.utils.vector_store import VectorStore, Chunk
from spec.utils.llm import LLM, acompletion_with_backoff
from pathlib import Path
import json

# Semantic Chunking

In [2]:
SEMANTIC_CHUNKING_PROMPT = """
# Role and Objective
You are a **Document Chunking Agent** tasked with performing **Semantic Chunking**—splitting a document into chunks based on meaning and topic continuity.  
Each chunk must focus on one coherent concept, be fully self-contained, and enable precise retrieval.

# Instructions
## Chunking Rules
1. **Semantic Independence** – A reader must understand each chunk without referring to any other chunk.  
2. **Granularity** – Aim for units of roughly 150–400 words (≈ 1–3 paragraphs), unless the document’s natural structure dictates otherwise.  
3. **No Overlap** – A sentence may appear in one chunk only. The `start` and `end` strings must never overlap adjacent chunks.  
4. **Complete Coverage** – Collectively, all chunks must cover the entire document—no omissions.  
5. **Maintain Order** – Preserve the original document order when listing chunks.  
6. **Boundary Uniqueness & Exact Match** –  
   - The `start` and `end` snippets **must be copied *exactly*—character-for-character—from the document**, preserving **all whitespace (including newlines, tabs, multiple spaces), punctuation, and casing**.  
   - Each snippet must be **long enough to be unique** within the document—use **≥ 10 consecutive characters** so the substring appears **only once**.  
   - **Do not replace line breaks (`\n`) with spaces** or alter any characters; if the source text contains a newline, your snippet must contain the same newline at the same position.  
   - Verify programmatically or by careful inspection that both snippets occur **exactly once** in the provided text before finalising the chunk.

## Reasoning Steps
1. Read the entire document to locate natural breakpoints (headings, topic shifts, section dividers, etc.).  
2. Decide chunk boundaries so that every chunk covers a single primary idea.  
3. For each chunk:  
   a. Select precise `start` and `end` snippets that satisfy Rule 6.  
   b. Confirm those snippets are unique and appear **verbatim** in the document.  
   c. Assign an incremental `index` starting from **1**.

# Output Format
If a snippet contains a newline, it may be represented literally or as \\n so long as it matches the document byte-for-byte.

# Final Instructions
Double-check that:
- Every start and end string is an exact, unique match inside the document including all original whitespace characters.
- No text is missing or duplicated across chunks.
- The list of chunks, when reassembled in order, reconstructs the entire original document without alteration.
"""

In [12]:
class Span(BaseModel):
    index: int = Field(..., description="Sequential position of the chunk in the document, starting from 1.")
    start: str = Field(..., description="Exactly text snippet marking the start of the chunk, matching verbatim with the original document. Use 3-10 words")
    end: str = Field(..., description="Exactly text snippet marking the end of the chunk, matching verbatim with the original document. Use 3-10 words")

class Spans(BaseModel):
    spans: List[Span] = Field(..., description="Ordered list of Span instances ensuring semantic independence and full coverage of the original document.")

async def semantic_chunking(specbooks: list[Specbook], model: str) -> dict[str, list[Spans]]:
    sem = asyncio.Semaphore(200)

    async def _one(sb: Specbook) -> tuple[str, list[Spans]] | None:
        try:
            async with sem:
                messages = [
                    {"role": "system", "content": SEMANTIC_CHUNKING_PROMPT},
                    {"role": "user",   "content": sb.content},
                ]
                completion = await acompletion_with_backoff(
                    model=model,
                    messages=messages,
                    response_format=Spans,
                )
                
                spans = completion.choices[0].message.parsed
                return sb.specbook_number, spans.spans
        except Exception as e:
            print(f"Error processing specbook {sb.specbook_number}: {e}")
            return None

    tasks = [_one(sb) for sb in specbooks]
    results = await tqdm_asyncio.gather(*tasks, total=len(tasks), desc="Chunking specbooks")
    
    # Filter out None results from failed tasks
    return dict(r for r in results if r is not None)


In [13]:
from typing import Dict, List

class SpecChunk(BaseModel):
    index: int
    content: str

def parse_chunks(
    document: str,
    spans: list[Span],
) -> list[SpecChunk]:
    """
    Parse chunks by finding start/end positions in original text.
    Returns dict mapping chunk index to extracted text from original document.
    """
    results: list[SpecChunk] = []
    cursor = 0

    # Process chunks in order by index
    for span in sorted(spans, key=lambda s: s.index):
        # Find start position in text
        start_pos = document.find(span.start, cursor)
        if start_pos == -1:
            print(f"Could not find start '{span.start}' for chunk {span.index}")
            break

        # Find end position after start
        end_pos = document.find(span.end, start_pos + len(span.start))
        if end_pos == -1:
            print(f"Could not find end '{span.end}' for chunk {span.index}")
            break

        # Extract document
        snippet = document[start_pos:end_pos + len(span.end)]
        results.append(SpecChunk(index=span.index, content=snippet))

        # Update cursor
        cursor = end_pos + len(span.end)

    return results


# Eval

In [14]:
from pydantic import BaseModel, Field
from typing import List

class Questions(BaseModel):
    questions: List[str] = Field(..., description="List of questions that can only be answered correctly using the document")

async def gen_question(content: str, model: str) -> Questions:
    QUESTION_PROMPT = f"Create **one** question that can only be answered correctly using the following document:\n\n{content}"

    resp = await async_client.beta.chat.completions.parse(
        model=model,
        messages=[{"role": "user", "content": QUESTION_PROMPT}],
        response_format=Questions,
    )
    return resp.choices[0].message.parsed.questions


async def build_eval_set(specbooks: list[Specbook], model: str) -> dict[str, list[str]]:
    async def _one(sb: Specbook) -> tuple[str, list[str]]:
        qs = await gen_question(sb.content, model)
        return sb.specbook_number, qs

    tasks  = [_one(sb) for sb in specbooks]

    results = await tqdm_asyncio.gather(*tasks, total=len(tasks), desc="Generating questions")
    return dict(results)


def run_search(vs: VectorStore, query: str, top_k: int):
    results = vs.search(query, top_k=top_k, threshold=0.3)
    return [
        (r["metadata"].get("specbook_number"), r["score"])
        for r in results
    ]

def score_dataset(vs: VectorStore, eval_set: Dict[str, str], top_k: int) -> Tuple[float, float, float, float]:
    total_qs = len(eval_set)
    correct_at_k = 0
    reciprocal_ranks: List[float] = []
    average_precisions: List[float] = []

    for sb_num, questions in tqdm(eval_set.items(), desc="Evaluating"):
        for question in questions:
            retrieved = run_search(vs, question, top_k=top_k)
            retrieved_sb = [r[0] for r in retrieved]

            # Recall / Precision
            if sb_num in retrieved_sb:
                correct_at_k += 1
                rank = retrieved_sb.index(sb_num) + 1 # Get index of the correct specbook
                reciprocal_ranks.append(1 / rank)
            else:
                reciprocal_ranks.append(0)

            # AP cho MAP
            precisions = [
                1 / (i + 1) for i, rsb in enumerate(retrieved_sb) if rsb == sb_num
            ]
            average_precisions.append(np.mean(precisions) if precisions else 0)

    recall_at_k    = correct_at_k / total_qs
    precision_at_k = recall_at_k
    mrr            = float(np.mean(reciprocal_ranks))
    map_score      = float(np.mean(average_precisions))

    return recall_at_k, precision_at_k, mrr, map_score

In [15]:
TOP_K = 10
SEMANTIC_CHUNKING_MODEL = "o4-mini"
EVALUATION_MODEL = "gpt-4o"
VECTOR_STORE_PICKLE_PATH = f"spec_chunks_{SEMANTIC_CHUNKING_MODEL}.pkl"

GENERATE_CHUNK = True

N = 100 # number of specbooks to use

vs = VectorStore(pickle_path=VECTOR_STORE_PICKLE_PATH, llm=LLM())

num_to_spec = cache.specbooks

# Get a fixed list of specbooks by sorting by number first
specbooks: list[Specbook] = sorted(list(num_to_spec.values()), key=lambda x: x.specbook_number)[:N]

print(len(specbooks))

3072
{3072: 'text-embedding-3-large', 1536: 'text-embedding-3-small'}
100


In [16]:
if GENERATE_CHUNK:
    print(">> Generating semantic chunks …")
    num_to_spans = await semantic_chunking(specbooks, SEMANTIC_CHUNKING_MODEL)


>> Generating semantic chunks …


Chunking specbooks:   2%|▏         | 2/100 [00:20<15:51,  9.71s/it]

Error processing specbook VFDSXNCVC1701: 'NoneType' object has no attribute 'spans'


Chunking specbooks:  11%|█         | 11/100 [00:52<04:22,  2.95s/it]

Error processing specbook VFDSXNBAT0007: 'NoneType' object has no attribute 'spans'


Chunking specbooks: 100%|██████████| 100/100 [04:21<00:00,  2.62s/it]


In [18]:
print(sum([len(spans) for spans in num_to_spans.values()]))

784


In [17]:

print(f">> Loaded {len(specbooks)} specbooks")
all_spec_chunks: list[SpecChunk] = []
for num, spans in num_to_spans.items():
    spec: Specbook = num_to_spec[num]
    chunks = parse_chunks(spec.content, spans)
    all_spec_chunks.extend(chunks)

print(f">> Loaded {len(all_spec_chunks)} chunks")


>> Loaded 100 specbooks
Could not find end '| 29 | TA   | CONCEPT PHASE   | EE - Component Data Sheet inputs                                     | A              | S             | R        | I   | I             |' for chunk 14
Could not find end '**Version 14 (Mar-2023): Update the color codes for Bae Min VFe34 Taxi: Aqua Blue - BLF (item 5.2).' for chunk 4
Could not find end '|         | 3.14.                                                                                                                    |           | Jan/2024  |
</page>' for chunk 4
Could not find start 'Release.  \n\nVersion: 1.0.  ' for chunk 1
Could not find start 'Release.

Version: 01.
Date: July-2024.' for chunk 1
Could not find start '<page number="4">
**Part Quality**  - *RPS & Measuring points:*' for chunk 4
Could not find start '<Specbook> <SpecbookNumber>VFDSXDBEX0007</SpecbookNumber> <SpecbookFiles>' for chunk 1
Could not find end '**Sunlight Simulation:**
VFDST00053800.' for chunk 3
Could not find end 

In [22]:
print(all_spec_chunks[2])

index=3 content='Contents.\n\nSCOPE AND PURPOSE  2  \n\n1 SCOPE OF PRODUCT/ COMMODITY  5  \n1.1 Sales Markets  5  \n1.2 Production volume  5  \n1.3 Build Plan  5  \n\n2 DEFINITIONS AND APPLICABLE DOCUMENTS  5  \n2.1 CAD Data/ Transfer data-CAD Requirements  5  \n2.2 Change Documentation and Part History Documentation  5  \n\n3 REQUIREMENTS  6  \n3.1 Contact  6  \n3.2 Technical requirement  7  \n3.3 General CAE Requirements  9  \n3.4 Material requirement  9  \n3.5 Traceability  9  \n3.6 Labeling of Parts  9  \n3.7 Testing and validation and DVP&R  10  \n3.8 Appearance Approval - Applicable for all Visible Parts  10  \n\n4 FUNCTION SAFETY, HOMOLOGATION AND SAFETY REGULATIONS  10  \n4.1 Functional Safety Requirements  10  \n4.2 Deliverables for Functional Safety  11  \n4.3 Homologation and safety regulation requirements  12  \n\n5 DELIVERABLES  13  \n5.1 Technical deliverables  13  \n5.2 Details deliverables and RASI  14  \n\n6 SUPPLIER RESPONSIBILITIES  16  \n\n7 INTELLECTUAL PROPERTY / 

In [19]:
from spec.utils.utils import num_tokens_from_text
import matplotlib.pyplot as plt
import numpy as np

# Calculate token counts for each chunk
token_counts = [num_tokens_from_text(chunk.content) for chunk in all_spec_chunks]

# Create histogram plot
plt.figure(figsize=(10, 6))
plt.hist(token_counts, bins=30, edgecolor='black')
plt.title('Distribution of Token Counts per Chunk')
plt.xlabel('Number of Tokens')
plt.ylabel('Number of Chunks')

# Add mean and median lines
mean_tokens = np.mean(token_counts)
median_tokens = np.median(token_counts)

plt.axvline(mean_tokens, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean_tokens:.1f}')
plt.axvline(median_tokens, color='green', linestyle='dashed', linewidth=1, label=f'Median: {median_tokens:.1f}')

plt.legend()
plt.grid(True, alpha=0.3)

# Print statistics
print(f"Total chunks: {len(token_counts)}")
print(f"Mean tokens per chunk: {mean_tokens:.1f}")
print(f"Median tokens per chunk: {median_tokens:.1f}")
print(f"Min tokens: {min(token_counts)}")
print(f"Max tokens: {max(token_counts)}")


Total chunks: 404
Mean tokens per chunk: 306.2
Median tokens per chunk: 266.0
Min tokens: 12
Max tokens: 1345


In [None]:
all_chunks = [Chunk(content=c.content, metadata={"specbook_number": num, "index": c.index}) for c in all_spec_chunks]

vs.add_chunks(all_chunks)
vs.save_index()
print(">> Vector store saved to spec_chunks.pkl")

In [61]:
print(">> Building evaluation set …")
# eval_set = await build_eval_set(specbooks, EVALUATION_MODEL)

# Save evaluation set to JSON file
# output_path = "eval_set.json"
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(eval_set, f, ensure_ascii=False, indent=2)
# print(f">> Evaluation set saved to {output_path}")


# Load evaluation set from JSON file
input_path = "eval_set.json"
print(f">> Evaluation set loaded from {input_path}")
with open(input_path, "r", encoding="utf-8") as f:
    eval_set = json.load(f)

print(eval_set)

>> Building evaluation set …
>> Evaluation set loaded from eval_set.json


In [None]:
print(">> Running retrieval & scoring …")
recall, precision, mrr, map_ = score_dataset(vs, eval_set, TOP_K)

print(f"Recall@{TOP_K}:    {recall:.4f}")
print(f"Precision@{TOP_K}: {precision:.4f}")
print(f"MRR:               {mrr:.4f}")
print(f"MAP:               {map_:.4f}")

In [55]:
chunks = vs.list_chunks()

print(f"Total chunks: {len(chunks)}")

for c in chunks[:5]:
    print(c["chunk"]["content"])
    print(c["chunk"]["metadata"])
    print("-" * 100)

Total chunks: 3273
Specification Book. 11.01.2022.

SPB-FSS.

Rear Wheel support. BEV. Specification Book (SPB).
{'specbook_number': 'VFDCXXCHS0767', 'index': 1, 'embedding': [0.015685830265283585, -0.027164429426193237, -0.011827878654003143, -0.010152923874557018, 0.014145825058221817, 0.011494475416839123, 0.02211575210094452, 0.0679507628083229, -0.01681305095553398, 0.005985382944345474, 0.049057912081480026, 0.00575914466753602, -0.017448104918003082, 0.013756854459643364, -0.0252116397023201, -0.008525598794221878, -0.021750595420598984, 0.008724052459001541, 0.023528747260570526, -0.04127850383520126, 0.030736608430743217, 0.023338230326771736, -0.024830607697367668, -0.012518499977886677, -0.030943000689148903, -0.0199406910687685, -0.005326514597982168, -0.001899208058603108, -0.020051825791597366, -0.003875813214108348, -0.00037582291406579316, 0.017908519133925438, 0.008692299947142601, -0.00231397757306695, 0.03911931812763214, -0.010565709322690964, 0.03429291024804115, 0

In [33]:
chunks = vs.search(query_text="Name: Nguyễn Văn Sáng. ", top_k=10, threshold=0.3)

for c in chunks:
    print(c['score'], c['content'])


[32mINFO[0m - Search completed for query: 'Name: Nguyễn Văn Sáng. '. Found 2 results.
0.30552274 Release.

Version: 4.0.  
Date: 20/02/2024.  

Name: Nguyễn Văn Sáng.  
Function: VFe36 Exterior Engineer.  
Signature: [Not displayed].  
Date: 21/11/2023.  

Revisions since previous version.

Version 01: Initial Release.  
Signature: [Not displayed].  
Date: 21/11/2023.  

Version 02: Update based on feedback from EI team.  
Signature: [Not displayed].  
Date: 20/12/2023.  

Version 03: Update standard’s name based on feedback from EI team.  
Signature: [Not displayed].  
Date: 25/12/2023.  

Version 04: Adding wading and washing car tests, Material validation test.  
Signature: [Not displayed].  
Date: 20/02/2024.
0.30354416 Release.

Version: 1.0.

Date: Aug - 2021.

Updated by:
Function: Engineer.
Name: Chu Ngoc Tu (v.tucn1@vinsmart.net.vn).

Revisions since previous version:
Main Version: 1.
Sub-Version: 1.0.
Change Description: First edition.
Signature: (Empty).
Date: 02/08/2021.
