### Implementation

In [1]:
# Adapted from "RAG + Langchain Python Project: Easy AI/Chat For Your Docs"
# https://www.youtube.com/watch?v=tcqEUSNCn8I

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_classic.prompts import ChatPromptTemplate
import openai 
from datasets import Dataset
from dotenv import load_dotenv
import os
import shutil
import numpy as np
import glob


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain_community.document_loaders import PyPDFLoader
import re

# --- Load PDF ---
def load_documents(path: str):
    loader = PyPDFLoader(path)
    page_docs = loader.load()

    # Clean the text — remove newlines and normalize spaces
    for page_doc in page_docs:
        clean_text = re.sub(r'\s+', ' ', page_doc.page_content).strip()
        page_doc.page_content = clean_text

        # keep only the desired metadata
        allowed_keys = {"source", "page"}  # add start_index if using it
        page_doc.metadata = {k: v for k, v in page_doc.metadata.items() if k in allowed_keys}
    return page_docs

In [None]:
manual_path = "data/BoardGamesRuleBook/CATAN.pdf"
docs = load_documents(manual_path)
print(docs[5].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 5}


In [12]:
import json
import ahocorasick  # pip install pyahocorasick


# --- Normalize quotes, apostrophes (often mismatch between JSON and PDF) and remove newlines---
def normalize_text(text):
    text = text.replace("“", '"').replace("”", '"')  # curly quotes → straight
    text = text.replace("’", "'").replace("‘", "'")  # curly apostrophes → straight
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces/newlines
    return text.strip()

# --- Load Training JSON and add it in the documents ---
def load_training_qa_to_docs(training_qas_path: str, page_docs):
    with open(training_qas_path, "r", encoding="utf-8") as f:
        training_data = json.load(f)

    training_qas = training_data.get("training_qas", [])

    # --- Build Aho-Corasick automaton for all relevant chunks ---
    A = ahocorasick.Automaton()
    for qa_idx, q in enumerate(training_qas):
        q["relevance_spans"] = []  # initialize spans
        for chunk_text in q["relevant_chunks"]:
            chunk_text = normalize_text(chunk_text)  # normlize text
            # Store tuple of (qa_idx, chunk_text) as value
            A.add_word(chunk_text, (qa_idx, chunk_text))
    A.make_automaton()

    # --- Search all pages efficiently ---
    for page_doc in page_docs:
        page_text = page_doc.page_content
        page_text = normalize_text(page_text)   # normlize text
        page_num = page_doc.metadata.get("page")
        page_doc.metadata["relevance_spans"] = []   # create new dict to store relevant content's metadata
        for end_idx, (qa_idx, chunk_text) in A.iter(page_text):
            start_idx = end_idx - len(chunk_text)
            span = {
                "qa_id": training_qas[qa_idx]["id"],    # query id that the relevant content resolved
                "page": page_num,                       # page where the content is found
                "start": start_idx,                     # start index of the content
                "end": end_idx,                         # end index of the content
            }
            training_qas[qa_idx]["relevance_spans"].append(span)    # TBD: decide if i need it
            page_doc.metadata["relevance_spans"].append(span)
    return page_docs



In [14]:
training_qa_path = "data/BoardGamesRuleBook/CATAN_train_small.json"
docs_with_qa = load_training_qa_to_docs(training_qa_path, docs)
print(docs_with_qa[6].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 230, 'end': 347}, {'qa_id': 'q9', 'page': 6, 'start': 1024, 'end': 1175}]}


In [83]:
# for i in range(10):
#     print(f"{training_qas[i]["id"]}\n{training_qas[i]["question"]}\n{training_qas[i]["relevant_chunks"]}\n{training_qas[i]["relevance_spans"]}")

In [18]:
# --- Step 5: Split documents into chunks ---
def split_text(docs, chunk_size=300, chunk_overlap=30):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,    # decide how to measure the chunk, e.g., character, token, etc
        add_start_index=True,   # add the starting index of the chunk
    )
    chunks  = splitter.split_documents(docs)
    return chunks


In [26]:
chunk_size = 300
chunk_overlap = 30
chunks = split_text(docs_with_qa, chunk_size, chunk_overlap)
print(chunks[50].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 230, 'end': 347}, {'qa_id': 'q9', 'page': 6, 'start': 1024, 'end': 1175}], 'start_index': 808}


In [27]:
print(chunks[0].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 0, 'relevance_spans': [], 'start_index': 0}


In [29]:
# --- Step 6: Compute coverage for each chunk ---
def compute_overlap(span_start, span_end, chunk_start, chunk_end):
    """Compute overlap length between a relevance span and a chunk."""
    overlap_start = max(span_start, chunk_start)
    overlap_end = min(span_end, chunk_end)
    return max(0, overlap_end - overlap_start)

def insert_coverage_in_chunks(chunks, ):
    for chunk in chunks:
        chunk_start = chunk.metadata.get("start_index", 0)
        chunk_end = chunk_start + len(chunk.page_content)
        relevance_spans = [span for span in chunk.metadata.get("relevance_spans", [])]

        chunk.metadata["coverage_per_query"] = []  # store spans per doc
        total_coverage = 0.0
        for span in relevance_spans:
            qa_id = span["qa_id"]
            # compute the overlap
            overlap_len = compute_overlap(span["start"], span["end"], chunk_start, chunk_end)
            relevance_len = span["end"] - span["start"]
            coverage = overlap_len / relevance_len if relevance_len > 0 else 0
            total_coverage = max(total_coverage, coverage)  # take max coverage for this chunk
            qa_coverage = {
                "qa_id": qa_id,
                "coverage": coverage
            }
            chunk.metadata["coverage_per_query"].append(qa_coverage)
    return chunks


In [30]:
chunks_with_coverage = insert_coverage_in_chunks(chunks)
for i in range(100):
    print(f"{i}:\n{chunks_with_coverage[i].metadata}")

0:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 0, 'relevance_spans': [], 'start_index': 0, 'coverage_per_query': []}
1:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 1, 'relevance_spans': [], 'start_index': 0, 'coverage_per_query': []}
2:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 1, 'relevance_spans': [], 'start_index': 273, 'coverage_per_query': []}
3:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 1, 'relevance_spans': [], 'start_index': 538, 'coverage_per_query': []}
4:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 1, 'relevance_spans': [], 'start_index': 813, 'coverage_per_query': []}
5:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 1, 'relevance_spans': [], 'start_index': 1081, 'coverage_per_query': []}
6:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 1, 'relevance_spans': [], 'start_index': 1348, 'coverage_per_query': []}
7:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 2, 'relevance_spans': [], 

In [33]:
# Apply vector embedding to chunks and save the embedding vector along with the content and metadata to database
def save_to_chroma(chunks: list[Document], CHROMA_PATH: str, model="text-embedding-ada-002"):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        documents=chunks, embedding=OpenAIEmbeddings(model=model), persist_directory=None
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    return db

In [36]:
print(chunks_with_coverage[69].page_content)
print(chunks_with_coverage[69].metadata)

When you play this card, announce one type of resource. Each player must give you all their resource cards of that type. You may only request one type of resource when you play this card, regardless of how many cards you receive. Monopoly When you play this card, build 2 roads at no cost (i.e.,
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 8, 'relevance_spans': [{'qa_id': 'q3', 'page': 8, 'start': 1349, 'end': 1469}, {'qa_id': 'q6', 'page': 8, 'start': 1757, 'end': 1874}, {'qa_id': 'q2', 'page': 8, 'start': 1948, 'end': 2099}, {'qa_id': 'q4', 'page': 8, 'start': 2380, 'end': 2414}, {'qa_id': 'q8', 'page': 8, 'start': 2380, 'end': 2563}], 'start_index': 1350, 'coverage_per_query': [{'qa_id': 'q3', 'coverage': 0.9916666666666667}, {'qa_id': 'q6', 'coverage': 0.0}, {'qa_id': 'q2', 'coverage': 0.0}, {'qa_id': 'q4', 'coverage': 0.0}, {'qa_id': 'q8', 'coverage': 0.0}]}


In [37]:
CHROMA_PATH = "chroma"
from langchain_community.vectorstores.utils import filter_complex_metadata

def prepare_chunks_for_chroma(chunks):
    retrievable_docs = []
    for i, chunk in enumerate(chunks):
        simple_meta = filter_complex_metadata([chunk])[0].metadata  # returns a list of filtered Document(s)
        simple_meta["chunk_id"] = str(i)  # add chunk id to metadata
        retrievable_docs.append(
            Document(
                page_content=chunk.page_content,  # use chunk text
                metadata=simple_meta              # use filtered metadata
            )
        )
    return retrievable_docs

retrieval_chunks = prepare_chunks_for_chroma(chunks_with_coverage)

db = save_to_chroma(retrieval_chunks, CHROMA_PATH)

Saved 100 chunks to chroma.


In [38]:
print(retrieval_chunks[89])

page_content='the ones shown to the left, they would collect the resources shown below: 8 7 6 Designed by: Klaus Teuber (1952-2023) Ongoing design: Benjamin Teuber Development team: Jasmin Balle, Arnd Beenen, Morgan Dontanville, Arnd Fischer, Bianca Freund, and Sonja Krützfeldt Art: Quentin Regnes (cover), Eric' metadata={'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 11, 'start_index': 271, 'chunk_id': '89'}


In [39]:
# query_text = input("Enter your query: ")
query_text = "What does the Monopoly card do?"

# Search the DB.
results = db.similarity_search_with_relevance_scores(query_text, k=3)
if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results, similarity too low.")
    # return

print("Top 3 similarity:")
for i, k in enumerate(results):
    if i == 3:
        break
    print(f"Top {i + 1}:\nContent: {k[0].page_content}\nL2 similarity: {k[1]}\nmetadata: {k[0].metadata}\n")

Top 3 similarity:
Top 1:
Content: When you play this card, announce one type of resource. Each player must give you all their resource cards of that type. You may only request one type of resource when you play this card, regardless of how many cards you receive. Monopoly When you play this card, build 2 roads at no cost (i.e.,
L2 similarity: 0.7853654880843317
metadata: {'page': 8, 'chunk_id': '69', 'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'start_index': 1350}

Top 2:
Content: player plays more, they immediately receive this tile. 3:1 2:1 2:1 6 5 5 4 © 2025 CATAN GmbH © 2025 CATAN GmbH Take any 2 resource cards from the supply. INVENTION © 2025 CATAN GmbH Announce one type of resource. Each player must give you all their resource cards of that type. MONOPOLY © 2025 CATAN
L2 similarity: 0.7607942547241171
metadata: {'chunk_id': '65', 'start_index': 273, 'page': 8, 'source': 'data/BoardGamesRuleBook/CATAN.pdf'}

Top 3:
Content: 95 resource cards 3x hills 4x forests 4x pastures 3x 