### Implementation

In [1]:
# Adapted from "RAG + Langchain Python Project: Easy AI/Chat For Your Docs"
# https://www.youtube.com/watch?v=tcqEUSNCn8I

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_classic.prompts import ChatPromptTemplate
import openai 
from datasets import Dataset
from dotenv import load_dotenv
import os
import shutil
import numpy as np
import glob


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_community.document_loaders import PyPDFLoader
import re

# --- Load PDF ---
def load_documents(path: str):
    loader = PyPDFLoader(path)
    page_docs = loader.load()

    # Clean the text — remove newlines and normalize spaces
    for page_doc in page_docs:
        clean_text = re.sub(r'\s+', ' ', page_doc.page_content).strip()
        page_doc.page_content = clean_text

        # keep only the desired metadata
        allowed_keys = {"source", "page"}  # add start_index if using it
        page_doc.metadata = {k: v for k, v in page_doc.metadata.items() if k in allowed_keys}
    return page_docs

In [3]:
manual_path = "data/BoardGamesRuleBook/CATAN.pdf"
docs = load_documents(manual_path)
print(docs[5].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 5}


In [4]:
import json
import ahocorasick  # pip install pyahocorasick


# --- Normalize quotes, apostrophes (often mismatch between JSON and PDF) and remove newlines---
def normalize_text(text):
    text = text.replace("“", '"').replace("”", '"')  # curly quotes → straight
    text = text.replace("’", "'").replace("‘", "'")  # curly apostrophes → straight
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces/newlines
    return text.strip()

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
    return None

# --- Load Training JSON and add it in the documents ---
def load_training_qa_to_docs(training_qas_path: str, page_docs):
    training_data = load_json(training_qas_path)
    training_qas = training_data.get("training_qas", [])

    # --- Build Aho-Corasick automaton for all relevant chunks ---
    A = ahocorasick.Automaton()
    for qa_idx, q in enumerate(training_qas):
        q["relevance_spans"] = []  # initialize spans
        for chunk_text in q["relevant_chunks"]:
            chunk_text = normalize_text(chunk_text)  # normlize text
            # Store tuple of (qa_idx, chunk_text) as value
            A.add_word(chunk_text, (qa_idx, chunk_text))
    A.make_automaton()

    # --- Search all pages efficiently ---
    for page_doc in page_docs:
        page_text = page_doc.page_content
        page_text = normalize_text(page_text)   # normlize text
        page_num = page_doc.metadata.get("page")
        page_doc.metadata["relevance_spans"] = []   # create new dict to store relevant content's metadata
        for end_idx, (qa_idx, chunk_text) in A.iter(page_text):
            start_idx = end_idx - len(chunk_text) + 1
            span = {
                "qa_id": training_qas[qa_idx]["id"],    # query id that the relevant content resolved
                "page": page_num,                       # page where the content is found
                "start": start_idx,                     # start index of the content
                "end": end_idx,                         # end index of the content
            }
            page_doc.metadata["relevance_spans"].append(span)
    return page_docs



In [5]:
training_qa_path = "data/BoardGamesRuleBook/CATAN_train_small.json"
docs_with_qa = load_training_qa_to_docs(training_qa_path, docs)
print(docs_with_qa[6].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 231, 'end': 347}, {'qa_id': 'q9', 'page': 6, 'start': 1025, 'end': 1175}]}


In [6]:
# --- Step 5: Split documents into chunks ---
def split_text(docs, chunk_size=300, chunk_overlap=30):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,    # decide how to measure the chunk, e.g., character, token, etc
        add_start_index=True,   # add the starting index of the chunk
    )
    chunks  = splitter.split_documents(docs)
    return chunks


In [7]:
chunk_size = 300
chunk_overlap = 30
chunks = split_text(docs_with_qa, chunk_size, chunk_overlap)
print(chunks[50].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 231, 'end': 347}, {'qa_id': 'q9', 'page': 6, 'start': 1025, 'end': 1175}], 'start_index': 808}


In [8]:
print(chunks[0].metadata)

{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 0, 'relevance_spans': [], 'start_index': 0}


In [9]:
import copy

# --- Step 6: Compute coverage for each chunk ---
def compute_overlap(span_start, span_end, chunk_start, chunk_end):
    """Compute overlap length between a relevance span and a chunk."""
    overlap_start = max(span_start, chunk_start)
    overlap_end = min(span_end, chunk_end)
    return max(0, overlap_end - overlap_start)

def generate_relevant_chunks_with_coverage(chunks):
    relevant_chunks = []
    for i, chunk in enumerate(chunks):
        chunk_start = chunk.metadata.get("start_index", 0)
        chunk_end = chunk_start + len(chunk.page_content)
        relevance_spans = [span for span in chunk.metadata.get("relevance_spans", [])]

        # skip empty chunks
        if not relevance_spans:
            continue
        tmp = copy.deepcopy(chunk)
        tmp.metadata["coverage_per_query"] = []  # store spans per doc
        for span in relevance_spans:
            qa_id = span["qa_id"]
            # compute the overlap
            overlap_len = compute_overlap(span["start"], span["end"], chunk_start, chunk_end)
            relevance_len = span["end"] - span["start"]
            coverage = overlap_len / relevance_len if relevance_len > 0 else 0
            # skip query with relevance = 0
            if coverage == 0:
                continue
            qa_coverage = {
                "qa_id": qa_id,
                "coverage": coverage
            }
            tmp.metadata["coverage_per_query"].append(qa_coverage)
        # skip chunk with 0 coverage      
        if not tmp.metadata["coverage_per_query"]:
            continue
        tmp.metadata["chunk_id"] = i
        relevant_chunks.append(tmp)
    return relevant_chunks


In [10]:
relevant_chunks = generate_relevant_chunks_with_coverage(chunks)
# for i in range(len(chunks)):
#     print(f"{i}:\n{chunks[i].metadata}")
for i in range(len(relevant_chunks)):
    print(f"{i}:\n{relevant_chunks[i].metadata}")

0:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 5, 'relevance_spans': [{'qa_id': 'q5', 'page': 5, 'start': 2006, 'end': 2079}], 'start_index': 1882, 'coverage_per_query': [{'qa_id': 'q5', 'coverage': 1.0}], 'chunk_id': 44}
1:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 231, 'end': 347}, {'qa_id': 'q9', 'page': 6, 'start': 1025, 'end': 1175}], 'start_index': 0, 'coverage_per_query': [{'qa_id': 'q7', 'coverage': 0.5689655172413793}], 'chunk_id': 47}
2:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 231, 'end': 347}, {'qa_id': 'q9', 'page': 6, 'start': 1025, 'end': 1175}], 'start_index': 269, 'coverage_per_query': [{'qa_id': 'q7', 'coverage': 0.6724137931034483}], 'chunk_id': 48}
3:
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 6, 'relevance_spans': [{'qa_id': 'q7', 'page': 6, 'start': 231, 'end': 347}, {'qa_id': 'q9', 'page': 6, '

In [11]:
# Apply vector embedding to chunks and save the embedding vector along with the content and metadata to database
def save_to_chroma(chunks: list[Document], CHROMA_PATH: str, model="text-embedding-ada-002"):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        documents=chunks, embedding=OpenAIEmbeddings(model=model), persist_directory=None
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    return db

In [12]:
print(chunks[69].page_content)
print(chunks[69].metadata)

When you play this card, announce one type of resource. Each player must give you all their resource cards of that type. You may only request one type of resource when you play this card, regardless of how many cards you receive. Monopoly When you play this card, build 2 roads at no cost (i.e.,
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 8, 'relevance_spans': [{'qa_id': 'q3', 'page': 8, 'start': 1350, 'end': 1469}, {'qa_id': 'q6', 'page': 8, 'start': 1758, 'end': 1874}, {'qa_id': 'q2', 'page': 8, 'start': 1949, 'end': 2099}, {'qa_id': 'q4', 'page': 8, 'start': 2381, 'end': 2414}, {'qa_id': 'q8', 'page': 8, 'start': 2381, 'end': 2563}], 'start_index': 1350}


In [13]:
CHROMA_PATH = "chroma"
from langchain_community.vectorstores.utils import filter_complex_metadata

def prepare_chunks_for_chroma(chunks):
    retrievable_docs = []
    for i, chunk in enumerate(chunks):
        simple_meta = filter_complex_metadata([chunk])[0].metadata  # returns a list of filtered Document(s)
        simple_meta["chunk_id"] = i  # add chunk id to metadata
        retrievable_docs.append(
            Document(
                page_content=chunk.page_content,  # use chunk text
                metadata=simple_meta              # use filtered metadata
            )
        )
    return retrievable_docs

chunks_for_chroma = prepare_chunks_for_chroma(chunks)

db = save_to_chroma(chunks_for_chroma, CHROMA_PATH)

Saved 100 chunks to chroma.


In [14]:
print(chunks_for_chroma[89].page_content)
print(chunks_for_chroma[89].metadata)

the ones shown to the left, they would collect the resources shown below: 8 7 6 Designed by: Klaus Teuber (1952-2023) Ongoing design: Benjamin Teuber Development team: Jasmin Balle, Arnd Beenen, Morgan Dontanville, Arnd Fischer, Bianca Freund, and Sonja Krützfeldt Art: Quentin Regnes (cover), Eric
{'source': 'data/BoardGamesRuleBook/CATAN.pdf', 'page': 11, 'start_index': 271, 'chunk_id': 89}


In [15]:
# Search the DB.
def retrieve_top_k(db, query, k=3):
    results = db.similarity_search_with_relevance_scores(query, k=k)
    return [(doc.metadata.get("source", ""), doc.page_content, doc.metadata.get("chunk_id", ""), score)
            for doc, score in results]
    # TBD: raise error


In [16]:
# # query_text = input("Enter your query: ")
# query_text = "What does the Monopoly card do?"
# top_k = retrieve_top_k(db, query_text, k=3)
# for i in range(len(top_k)):
#     print(top_k[i])

In [17]:
import numpy as np

def dcg(scores):
    """Compute Discounted Cumulative Gain."""
    return np.sum([
        (rel) / np.log2(idx + 2)
        for idx, rel in enumerate(scores)
    ])

def ndcg_at_k(retrieved_scores):
    """Compute nDCG@k."""
    ideal_scores = sorted(retrieved_scores, reverse=True)
    dcg_val = dcg(retrieved_scores)
    idcg_val = dcg(ideal_scores)
    return dcg_val / idcg_val if idcg_val > 0 else 0.0

In [18]:
def get_coverage(chunk_id, qa_id, relevant_chunks):
    for chunk in relevant_chunks:
        if chunk.metadata.get("chunk_id") != chunk_id:
            continue
        for coverage_entry in chunk.metadata.get("coverage_per_query"):
            if coverage_entry["qa_id"] != qa_id:
                continue
            return coverage_entry["coverage"]
    return 0.0  # default if no match found

In [19]:
training_qa_path = "data/BoardGamesRuleBook/CATAN_train_small.json"
qa = load_json(training_qa_path)
qa = qa["training_qas"]
nb_k = 3

dcg_values = []
ndcg_values = []
for q in qa:
    qa_id = q.get("id")
    qa_q = q.get("question")
    top_k = retrieve_top_k(db, qa_q, k=nb_k)
    score = []
    print(qa_id)
    for chunk_k in top_k:
        print(chunk_k[1])
        chunk_k_id = chunk_k[2]
        chunk_rel = get_coverage(chunk_k_id, qa_id, relevant_chunks)
        score.append(chunk_rel)
    print(score)
    dcg_values.append(dcg(score))
    ndcg_values.append(ndcg_at_k(score))
    print(dcg(score))
    print(ndcg_at_k(score))
print(np.mean(dcg_values))
print(np.mean(ndcg_values))

q1
left does the same until all players have 1 settlement and 1 road on the board. Important: When placing a settlement, stay two edges away from all other settlements. CREDITS 6 Choose the First Player Each player rolls the dice. The player with the highest roll is the first player. Then each player
edge. As before, when placing a settlement, stay two edges away from all other settlements. ROUND 1 The first player places 1 settlement on an empty intersection of their choice. Then they place 1 road on an empty edge next to that settlement. The next player to the left does the same until all
you built this turn. You may play a development card before rolling dice or at any time during the Action phase. Development cards never go back into the supply. If the supply runs out, you may not build any more cards. You may not trade or give away development cards.
[0.0, 1.0, 0.0]
0.6309297535714575
0.6309297535714575
q2
3 of the same resource cards into the supply and take 1 card of a different