In [None]:
%pip install pdfplumber

In [70]:
# Install pdfplumber if not already installed


import pdfplumber
from pathlib import Path

PDF_DIR    = Path("reports")
CHUNKS_DIR = Path("chunks")
CHUNK_SIZE = 1000

# create output folder if needed
CHUNKS_DIR.mkdir(exist_ok=True)

# Loop over every PDF and spit out .txt chunks
for pdf_file in PDF_DIR.glob("*.pdf"):
    print(f"→ Processing {pdf_file.name}")
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    # split and write
    for idx in range(0, len(text), CHUNK_SIZE):
        chunk = text[idx : idx + CHUNK_SIZE].strip()
        if not chunk:
            continue
        out_path = CHUNKS_DIR / f"{pdf_file.stem}_chunk_{idx//CHUNK_SIZE+1}.txt"
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(chunk)

print("✅ Chunking complete.")



→ Processing Z-PY3HdAxsiBv6wN_UniversalRegistrationDocument2024.pdf
→ Processing ABN_AMRO___Integrated_Annual_Report_2024.pdf
→ Processing Barco-IR2024-FULL-V2.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

→ Processing Stellantis-NV-20241231-Annual-Report.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

✅ Chunking complete.


In [71]:
import pandas as pd


CHUNKS_DIR = Path("chunks")

chunk_files = list(CHUNKS_DIR.glob("*.txt"))
print(f"Found {len(chunk_files)} chunk files. Here are the first five:")
df = pd.DataFrame({"chunk_file": [str(f) for f in chunk_files[:5]]})
df  # in Jupyter this will render as a table


Found 5689 chunk files. Here are the first five:


Unnamed: 0,chunk_file
0,chunks/Stellantis-NV-20241231-Annual-Report_ch...
1,chunks/ABN_AMRO___Integrated_Annual_Report_202...
2,chunks/Z-PY3HdAxsiBv6wN_UniversalRegistrationD...
3,chunks/Barco-IR2024-FULL-V2_chunk_706.txt
4,chunks/ABN_AMRO___Integrated_Annual_Report_202...


In [72]:
# 2️⃣ EMBEDDING & INDEXING with NMSLIB instead of FAISS

# ⬇️ Install once
%pip install sentence_transformers nmslib

from sentence_transformers import SentenceTransformer
import nmslib            # ← new
import pickle
from pathlib import Path

# 1️⃣ Load your chunks
CHUNKS_DIR = Path("chunks")
chunk_paths = sorted(CHUNKS_DIR.glob("*.txt"))

# 2️⃣ Load your embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# 3️⃣ Initialize an NMSLIB index (HNSW + cosine)
dim   = embed_model.get_sentence_embedding_dimension()
index = nmslib.init(method="hnsw", space="cosinesimil")

metadata = []

# 4️⃣ Loop through chunks, embed, add to NMSLIB
for i, path in enumerate(chunk_paths):
    text = path.read_text(encoding="utf-8")
    emb  = embed_model.encode(text)        # shape (dim,)
    index.addDataPoint(i, emb)             # add to NMSLIB
    metadata.append({
        "id":      i,
        "file":    str(path),
        "preview": text[:100] + "…"
    })

# 5️⃣ Build the graph (post = number of graph-construction passes)
index.createIndex({"post": 2}, print_progress=True)

# 6️⃣ Persist index & metadata
index.saveIndex("nmslib.index")
with open("nmslib_meta.pkl", "wb") as f:
    pickle.dump(metadata, f)

print(f"✅ Indexed {len(chunk_paths)} chunks into nmslib.index")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.



0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

✅ Indexed 5689 chunks into nmslib.index


In [73]:

# 1️⃣ Load index & metadata
index = nmslib.init(method="hnsw", space="cosinesimil")
index.loadIndex("nmslib.index")
with open("nmslib_meta.pkl", "rb") as f:
    metadata = pickle.load(f)

# 2️⃣ Load the same embedder
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_similar(text, top_k=5):
    # Embed the query
    q_emb = embed_model.encode(text)
    # Query NMSLIB
    ids, distances = index.knnQuery(q_emb, k=top_k)
    results = []
    for idx, dist in zip(ids, distances):
        meta = metadata[idx]
        results.append({
            "score": 1 - dist,       # cosine similarity = 1 - dist
            "id":    meta["id"],
            "file":  meta["file"],
            "preview": meta["preview"]
        })
    return results

# 🧪 Example:
question = "What is the latest Scope 1 greenhouse gas emission?"
for r in retrieve_similar(question, top_k=3):
    print(f"Score {r['score']:.3f}: chunk {r['id']} ({r['preview']})")


Score 0.693: chunk 4994 (y.
— Progress reports are produced annually to monitor changes.
Update frequency The Group’s carbon …)
Score 0.652: chunk 776 (m the generation of purchased or acquired electricity, steam,
heating or cooling.
Scope 3: Indirect …)
Score 0.650: chunk 3755 (ulation that have emerged since the plan’s introduction
in March 2022. Although the targets remain i…)
