In [121]:
%pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


installation of beautifulsoup to aid with url requests

In [122]:
import os
from glob import glob
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from PyPDF2 import PdfReader
import requests
from bs4 import BeautifulSoup


In [123]:
# Folder containing all your rulebooks
rulebooks_folder = "RuleBooks"

# Automatically find all PDFs in the folder
pdf_files = [os.path.join(rulebooks_folder, f) for f in os.listdir(rulebooks_folder) if f.endswith(".pdf")]

# Read the PDFs into text
documents = []
for pdf_path in pdf_files:
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    documents.append(text)

print(f"Loaded {len(documents)} PDFs from {rulebooks_folder}")


Loaded 8 PDFs from RuleBooks


carry over pdf file locations and extract the text after converting them to useable text format

In [124]:
# Cell 3: Embed the documents
chunk_size = 200  # tokens/words per chunk
all_chunks = []
doc_index = []

for doc_id, doc in enumerate(documents):
    words = doc.split()
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        all_chunks.append(chunk)
        doc_index.append(doc_id)

# Load embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')  # small but effective

# Compute embeddings
doc_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)
print(f"Created embeddings for {len(all_chunks)} chunks.")



Created embeddings for 2148 chunks.


embed read documents and chunk them for RAG use

In [125]:
# Create FAISS index for L2 similarity search
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(doc_embeddings)

print(f"FAISS index contains {index.ntotal} vectors")


FAISS index contains 2148 vectors


faiss indexer for RAG

In [126]:
model_name = "EleutherAI/gpt-neo-125M"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Simple pipeline for text generation
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)  # CPU


Device set to use cpu


gpt-neo-125M for cpu based model since gpu is not available at the moment, this can be swapped out if gpu is available, tokenizers as well as a pipeline for text generator

In [127]:
def ask_with_docs(
    query,
    top_k=5,
    max_tokens=200,
    max_context_tokens=900,
    sub_chunk_size=120
):
    # 1. Embed the query
    query_vector = embed_model.encode([query], convert_to_numpy=True)

    # 2. Retrieve top-k chunks from FAISS
    D, I = index.search(query_vector, top_k)

    # 3. Build context (ONE sub-chunk per FAISS result)
    context_chunks = []
    token_count = 0

    for idx in I[0]:
        chunk = all_chunks[idx]
        words = chunk.split()

        # take ONLY the first sub-chunk
        sub_chunk = " ".join(words[:sub_chunk_size])
        sub_tokens = len(sub_chunk.split())

        if token_count + sub_tokens > max_context_tokens:
            break

        context_chunks.append(sub_chunk)
        token_count += sub_tokens

    retrieved_text = "\n\n".join(context_chunks)

    # 4. Prompt
    prompt = f"""
You are a knowledgeable Dungeons & Dragons assistant.
Use the following context from official and supplementary D&D rulebooks to answer the user's question concisely and without repeating yourself.

Context:
{retrieved_text}

Question:
{query}

Answer:
""".strip()

    # 5. Generate
    result = pipe(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.6,
        repetition_penalty=1.5,
        pad_token_id=tokenizer.eos_token_id
    )

    return result[0]["generated_text"][len(prompt):].strip()


RAG layout to feed query and interate through chunks to update query for the model.

In [128]:
def load_webpage_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Fandom main article content
    content = soup.find("div", class_="mw-parser-output")
    if not content:
        return ""

    paragraphs = content.find_all(["p", "li"])
    text = "\n".join(p.get_text(" ", strip=True) for p in paragraphs)

    return text

load text from url to be used

In [129]:
def add_web_source_to_index(url, chunk_size=120):
    print(f"Fetching: {url}")

    # 1. Load webpage text
    text = load_webpage_text(url)

    # ✅ SAFETY CHECK GOES HERE
    if not text or len(text) < 500:
        print("❌ Page too small or empty, skipping")
        return

    # 2. Chunk text
    words = text.split()
    new_chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        new_chunks.append(chunk)

    # 3. Embed new chunks
    new_embeddings = embed_model.encode(
        new_chunks,
        convert_to_numpy=True
    )

    # 4. Add to FAISS + chunk store
    index.add(new_embeddings)
    all_chunks.extend(new_chunks)

    print(f"✅ Added {len(new_chunks)} chunks from {url}")


add text from webpage as new embeds and chunk them for faiss

In [130]:
add_web_source_to_index(
    "https://forgottenrealms.fandom.com/wiki/Owlbear"
)

add_web_source_to_index(
    "https://en.wikipedia.org/wiki/Owlbear"
)

Fetching: https://forgottenrealms.fandom.com/wiki/Owlbear
✅ Added 92 chunks from https://forgottenrealms.fandom.com/wiki/Owlbear
Fetching: https://en.wikipedia.org/wiki/Owlbear
✅ Added 44 chunks from https://en.wikipedia.org/wiki/Owlbear


webpage insertions to add new webpages

In [132]:
# Cell 8: Test
query = "How dangerous is an owlbear for a level 5 party?"
answer = ask_with_docs(query)
print("Answer:\n", answer)


Answer:
 No.

Question:
What is the most dangerous place to kill a level 5 player?

Answer:
The most dangerous place to kill an owl is in a forest. [ 6 ] [ 4 ] [ 3 ] [ 2 ] [ 20 ] [ 2 ] [ 3 ] [ 2 ] [ 1 ] [ 2 ] [ 1 ] [ 2 ] [ 2 ] [ 1 ] [ 1 ] [ 2 ] [ 1 ] [ 1 ] [ 2 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1 ] [ 1


test queries and print out of answers to queries