## A very basic RAG

You would never build a RAG system this basic. But it helps illustrate the problems we are trying to solve with some of the more advanced techniques.

In [1]:
#%pip install --quiet llama-index llama-index-retrievers-bm25 llama-index-llms-anthropic anthropic

In [2]:
import os
from dotenv import load_dotenv

# Anthropic model name. Kept identical to the model string passed to
# Anthropic(...) in the generation cells of this notebook.
MODEL_ID = "claude-3-7-sonnet-latest"

# Read API keys from the shared keys.env file one directory up.
load_dotenv("../keys.env")

# Use .get() so that a missing key produces the helpful message below
# instead of a bare KeyError raised before the assertion can fire.
assert os.environ.get("ANTHROPIC_API_KEY", "").startswith("sk"), \
       "Please specify the ANTHROPIC_API_KEY access token in keys.env file"

In [3]:
# Set up logging for the notebook: INFO-level records with a timestamp prefix.
import logging

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used by the helper classes defined in the cells below.
logger = logging.getLogger(__name__)

## Utility: Cache URLs to a local directory

In [4]:
import os
import re
import time
import hashlib
import requests
import shutil
from pathlib import Path
from typing import List, Optional, Dict, Union, Tuple, Any
from urllib.parse import urlparse

class CacheManager:
    """
    Manages the local cache for downloaded files.

    Each URL maps to exactly one file inside ``cache_dir``. The file name is
    built from the last path segment of the URL plus a short hash of the
    complete URL, so different URLs that end in the same file name do not
    collide.

    Attributes:
        cache_dir (Path): Path to the cache directory.
    """

    # Units used by get_cache_size(), smallest first.
    _SIZE_UNITS = ('B', 'KB', 'MB', 'GB')

    def __init__(self, cache_dir: str = "./.cache"):
        """
        Initialize the cache manager.

        Args:
            cache_dir (str): Path to the cache directory. Defaults to "./.cache".
        """
        self.cache_dir = Path(cache_dir)
        self._ensure_cache_dir()

    def _ensure_cache_dir(self) -> None:
        """Create the cache directory if it doesn't exist."""
        if not self.cache_dir.exists():
            # exist_ok=True: another process may create the directory between
            # the existence check above and this call.
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            logger.info(f"Created cache directory at {self.cache_dir}")

    def _iter_files(self):
        """Yield each regular file directly inside the cache directory (nothing if it is missing)."""
        if self.cache_dir.exists():
            for entry in self.cache_dir.iterdir():
                if entry.is_file():
                    yield entry

    def _get_cache_filename(self, url: str) -> str:
        """
        Generate a unique filename for a URL.

        Args:
            url (str): The URL to generate a filename for.

        Returns:
            str: ``<name>_<hash>.<ext>`` where ``<name>.<ext>`` is the last path
            segment of the URL ("index" if the path ends in "/") and ``<hash>``
            is the first 10 hex digits of the MD5 of the full URL. Segments
            without an extension get ".txt".
        """
        # Last path segment of the URL, or "index" when the path ends in "/".
        original_filename = urlparse(url).path.rsplit('/', 1)[-1] or "index"

        # Hash of the full URL keeps same-named files from different URLs apart.
        # MD5 is used purely as a fingerprint here, not for security.
        url_hash = hashlib.md5(url.encode()).hexdigest()[:10]

        if '.' in original_filename:
            # Insert the hash between the base name and the final extension.
            base_name, _, extension = original_filename.rpartition('.')
            return f"{base_name}_{url_hash}.{extension}"
        return f"{original_filename}_{url_hash}.txt"

    def get_cache_path(self, url: str) -> Path:
        """
        Get the cache path for a URL.

        Args:
            url (str): The URL to get the cache path for.

        Returns:
            Path: The path where the cached file would be stored.
        """
        return self.cache_dir / self._get_cache_filename(url)

    def is_cached(self, url: str) -> bool:
        """
        Check if a URL is already cached.

        Args:
            url (str): The URL to check.

        Returns:
            bool: True if the URL is cached, False otherwise.
        """
        return self.get_cache_path(url).exists()

    def get_cached_content(self, url: str) -> Optional[str]:
        """
        Get the cached content for a URL.

        Args:
            url (str): The URL to get the cached content for.

        Returns:
            Optional[str]: The cached content if available, None if the URL is
            not cached or the cached file could not be read.
        """
        if not self.is_cached(url):
            return None

        cache_path = self.get_cache_path(url)
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            # Best effort: an unreadable cache entry is reported as a miss.
            logger.warning(f"Error reading cached file for {url}: {e}")
            return None

    def cache_content(self, url: str, content: str) -> bool:
        """
        Cache content for a URL.

        Args:
            url (str): The URL the content was downloaded from.
            content (str): The content to cache (written as UTF-8).

        Returns:
            bool: True if caching was successful, False otherwise.
        """
        # The directory may have been removed since construction.
        self._ensure_cache_dir()
        cache_path = self.get_cache_path(url)

        try:
            with open(cache_path, 'w', encoding='utf-8') as f:
                f.write(content)
            logger.info(f"Cached content for {url} at {cache_path}")
            return True
        except Exception as e:
            logger.error(f"Error caching content for {url}: {e}")
            return False

    def clear_cache(self) -> bool:
        """
        Clear all cached files.

        Only regular files directly inside the cache directory are removed;
        sub-directories are left untouched.

        Returns:
            bool: True if clearing was successful, False otherwise.
        """
        try:
            if self.cache_dir.exists():
                # Materialize the listing first so deletion does not interfere
                # with directory iteration.
                for file_path in list(self._iter_files()):
                    file_path.unlink()
                logger.info("Cache cleared successfully")
            return True
        except Exception as e:
            logger.error(f"Error clearing cache: {e}")
            return False

    def get_cache_size(self) -> Tuple[int, str]:
        """
        Get the total size of the cache.

        Returns:
            Tuple[int, str]: A tuple containing the size in bytes and a
            human-readable size such as "1.00 KB".
        """
        total_size = sum(file_path.stat().st_size for file_path in self._iter_files())

        # Convert to human-readable format. ">=" so that exactly 1024 bytes is
        # reported as "1.00 KB" rather than "1024.00 B".
        size_human = float(total_size)
        unit_index = 0
        while size_human >= 1024 and unit_index < len(self._SIZE_UNITS) - 1:
            size_human /= 1024
            unit_index += 1

        human_readable = f"{size_human:.2f} {self._SIZE_UNITS[unit_index]}"
        return total_size, human_readable

    def list_cached_files(self) -> List[Dict[str, Any]]:
        """
        List all cached files with metadata.

        Returns:
            List[Dict[str, Any]]: One dictionary per cached file with the keys
            'filename', 'path', 'size_bytes' and 'last_modified'.
        """
        files_info = []
        for file_path in self._iter_files():
            stat = file_path.stat()
            files_info.append({
                'filename': file_path.name,
                'path': str(file_path),
                'size_bytes': stat.st_size,
                'last_modified': time.ctime(stat.st_mtime)
            })
        return files_info

## Utility: Get text from Project Gutenberg

In [5]:
from llama_index.core import Document
from abc import ABC, abstractmethod

class GutenbergTextLoadError(Exception):
    """Raised when a Project Gutenberg text cannot be fetched, decoded, or loaded."""

class DocumentSource(ABC):
    """Abstract interface for anything that can turn a URL into a LlamaIndex Document."""

    @abstractmethod
    def load_from_url(self, url) -> Document:
        """Load the content found at ``url`` and return it as a Document."""

class GutenbergSource(DocumentSource):
    """
    A class to load text files from Project Gutenberg as a LlamaIndex Document.

    This class handles fetching text content from URLs (with a local file
    cache), stripping the Project Gutenberg header and footer, and wrapping the
    cleaned text in a Document with source metadata. It does not chunk or
    index the text.

    Attributes:
        cache_manager (CacheManager): Manager for the local cache.
        default_url (Optional[str]): URL used by load_from_url() when it is
            called without a URL. None means no default is configured.
    """

    # Markers that end the Gutenberg header, most specific first. The last
    # entry is a fallback for files whose marker line does not fit on one line.
    _START_MARKERS = tuple(re.compile(pattern, re.IGNORECASE) for pattern in (
        r"\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK .+? \*\*\*",
        r"\*\*\* START OF THE PROJECT GUTENBERG .+? \*\*\*",
        r"\*\*\*START OF THE PROJECT GUTENBERG EBOOK .+? \*\*\*",
        r"START OF (THIS|THE) PROJECT GUTENBERG EBOOK",
    ))

    # Markers that begin the Gutenberg footer, most specific first.
    _END_MARKERS = tuple(re.compile(pattern, re.IGNORECASE) for pattern in (
        r"\*\*\* END OF (THIS|THE) PROJECT GUTENBERG EBOOK .+? \*\*\*",
        r"\*\*\* END OF THE PROJECT GUTENBERG .+? \*\*\*",
        r"\*\*\*END OF THE PROJECT GUTENBERG EBOOK .+? \*\*\*",
        r"END OF (THIS|THE) PROJECT GUTENBERG EBOOK",
    ))

    def __init__(
        self,
        cache_dir: str = "./.cache",
        default_url: Optional[str] = None,
    ):
        """
        Initialize the source.

        Args:
            cache_dir (str): Directory used to cache downloaded texts.
                Defaults to "./.cache".
            default_url (Optional[str]): URL to load when load_from_url() is
                called without one. Defaults to None (no default).
        """
        self.cache_manager = CacheManager(cache_dir)
        self.default_url = default_url

    def _fetch_text_from_url(self, url: str) -> str:
        """
        Fetch text content from a URL with caching

        Args:
            url (str): URL to fetch text from.

        Returns:
            str: Text content from the URL.

        Raises:
            GutenbergTextLoadError: If there's an error fetching or processing the URL.
        """
        if self.cache_manager.is_cached(url):
            logger.info(f"Loading {url} from cache")
            cached_content = self.cache_manager.get_cached_content(url)
            if cached_content:
                return cached_content
            # Empty or unreadable cache entry: fall through and download again.
            logger.warning(f"Cached content for {url} could not be read, downloading again")

        try:
            logger.info(f"Fetching text from URL: {url}")
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Check if content is text
            content_type = response.headers.get('Content-Type', '')
            if 'text/plain' not in content_type and 'text/html' not in content_type:
                raise GutenbergTextLoadError(f"URL does not contain text content: {content_type}")

            # Detect encoding or use utf-8 as fallback
            encoding = response.encoding or 'utf-8'
            content = response.content.decode(encoding)

            # Cache the downloaded content (best effort; failure is only logged)
            self.cache_manager.cache_content(url, content)

            return content
        except requests.RequestException as e:
            raise GutenbergTextLoadError(f"Error fetching URL {url}: {str(e)}") from e
        except (UnicodeDecodeError, LookupError) as e:
            # LookupError: the server announced an encoding Python does not know.
            raise GutenbergTextLoadError(f"Error decoding content from {url}: {str(e)}") from e

    def _clean_gutenberg_text(self, text: str) -> str:
        """
        Clean Project Gutenberg text by removing headers, footers, and license information.

        Args:
            text (str): Raw text from Project Gutenberg.

        Returns:
            str: The text between the first matching start marker and the first
            matching end marker, stripped, with runs of three or more newlines
            collapsed to a blank line. If a marker is missing, the
            corresponding end of the text is kept as is.
        """
        # Find start of content: end of the first start marker that matches.
        start_pos = 0
        for marker in self._START_MARKERS:
            match = marker.search(text)
            if match:
                start_pos = match.end()
                break

        # Find end of content. Search only after the start marker so the body
        # can never end before it begins.
        end_pos = len(text)
        for marker in self._END_MARKERS:
            match = marker.search(text, start_pos)
            if match:
                end_pos = match.start()
                break

        # Extract and clean the content
        content = text[start_pos:end_pos].strip()

        # Remove extra whitespace
        content = re.sub(r'\n{3,}', '\n\n', content)

        logger.info(f"Cleaned Gutenberg text: removed {start_pos} chars from start, "
                   f"{len(text) - end_pos} chars from end")

        return content

    def load_from_url(self, url: Optional[str] = None) -> Document:
        """
        Load text from a URL and return a LlamaIndex Document.

        Args:
            url (str, optional): URL to load text from. If None or empty, the
                ``default_url`` given to the constructor is used.

        Returns:
            Document: The cleaned text with "source", "filename" and
            "date_loaded" metadata.

        Raises:
            GutenbergTextLoadError: If no URL is available, or if there's an
                error loading or processing the text.
        """
        url = url or self.default_url
        if not url:
            raise GutenbergTextLoadError("No URL given and no default_url configured")

        try:
            # Fetch and clean the text
            raw_text = self._fetch_text_from_url(url)
            cleaned_text = self._clean_gutenberg_text(raw_text)

            # Create a document with metadata
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)

            document = Document(
                text=cleaned_text,
                metadata={
                    "source": url,
                    "filename": filename,
                    "date_loaded": time.strftime("%Y-%m-%d %H:%M:%S")
                }
            )

            logger.info(f"Successfully loaded text from {url}.")

            return document

        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise GutenbergTextLoadError(f"Error loading from URL {url}: {str(e)}") from e

Try reading the Anabasis of Alexander (https://www.gutenberg.org/cache/epub/46976/pg46976.txt),
a 2nd-century historical account of Alexander the Great.

In [6]:
# Load the Anabasis of Alexander as a single Document. The text is read from
# the local cache when present, otherwise downloaded from Project Gutenberg.
gs = GutenbergSource()
doc = gs.load_from_url("https://www.gutenberg.org/cache/epub/46976/pg46976.txt")

2025-03-12 01:10:17,601 - INFO - Loading https://www.gutenberg.org/cache/epub/46976/pg46976.txt from cache
2025-03-12 01:10:17,637 - INFO - Cleaned Gutenberg text: removed 1033 chars from start, 18492 chars from end
2025-03-12 01:10:17,639 - INFO - Successfully loaded text from https://www.gutenberg.org/cache/epub/46976/pg46976.txt.


In [7]:
doc.text[21000:22000]

'he calls himself so in _Cynegeticus_ (v.\n6); and in _Periplus_ (xii. 5; xxv. 1), he distinguishes Xenophon by\nthe addition _the elder_. Lucian (_Alexander_, 56) calls Arrian simply\n_Xenophon_. During the stay of the emperor Hadrian at Athens, A.D. 126,\nArrian gained his friendship. He accompanied his patron to Rome, where\nhe received the Roman citizenship. In consequence of this, he assumed\nthe name of Flavius.[2] In the same way the Jewish historian, Josephus,\nhad been allowed by Vespasian and Titus to bear the imperial name\nFlavius.[3]\n\nPhotius says, that Arrian had a distinguished career in Rome, being\nentrusted with various political offices, and at last reaching the\nsupreme dignity of consul under Antoninus Pius.[4] Previous to this\nhe was appointed (A.D. 132) by Hadrian, Governor of Cappadocia, which\nprovince was soon after invaded by the Alani, or Massagetae, whom he\ndefeated and expelled.[5] When Marcus Aurelius came to the throne,\nArrian withdrew into private 

In [8]:
print(doc.id_)

b0851828-29e1-4d33-a050-d313040fc58e


## Step 1: Index document

We will break up the document into chunks and index the chunks using BM25.
See: https://kmwllc.com/index.php/2020/03/20/understanding-tf-idf-and-bm-25/

In [9]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore

class Indexer:
    """
    Splits documents into chunks and collects the chunks in a document store.

    Documents are split with a SentenceSplitter and the resulting nodes are
    added to an in-memory SimpleDocumentStore, which callers can hand to a
    retriever.

    Attributes:
        chunk_size (int): Chunk size passed to the SentenceSplitter.
        chunk_overlap (int): Chunk overlap passed to the SentenceSplitter.
        docstore (SimpleDocumentStore): Store holding every node created so far.
        node_parser (SentenceSplitter): Splitter used to chunk documents.
    """

    def __init__(
        self,
        cache_dir: str = "./.cache",
        chunk_size: int = 1024,
        chunk_overlap: int = 20
    ):
        """
        Set up the splitter and an empty document store.

        Args:
            cache_dir (str): Accepted but not used by this class.
            chunk_size (int): Size of text chunks for processing. Defaults to 1024.
            chunk_overlap (int): Overlap between text chunks. Defaults to 20.
        """
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap

        # Splitter that turns a Document into chunk-sized nodes.
        self.node_parser = SentenceSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # Starts empty; add_document_to_index() fills it.
        self.docstore = SimpleDocumentStore()

        logger.info("Indexer initialized")

    def add_document_to_index(self, document: Document):
        """
        Chunk one document and add all of its chunks to the document store.

        Args:
            document (Document): The document to split and store.
        """
        chunks = self.node_parser.get_nodes_from_documents([document])
        self.docstore.add_documents(chunks)
        logger.info(f"Successfully loaded text from {document.id_} -- {len(chunks)} nodes created.")

    def get_docstore(self) -> SimpleDocumentStore:
        """Return the document store that holds every chunk added so far."""
        return self.docstore

In [10]:
# Chunk the Anabasis with a small chunk size (100) and an overlap of 20, then
# store the resulting nodes in the indexer's document store.
index = Indexer(chunk_size=100, chunk_overlap=20)
index.add_document_to_index(doc)

2025-03-12 01:10:17,987 - INFO - Indexer initialized
2025-03-12 01:10:23,982 - INFO - Successfully loaded text from b0851828-29e1-4d33-a050-d313040fc58e -- 6104 nodes created.


## Step 2: Retrieve nodes that match query

In [11]:
from llama_index.retrievers.bm25 import BM25Retriever
# Build a BM25 retriever over the stored chunks; each query returns the
# 5 highest-scoring nodes.
retriever = BM25Retriever.from_defaults(
    docstore=index.get_docstore(),
    similarity_top_k=5)

2025-03-12 01:10:25,975 - DEBUG - Building index from IDs objects


In [12]:
from llama_index.core.response.notebook_utils import display_source_node
# Retrieve the top-scoring chunks for the question; `retrieved_nodes` is
# reused by the generation step further down.
retrieved_nodes = retriever.retrieve("Describe the relationship between Alexander and Diogenes")
for node in retrieved_nodes:
    # Render each hit (node id, similarity score, text) in the notebook.
    display_source_node(node, 1024)

**Node ID:** ee1ef41e-3e31-4e07-9949-5e585a50651c<br>**Similarity:** 4.2463765144348145<br>**Text:** But Diogenes said that he
wanted nothing else, except that he and his attendants would stand out
of the sunlight. Alexander is said to have expressed his admiration
of Diogenes’s conduct.<br>

**Node ID:** 31bab814-51cd-47cb-ab1a-eb7ab51bcdfc<br>**Similarity:** 4.118840217590332<br>**Text:** 100 stades; and most of it is the mean between
these breadths.[642] This river Indus Alexander crossed at daybreak
with his army into the country of the Indians; concerning whom, in
this history I have described neither what laws they enjoy,<br>

**Node ID:** 005e2f98-94da-4b58-9486-5e7186584eb3<br>**Similarity:** 3.639586925506592<br>**Text:** 32). Alexander said: “If I were
not Alexander, I should like to be Diogenes.” Cf. _Arrian_, i. 1;
Plutarch (_de Fortit. Alex._, p. 331).<br>

**Node ID:** 9946e837-31de-45bb-88ef-26132edd6f20<br>**Similarity:** 3.4104578495025635<br>**Text:** Alexander is said to have expressed his admiration
of Diogenes’s conduct.[832] Thus it is evident that Alexander was
not entirely destitute of better feelings; but he was the slave of
his insatiable ambition.<br>

**Node ID:** 9c5c9882-9eb2-440b-ae1b-89759b623a6e<br>**Similarity:** 3.2550690174102783<br>**Text:** He also ascertained that for
the present Bessus held the supreme command, both on account of his
relationship to Darius and because the war was being carried on in his
viceregal province. Hearing this,<br>

## Step 3: Generate using these nodes

In [13]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.anthropic import Anthropic

# LLM used for answer generation. The API key comes from the environment
# (loaded from keys.env at the top of the notebook).
# NOTE(review): the model name is hard-coded here and again inside
# build_query_engine(); consider sharing one constant.
llm = Anthropic(
    model="claude-3-7-sonnet-latest",
    api_key=os.environ['ANTHROPIC_API_KEY'],
    temperature=0.2
)

In [14]:
from llama_index.core.llms import ChatMessage
# Hand-built prompt: one instruction message, then every retrieved chunk as its
# own system message, and finally the user's question.
messages = [
    ChatMessage(
        role="system", content="Use the following text to answer the given question."
    ),
    *(ChatMessage(role="system", content=hit.text) for hit in retrieved_nodes),
    ChatMessage(role="user", content="Describe the relationship between Alexander and Diogenes."),
]

# Single chat call; printing the response shows the assistant's reply.
response = llm.chat(messages)
print(response)

2025-03-12 01:10:30,501 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


assistant: Based on the text, Alexander and Diogenes had a brief but notable interaction. When Alexander met Diogenes, Diogenes simply requested that Alexander and his attendants "stand out of the sunlight" rather than asking for any favors or gifts. Alexander is said to have expressed admiration for Diogenes's conduct, showing respect for the philosopher's simple and independent nature. 

The text also quotes Alexander as saying, "If I were not Alexander, I should like to be Diogenes," suggesting that Alexander respected Diogenes's philosophical approach to life and perhaps even envied his freedom from worldly concerns. The passage notes that this interaction shows that "Alexander was not entirely destitute of better feelings," though he remained "the slave of his insatiable ambition."


## LlamaIndex query engine to simplify Step 3

In [15]:
# The query engine performs retrieval and generation in a single call.
query_engine = RetrieverQueryEngine.from_args(llm=llm, retriever=retriever)

response = query_engine.query("Describe the relationship between Alexander and Diogenes.")

# Keep only what the following cells need: the answer text and the chunks
# that were supplied to the LLM.
response = dict(
    answer=str(response),
    source_nodes=response.source_nodes,
)
print(response["answer"])

2025-03-12 01:10:34,846 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


The relationship between Alexander and Diogenes was marked by a notable encounter where Diogenes requested only that Alexander and his attendants stand out of his sunlight. Rather than being offended by this unusual request from someone addressing such a powerful figure, Alexander expressed admiration for Diogenes's conduct. 

This interaction reveals something about both men's characters. Alexander, despite his immense power and ambition, showed appreciation for Diogenes's simple and independent nature. In fact, Alexander is quoted as saying, "If I were not Alexander, I should like to be Diogenes," suggesting a certain respect for the philosopher's way of life.

While Alexander was described as "not entirely destitute of better feelings," he was nonetheless characterized as "the slave of his insatiable ambition," which contrasts with Diogenes's apparent contentment with merely having access to sunlight.


In [16]:
# Show the retrieved chunks (id, text, score) that backed the answer above.
for node in response['source_nodes']:
    print(node)

Node ID: ee1ef41e-3e31-4e07-9949-5e585a50651c
Text: But Diogenes said that he wanted nothing else, except that he
and his attendants would stand out of the sunlight. Alexander is said
to have expressed his admiration of Diogenes’s conduct.
Score:  4.246

Node ID: 31bab814-51cd-47cb-ab1a-eb7ab51bcdfc
Text: 100 stades; and most of it is the mean between these
breadths.[642] This river Indus Alexander crossed at daybreak with his
army into the country of the Indians; concerning whom, in this history
I have described neither what laws they enjoy,
Score:  4.119

Node ID: 005e2f98-94da-4b58-9486-5e7186584eb3
Text: 32). Alexander said: “If I were not Alexander, I should like to
be Diogenes.” Cf. _Arrian_, i. 1; Plutarch (_de Fortit. Alex._, p.
331).
Score:  3.640

Node ID: 9946e837-31de-45bb-88ef-26132edd6f20
Text: Alexander is said to have expressed his admiration of Diogenes’s
conduct.[832] Thus it is evident that Alexander was not entirely
destitute of better feelings; but he was the slave

## End to end example

In [17]:
def build_query_engine(
    urls: List[str],
    chunk_size: int,
    similarity_top_k: int = 5,
    model: str = "claude-3-7-sonnet-latest",
    temperature: float = 0.2,
) -> RetrieverQueryEngine:
    """
    Build a BM25-backed RAG query engine over one or more Gutenberg texts.

    Each URL is loaded through GutenbergSource, chunked by an Indexer whose
    chunk overlap is one tenth of ``chunk_size``, and retrieved with BM25.
    Answers are generated by an Anthropic model.

    Args:
        urls (List[str]): URLs of the texts to load and index.
        chunk_size (int): Chunk size handed to the Indexer.
        similarity_top_k (int): Number of chunks retrieved per query.
            Defaults to 5.
        model (str): Anthropic model name. Defaults to
            "claude-3-7-sonnet-latest".
        temperature (float): Sampling temperature for the LLM. Defaults to 0.2.

    Returns:
        RetrieverQueryEngine: Engine that retrieves with BM25 and answers with
        the configured LLM.

    Raises:
        GutenbergTextLoadError: If any of the URLs cannot be loaded.
        KeyError: If ANTHROPIC_API_KEY is not set in the environment.
    """
    source = GutenbergSource()
    indexer = Indexer(chunk_size=chunk_size, chunk_overlap=chunk_size // 10)

    for url in urls:
        indexer.add_document_to_index(source.load_from_url(url))

    # The retriever is built only after all documents are in the docstore.
    bm25_retriever = BM25Retriever.from_defaults(
        docstore=indexer.get_docstore(),
        similarity_top_k=similarity_top_k,
    )

    anthropic_llm = Anthropic(
        model=model,
        api_key=os.environ['ANTHROPIC_API_KEY'],
        temperature=temperature,
    )

    return RetrieverQueryEngine.from_args(
        retriever=bm25_retriever, llm=anthropic_llm
    )

def print_response_to_query(query_engine: RetrieverQueryEngine, query: str):
    """
    Run a query and print the answer followed by the chunks it was based on.

    Args:
        query_engine (RetrieverQueryEngine): Engine used to answer the query.
        query (str): The question to ask.
    """
    result = query_engine.query(query)
    # Capture both pieces before printing anything.
    answer, sources = str(result), result.source_nodes

    print(answer)
    print("\n\n**Sources**:")
    for source in sources:
        print(source)

In [18]:
# Index the Portable Flame Thrower manual with a chunk size of 100 and ask a
# maintenance question using the manual's own wording ("ruptured").
query_engine = build_query_engine(["https://www.gutenberg.org/files/53669/53669-0.txt"], 100) # Portable Flame Thrower
print_response_to_query(query_engine, "What should I do if the diaphragm is ruptured?")

2025-03-12 01:10:34,899 - INFO - Indexer initialized
2025-03-12 01:10:34,901 - INFO - Loading https://www.gutenberg.org/files/53669/53669-0.txt from cache
2025-03-12 01:10:34,912 - INFO - Cleaned Gutenberg text: removed 50 chars from start, 49 chars from end
2025-03-12 01:10:34,914 - INFO - Successfully loaded text from https://www.gutenberg.org/files/53669/53669-0.txt.
2025-03-12 01:10:35,498 - INFO - Successfully loaded text from 9d9e10b3-bc9c-4c2d-a645-d95d0ff16755 -- 1208 nodes created.
2025-03-12 01:10:35,995 - DEBUG - Building index from IDs objects
2025-03-12 01:10:38,387 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


If the diaphragm is ruptured, you should replace the safety head with an unbroken head. Additionally, if you notice any tears, separation, or leaks occurring at the diaphragm, you should replace the entire valve-diaphragm assembly.

When handling the diaphragm components, remember to unscrew the diaphragm cap by hand (not with a wrench) and be careful not to disturb the position of the yoke block by turning the needle, as this would affect the valve-needle adjustment.


**Sources**:
Node ID: 6afc9709-b53b-4fc9-8f7e-b1bbf198f9b0
Text: Inspect to see if diaphragm is intact. If diaphragm is ruptured,
replace the safety head with an unbroken head.
Score:  4.869

Node ID: b84ca5bf-c79d-4040-8766-7c528e693559
Text: (3) Unscrew diaphragm cap and pull out washer, support, and
valve-diaphragm assembly. To prevent loss of valve-needle adjustment
(Fig 54), do not disturb position of yoke block by turning the needle.
Score:  3.282

Node ID: 1b07ee25-c1a2-412a-808c-2f5944ca3c99
Text: (Fig 52) Screw

## Limitation 1: Semantic Understanding

Even though "ruptured" means the same as "broken", the returned nodes are very different: BM25 matches on the exact words, so the search for "broken" doesn't return the sentences explaining what to do when the diaphragm is ruptured (or vice versa).
As a result, the generated answer misses the key point about replacing the safety head.

In [19]:
print_response_to_query(query_engine, "What should I do if the diaphragm is broken?")

2025-03-12 01:10:41,528 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


If the diaphragm is broken, you would need to replace the valve-diaphragm assembly. The proper procedure would involve unscrewing the diaphragm cap and removing the washer, support, and valve-diaphragm assembly. When doing this, it's important not to disturb the position of the yoke block by turning the needle, as this would affect the valve-needle adjustment. After replacing the broken components, you would need to reassemble by placing the valve spring over the end of the needle, installing the spring retainer, and then screwing the diaphragm cap back on by hand without using a wrench.


**Sources**:
Node ID: b84ca5bf-c79d-4040-8766-7c528e693559
Text: (3) Unscrew diaphragm cap and pull out washer, support, and
valve-diaphragm assembly. To prevent loss of valve-needle adjustment
(Fig 54), do not disturb position of yoke block by turning the needle.
Score:  3.282

Node ID: 209034d7-2038-4eeb-a374-f00e84b9a575
Text: (Par 49)    (2) _Spring-case assembly._ If outer case rotates
and inner

## Limitation 2: Chunk size

The results vary quite dramatically depending on the size of the chunks. It's unclear what chunk size is best for a given query.

In [20]:
def print_response(
    chunk_size: int,
    query: str = "What should I do if the diaphragm is ruptured?",
    url: str = "https://www.gutenberg.org/files/53669/53669-0.txt",
) -> None:
    """
    Build a fresh query engine with the given chunk size and print its answer.

    A new engine is built on every call so that answers for different chunk
    sizes can be compared.

    Args:
        chunk_size (int): Chunk size passed to build_query_engine().
        query (str): Question to ask. Defaults to the diaphragm question used
            throughout this notebook.
        url (str): Text to index. Defaults to the Portable Flame Thrower manual.

    Returns:
        None: The answer is printed, not returned.
    """
    engine = build_query_engine([url], chunk_size=chunk_size)
    print(engine.query(query))

print_response(100)

2025-03-12 01:10:41,544 - INFO - Indexer initialized
2025-03-12 01:10:41,546 - INFO - Loading https://www.gutenberg.org/files/53669/53669-0.txt from cache
2025-03-12 01:10:41,554 - INFO - Cleaned Gutenberg text: removed 50 chars from start, 49 chars from end
2025-03-12 01:10:41,556 - INFO - Successfully loaded text from https://www.gutenberg.org/files/53669/53669-0.txt.
2025-03-12 01:10:42,118 - INFO - Successfully loaded text from 079005b9-1d47-42e8-8683-b6f994119c36 -- 1208 nodes created.
2025-03-12 01:10:42,269 - DEBUG - Building index from IDs objects
2025-03-12 01:10:44,814 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


If the diaphragm is ruptured, you should replace the safety head with an unbroken head. Additionally, if you notice any tears, separation, or leaks occurring at the diaphragm, you should replace the entire valve-diaphragm assembly.

When handling the diaphragm components during maintenance, remember to unscrew the diaphragm cap by hand (not using a wrench) and be careful not to disturb the position of the yoke block by turning the needle, as this would affect the valve-needle adjustment.


In [21]:
print_response(200)

2025-03-12 01:10:44,829 - INFO - Indexer initialized
2025-03-12 01:10:44,833 - INFO - Loading https://www.gutenberg.org/files/53669/53669-0.txt from cache
2025-03-12 01:10:44,845 - INFO - Cleaned Gutenberg text: removed 50 chars from start, 49 chars from end
2025-03-12 01:10:44,847 - INFO - Successfully loaded text from https://www.gutenberg.org/files/53669/53669-0.txt.
2025-03-12 01:10:45,141 - INFO - Successfully loaded text from 8ce56344-68a5-41b1-8390-f8833bd84443 -- 376 nodes created.
2025-03-12 01:10:45,210 - DEBUG - Building index from IDs objects
2025-03-12 01:10:47,360 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


If the diaphragm is ruptured, you should replace the safety head with an unbroken head. When inspecting the tank, you'll need to remove the deflector tube from the head (using your hand, not a wrench) to check if the diaphragm is intact. After replacing the head, you'll need to reassemble the plug, head, and deflector tube in the left fuel tank.


In [22]:
print_response(500)

2025-03-12 01:10:47,375 - INFO - Indexer initialized
2025-03-12 01:10:47,376 - INFO - Loading https://www.gutenberg.org/files/53669/53669-0.txt from cache
2025-03-12 01:10:47,387 - INFO - Cleaned Gutenberg text: removed 50 chars from start, 49 chars from end
2025-03-12 01:10:47,389 - INFO - Successfully loaded text from https://www.gutenberg.org/files/53669/53669-0.txt.
2025-03-12 01:10:47,611 - INFO - Successfully loaded text from 16fe36cd-d55f-4147-9000-a7d2c7a7d71c -- 124 nodes created.
2025-03-12 01:10:47,656 - DEBUG - Building index from IDs objects
2025-03-12 01:10:51,259 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


If you find that the diaphragm is ruptured, you should replace the safety head with an unbroken head. After replacement, you'll need to reassemble the plug, head, and deflector tube in the left fuel tank. When reinstalling, the deflector tube should face to the rear at a 45-degree angle to the operator's left shoulder. Remember to screw in the deflector tube by hand only (do not use a wrench on it), and then tighten the lock nut with a wrench.


## Exploring tf-idf


In [39]:
# Build a small three-book corpus for the tf-idf exploration. Chunks are large
# (10000) with no overlap, so each book yields only a few dozen nodes.
gs = GutenbergSource()
index = Indexer(chunk_size=10000, chunk_overlap=0)
for url in [
    "https://www.gutenberg.org/cache/epub/46976/pg46976.txt", # Alexander
    "https://www.gutenberg.org/cache/epub/6400/pg6400.txt", # Twelve Caesars
    "https://www.gutenberg.org/cache/epub/3296/pg3296.txt", # Augustine
]:
    doc = gs.load_from_url(url)
    index.add_document_to_index(doc)
# All nodes from the three books end up in one shared document store.
docstore = index.get_docstore()

2025-03-12 01:18:19,536 - INFO - Indexer initialized
2025-03-12 01:18:19,544 - INFO - Loading https://www.gutenberg.org/cache/epub/46976/pg46976.txt from cache
2025-03-12 01:18:19,609 - INFO - Cleaned Gutenberg text: removed 1033 chars from start, 18492 chars from end
2025-03-12 01:18:19,615 - INFO - Successfully loaded text from https://www.gutenberg.org/cache/epub/46976/pg46976.txt.
2025-03-12 01:18:20,353 - INFO - Successfully loaded text from 29288a92-024f-4523-b578-8836461d3c6f -- 25 nodes created.
2025-03-12 01:18:20,355 - INFO - Loading https://www.gutenberg.org/cache/epub/6400/pg6400.txt from cache
2025-03-12 01:18:20,398 - INFO - Cleaned Gutenberg text: removed 917 chars from start, 18508 chars from end
2025-03-12 01:18:20,399 - INFO - Successfully loaded text from https://www.gutenberg.org/cache/epub/6400/pg6400.txt.
2025-03-12 01:18:21,259 - INFO - Successfully loaded text from 19703ae0-6c83-4faa-b337-2a384bcfb200 -- 35 nodes created.
2025-03-12 01:18:21,261 - INFO - Loading

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# English stop words are removed, so they never become vocabulary terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# One corpus entry per stored node, in docstore order.
corpus = [str(stored_node.text) for stored_node in docstore.docs.values()]

# Sparse matrix: one row per node, one column per vocabulary term.
tfidf_vector = tfidf_vectorizer.fit_transform(corpus)

# Dense DataFrame with the terms as column labels, for easy lookup by word.
tfidf_df = pd.DataFrame(
    tfidf_vector.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [41]:
tfidf_df.columns[3050]

'astonishment'

In [42]:
tfidf_df[['astonishment']].sum()

astonishment    0.074523
dtype: float64

In [43]:
# Looking up every word of the raw question fails: the vectorizer was built
# with stop_words='english', so words such as "the" and "and" are not columns
# of tfidf_df. The error is caught and printed to show which words are missing.
try:
    tfidf_df["Describe the relationship between Alexander and Diogenes".lower().split()].sum()
except Exception as e:
    print("ERROR:", e)

ERROR: "['describe', 'the', 'between', 'and'] not in index"


In [49]:
tfidf_df["relationship Alexander Diogenes".lower().split()].sum()

relationship    0.044188
alexander       7.415023
diogenes        0.113121
dtype: float64

In [50]:
tfidf_df["relationship wisdom heaven".lower().split()].sum()

relationship    0.044188
wisdom          0.489559
heaven          1.168821
dtype: float64

In [52]:
tfidf_df["Rome Macedonia Persia India".lower().split()].sum()

rome         2.315080
macedonia    0.516891
persia       0.199282
india        0.580187
dtype: float64