# Chatbot über den Inhalt eines Buches mit RAG-Architektur
Entwicklung eines KI-Systems, das über reine Textverarbeitung hinausgeht und ein tiefes inhaltliches Verständnis literarischer Werke demonstriert.

## Import der Bibliotheken

In [3]:
import os
import re
import warnings
from typing import List, Dict, Any, Optional

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

# Configuration and environment setup
# Suppress every warning category for the rest of the process.
warnings.filterwarnings('ignore')
# Presumably selects the pure-Python protobuf backend — TODO confirm which
# dependency requires it. NOTE(review): this assignment runs after the
# imports above; verify that none of them has already loaded protobuf,
# otherwise the setting may come too late.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

## Globale Konfigurationsparameter

In [5]:
# Global configuration
# Path of the book PDF that is loaded and indexed.
PDF_FILE = "Heidi.pdf"
# Directory in which the FAISS index of text chunks is saved / looked up.
CHUNKS_DB_DIR = "heidi_faiss_chunks"
# Directory in which the FAISS index of whole chapters is saved / looked up.
CHAPTERS_DB_DIR = "heidi_faiss_chapters"
# Model name handed to HuggingFaceEmbeddings.
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

## PDF-Verarbeitungsklasse

In [7]:
class PDFProcessor:
    """Load a PDF, normalise the text of its pages and split it into chunks."""

    # Archive watermark; ``\s*`` lets it match across a line break.
    _WATERMARK_PATTERN = re.compile(
        r'Full Text Archive\s*https://www\.fulltextarchive\.com'
    )
    # A line that holds nothing but digits is treated as a page number.
    _PAGE_NUMBER_PATTERN = re.compile(r'^[ \t]*\d+[ \t\r]*$', re.MULTILINE)

    def __init__(self, pdf_path: str):
        """Remember the PDF path and create the embedding and chat models.

        Args:
            pdf_path: Location of the PDF file to process.
        """
        self.pdf_path = pdf_path
        # NOTE(review): no method of this class reads the two attributes
        # below; they are kept so that existing attribute access keeps working.
        self.embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
        self.llm = ChatOllama(model="llama3.2", temperature=0.2)

    # PDF loading
    def load_pdf_document(self) -> List[Document]:
        """Load the PDF with PyPDFLoader and return the loader's documents.

        Raises:
            FileNotFoundError: If ``self.pdf_path`` does not exist.
        """
        if not os.path.exists(self.pdf_path):
            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")

        loader = PyPDFLoader(self.pdf_path)
        pages = loader.load()
        print(f"Loaded {len(pages)} pages from {self.pdf_path}")
        return pages

    # Text cleaning
    def clean_document_text(self, documents: List[Document]) -> List[Document]:
        """Return cleaned copies of ``documents``.

        The watermark and page-number lines are removed while the original
        line structure is still present; only afterwards is all whitespace
        collapsed to single spaces. (Collapsing first would leave no separate
        lines for the page-number pattern to match.)

        Args:
            documents: Documents whose ``page_content`` should be cleaned.

        Returns:
            New Document objects with cleaned text and a copy of the metadata.
        """
        cleaned_docs = []

        for doc in documents:
            text = doc.page_content
            # Replace the watermark by a space so neighbouring words stay apart.
            text = self._WATERMARK_PATTERN.sub(' ', text)
            # Drop digit-only lines (page numbers) before lines are merged.
            text = self._PAGE_NUMBER_PATTERN.sub('', text)
            # Collapse line breaks and repeated whitespace into single spaces.
            text = re.sub(r'\s+', ' ', text).strip()

            # Copy the metadata so the cleaned document does not share a
            # mutable dict with the source document.
            cleaned_docs.append(
                Document(page_content=text, metadata=dict(doc.metadata))
            )

        print(f"Cleaned {len(cleaned_docs)} documents")
        return cleaned_docs

    # Text segmentation
    def chunk_documents(self, documents: List[Document], chunk_size: int = 800, chunk_overlap: int = 120) -> List[Document]:
        """Split ``documents`` with a RecursiveCharacterTextSplitter.

        Args:
            documents: Documents to split.
            chunk_size: Value passed to the splitter as ``chunk_size``.
            chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

        Returns:
            The chunk documents produced by the splitter.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Tried in this order: paragraph, line, sentence marks, clause
            # marks, space, and finally a split between any two characters.
            separators=["\n\n", "\n", ".", "!", "?", ";", ",", " ", ""]
        )
        chunks = splitter.split_documents(documents)
        print(f"Created {len(chunks)} text chunks")
        return chunks

## Kapitelextraktionsklasse

In [9]:
class ChapterExtractor:
    """Find ``CHAPTER <roman numeral>.`` headings and cut the text into chapters."""

    # Lookup table for the numerals I..XXIII (also read by other classes).
    ROMAN_NUMERAL_MAP = {
        'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7, 'VIII': 8,
        'IX': 9, 'X': 10, 'XI': 11, 'XII': 12, 'XIII': 13, 'XIV': 14, 'XV': 15,
        'XVI': 16, 'XVII': 17, 'XVIII': 18, 'XIX': 19, 'XX': 20, 'XXI': 21,
        'XXII': 22, 'XXIII': 23
    }

    # Value of each Roman digit, used for numerals outside ROMAN_NUMERAL_MAP.
    _ROMAN_DIGITS = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    # Accepts only well-formed numerals (1-3999); rejects e.g. "IIII" or "VX".
    _VALID_ROMAN = re.compile(
        r'M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'
    )

    def __init__(self):
        # Heading pattern: group 2 is the Roman numeral, group 3 the rest of
        # the heading line (the chapter title, possibly empty).
        self.chapter_pattern = re.compile(r'(^|\n)\s*CHAPTER\s+([IVXLCDM]+)\.(.*)?', re.IGNORECASE)

    @classmethod
    def _roman_to_int(cls, numeral: str) -> Optional[int]:
        """Convert an upper-case Roman numeral to an int, or None if malformed."""
        known = cls.ROMAN_NUMERAL_MAP.get(numeral)
        if known is not None:
            return known
        if not numeral or not cls._VALID_ROMAN.fullmatch(numeral):
            return None

        values = [cls._ROMAN_DIGITS[symbol] for symbol in numeral]
        total = 0
        for idx, value in enumerate(values):
            # A smaller digit in front of a larger one is subtracted (IV, XC).
            if idx + 1 < len(values) and value < values[idx + 1]:
                total -= value
            else:
                total += value
        return total

    # Chapter extraction
    def extract_chapters_from_text(self, full_text: str) -> List[Dict[str, Any]]:
        """Split ``full_text`` at every chapter heading.

        Each chapter runs from its heading up to the next heading (or the end
        of the text).

        Args:
            full_text: The complete book text.

        Returns:
            One dict per heading with the keys ``number``, ``roman``,
            ``title``, ``content``, ``start_pos`` and ``end_pos``; an empty
            list when no heading is found. ``number`` is the numeric value of
            the heading's numeral; only for a malformed numeral does it fall
            back to the heading's 1-based position among the matches.
        """
        print("Extracting chapters from text...")

        matches = list(self.chapter_pattern.finditer(full_text))
        if not matches:
            print("No chapter markers found in text")
            return []

        chapters = []
        for i, match in enumerate(matches):
            start_pos = match.start()
            end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(full_text)

            roman_numeral = match.group(2).upper()
            chapter_number = self._roman_to_int(roman_numeral)
            if chapter_number is None:
                # Malformed numeral: number the chapter by its position.
                chapter_number = i + 1
            title_suffix = (match.group(3) or "").strip()
            chapter_title = f"CHAPTER {roman_numeral}." + (f" {title_suffix}" if title_suffix else "")
            chapter_content = full_text[start_pos:end_pos].strip()

            chapter_data = {
                "number": chapter_number,
                "roman": roman_numeral,
                "title": chapter_title,
                "content": chapter_content,
                "start_pos": start_pos,
                "end_pos": end_pos
            }
            chapters.append(chapter_data)
            print(f"Found {chapter_title} at position {start_pos}")

        print(f"Successfully extracted {len(chapters)} chapters")
        return chapters

## Vector-Store-Management-Klasse

In [11]:

class VectorStoreManager:
    """Build the FAISS indexes for the book, or load them when already saved."""

    def __init__(self, embeddings):
        """Keep the embedding model used for building and loading indexes."""
        self.embeddings = embeddings

    def _load_database(self, db_path: str) -> FAISS:
        """Load a previously saved index from ``db_path``."""
        # SECURITY: load_local only works here with
        # allow_dangerous_deserialization=True because the saved store is
        # unpickled. Only point db_path at directories this application
        # wrote itself; never load an index from an untrusted source.
        return FAISS.load_local(db_path, self.embeddings, allow_dangerous_deserialization=True)

    def _build_and_save(self, documents: List[Document], db_path: str) -> FAISS:
        """Embed ``documents`` into a new index and persist it at ``db_path``."""
        vector_db = FAISS.from_documents(documents, self.embeddings)
        vector_db.save_local(db_path)
        return vector_db

    # Chunk database management
    def create_or_load_chunk_database(self, pdf_path: str, db_path: str) -> FAISS:
        """Return the chunk index, building it from the PDF if ``db_path`` is absent.

        Args:
            pdf_path: PDF to index when no saved index exists.
            db_path: Directory of the saved index.

        Raises:
            ValueError: If the PDF yields no text chunks to index.
        """
        if os.path.exists(db_path):
            print(f"Loading existing chunk database from {db_path}")
            return self._load_database(db_path)

        print("Creating new chunk database...")
        processor = PDFProcessor(pdf_path)
        pages = processor.load_pdf_document()
        cleaned_pages = processor.clean_document_text(pages)
        chunks = processor.chunk_documents(cleaned_pages)
        if not chunks:
            # Fail with a clear message instead of an obscure error from
            # building an index over an empty document list.
            raise ValueError(f"No text chunks could be created from {pdf_path}")

        vector_db = self._build_and_save(chunks, db_path)
        print(f"Chunk database saved to {db_path}")
        return vector_db

    # Chapter database management
    def create_or_load_chapter_database(self, pdf_path: str, db_path: str) -> FAISS:
        """Return the chapter index, building it from the PDF if ``db_path`` is absent.

        Every extracted chapter becomes one document whose text starts with a
        banner naming the chapter and whose metadata carries the chapter
        number, numeral, title and text positions.

        Args:
            pdf_path: PDF to index when no saved index exists.
            db_path: Directory of the saved index.

        Raises:
            ValueError: If no chapter heading is found in the PDF text.
        """
        if os.path.exists(db_path):
            print(f"Loading existing chapter database from {db_path}")
            return self._load_database(db_path)

        print("Creating new chapter database...")
        processor = PDFProcessor(pdf_path)
        pages = processor.load_pdf_document()
        # Join the raw pages with newlines: the heading pattern relies on
        # line starts, so the text must not be whitespace-collapsed yet.
        full_text = "\n".join(page.page_content for page in pages)

        extractor = ChapterExtractor()
        chapters = extractor.extract_chapters_from_text(full_text)
        if not chapters:
            raise ValueError(f"No chapters could be extracted from {pdf_path}")

        rule = '=' * 60
        chapter_documents = []
        for chapter in chapters:
            # Chapter text cleanup: collapse line breaks and repeated spaces.
            content = re.sub(r'\n+', ' ', chapter['content'])
            content = re.sub(r'\s+', ' ', content).strip()
            # Chapter formatting: banner, blank line, then the chapter text.
            formatted_content = (
                f"{rule}\n"
                f"CHAPTER {chapter['number']}: {chapter['title']}\n"
                f"{rule}\n"
                f"\n"
                f"{content}"
            )

            doc = Document(
                page_content=formatted_content,
                metadata={
                    "chapter_number": chapter['number'],
                    "chapter_roman": chapter['roman'],
                    "chapter_title": chapter['title'],
                    "start_pos": chapter['start_pos'],
                    "end_pos": chapter['end_pos'],
                    "document_type": "chapter",
                    "source": "position_based"
                }
            )
            chapter_documents.append(doc)
            print(f"Processed Chapter {chapter['number']}: {chapter['title']}")

        vector_db = self._build_and_save(chapter_documents, db_path)
        print(f"Chapter database saved to {db_path}")
        return vector_db

## Haupt-QA-System-Klasse

In [13]:

class BookQASystem:
    """Answer questions about the book.

    A question is classified by keywords, matching documents are retrieved
    from the chapter index or the chunk index, and the language model answers
    with a prompt chosen for the question type.
    """

    def __init__(self, chunks_db: FAISS, chapters_db: FAISS, llm):
        """Store the two vector stores and the model, then build the prompts.

        Args:
            chunks_db: Index searched for summary, character and general questions.
            chapters_db: Index searched for chapter questions.
            llm: Chat model piped after the prompt in every chain.
        """
        self.chunks_db = chunks_db
        self.chapters_db = chapters_db
        self.llm = llm
        self.output_parser = StrOutputParser()

        self._setup_prompt_templates()

    def _setup_prompt_templates(self):
        """Create one prompt template per question type."""
        self.general_prompt = ChatPromptTemplate.from_template("""
        You are a literary analysis expert. Use the provided book information to answer the question.
        If the information doesn't contain the answer, state that you don't know.
        
        Book Information:
        {context}
        
        Question: {question}
        
        Provide a clear and detailed answer:
        """)

        self.chapter_prompt = ChatPromptTemplate.from_template("""
        As a chapter analysis specialist, use the following chapter information to answer the question.
        
        Chapter Content:
        {context}
        
        Question about the chapter: {question}
        
        Provide a detailed explanation of events, characters, and key developments:
        """)

        self.summary_prompt = ChatPromptTemplate.from_template("""
        Create a summary based on the following book content:
        
        {context}
        
        Question: {question}
        
        Provide a concise and informative summary:
        """)

        # The closing instruction "Provide detailed character analysis:" is
        # currently not part of this template.
        self.character_prompt = ChatPromptTemplate.from_template("""
        Analyze character information from the book:
        
        {context}
        
        Character Question: {question}
        
        
        """)

    def _classify_question_type(self, question: str) -> str:
        """Classify ``question`` by keyword.

        Checks run in the order chapter, summary, character; the first hit
        wins and anything else is "general".

        Returns:
            One of "chapter", "summary", "character" or "general".
        """
        question_lower = question.lower()

        # 'kapitel' is the German word for chapter; the earlier spelling
        # 'kapital' is still accepted so existing inputs behave as before.
        keyword_groups = (
            ("chapter", ('chapter', 'kapitel', 'kapital')),
            ("summary", ('summarize', 'summary', 'overview')),
            ("character", ('character', 'who is', 'person')),
        )
        for question_type, keywords in keyword_groups:
            if any(keyword in question_lower for keyword in keywords):
                return question_type
        return "general"

    def _extract_chapter_number(self, question: str) -> Optional[int]:
        """Return the chapter number named in ``question``, if any.

        Only a numeral that directly follows the word "chapter" (or its
        German form) is considered, so ordinary words made of Roman letters
        such as "did" or "I" elsewhere in the question are not mistaken for a
        chapter reference. Arabic digits ("chapter 3") and the Roman numerals
        listed in ``ChapterExtractor.ROMAN_NUMERAL_MAP`` are understood.

        Returns:
            The chapter number, or None when the question names no chapter
            that can be resolved.
        """
        reference = re.search(
            r'\b(?:chapter|kapitel|kapital)\s+(\d+|[IVXLCDM]+)\b',
            question,
            re.IGNORECASE,
        )
        if reference is None:
            return None

        numeral = reference.group(1)
        if numeral.isdigit():
            return int(numeral)
        # Unknown numerals give None (search all chapters) rather than a
        # silent default to chapter 1.
        return ChapterExtractor.ROMAN_NUMERAL_MAP.get(numeral.upper())

    def _search_chapters(self, question: str, chapter_number: Optional[int] = None) -> List[Document]:
        """Search the chapter index, restricted to one chapter when given.

        Args:
            question: Query text.
            chapter_number: Chapter to filter on; a falsy value searches all
                chapters.
        """
        if not chapter_number:
            return self.chapters_db.similarity_search(question, k=5)

        # The metadata filter is applied to the ``fetch_k`` nearest candidates
        # (20 by default), so the requested chapter could be filtered out
        # before it is ever considered. Fetch the whole index instead.
        candidate_count = max(20, self.chapters_db.index.ntotal)
        return self.chapters_db.similarity_search(
            question,
            k=3,
            filter={"chapter_number": chapter_number},
            fetch_k=candidate_count,
        )

    def _search_chunks(self, question: str) -> List[Document]:
        """Return the 8 chunks most similar to ``question``."""
        return self.chunks_db.similarity_search(question, k=8)

    def _format_context(self, documents: List[Document]) -> str:
        """Render ``documents`` as numbered excerpts for the prompt context.

        Each excerpt is labelled with its chapter number when the metadata has
        one, otherwise with its page when available.
        """
        context_parts = []
        for i, doc in enumerate(documents, 1):
            source_info = ""
            if 'chapter_number' in doc.metadata:
                source_info = f" [Chapter {doc.metadata['chapter_number']}]"
            elif 'page' in doc.metadata:
                source_info = f" [Page {doc.metadata['page']}]"

            context_parts.append(f"Excerpt {i}{source_info}:\n{doc.page_content}\n")

        return "\n".join(context_parts)

    def answer_question(self, question: str) -> str:
        """Answer ``question`` using retrieval plus the language model.

        Args:
            question: The user's question.

        Returns:
            The model's answer as parsed by the string output parser.
        """
        print(f"Processing question: {question}")

        question_type = self._classify_question_type(question)
        print(f"Detected question type: {question_type}")

        # Each branch only selects the documents and the prompt; building the
        # context and running the chain is shared below.
        if question_type == "chapter":
            chapter_num = self._extract_chapter_number(question)
            print(f"Searching in Chapter {chapter_num if chapter_num else 'all chapters'}")
            docs = self._search_chapters(question, chapter_num)
            prompt = self.chapter_prompt
        elif question_type == "summary":
            print("Gathering content for summary...")
            docs = self._search_chunks(question)
            prompt = self.summary_prompt
        elif question_type == "character":
            print("Searching for character information...")
            docs = self._search_chunks(question)
            prompt = self.character_prompt
        else:  # general
            print("Searching for general information...")
            docs = self._search_chunks(question)
            prompt = self.general_prompt

        context = self._format_context(docs)
        chain = prompt | self.llm | self.output_parser
        return chain.invoke({"context": context, "question": question})


## Systeminitialisierungsfunktion

In [15]:
def initialize_system():
    """Assemble and return a ready-to-use BookQASystem.

    Creates the embedding model, obtains both vector stores through a
    VectorStoreManager (chunk store first, then chapter store), creates the
    chat model and wires everything into a BookQASystem.
    """
    print("Initializing Book QA System...")

    # One embedding model instance serves both vector stores.
    manager = VectorStoreManager(
        HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
    )

    # Load the saved indexes or build them from the PDF.
    chunk_store = manager.create_or_load_chunk_database(PDF_FILE, CHUNKS_DB_DIR)
    chapter_store = manager.create_or_load_chapter_database(PDF_FILE, CHAPTERS_DB_DIR)

    # The chat model is created last, directly where it is handed over.
    system = BookQASystem(
        chunk_store,
        chapter_store,
        ChatOllama(model="llama3.2", temperature=0.2),
    )
    print("QA System initialization complete!")
    return system

## Systemtest-Funktion

In [17]:
def run_system_tests(qa_system: BookQASystem):
    """Ask a fixed list of sample questions and print each answer.

    A failure on one question is printed and does not stop the remaining
    questions.

    Args:
        qa_system: The system whose ``answer_question`` is exercised.
    """
    sample_questions = (
        "Who is the author of the book?",
        "What are the names of the goats in the story?",
        "Summarize the events of Chapter II",
        "What happened in CHAPTER III?",
        "Who is Peter?",
        # Currently disabled: "Describe the main characters"
    )
    separator = "-" * 50

    print("Running System Tests")

    for number, sample in enumerate(sample_questions, start=1):
        print(f"\nTest {number}: {sample}")
        try:
            reply = qa_system.answer_question(sample)
            print(f"Answer: {reply}")
        except Exception as error:
            print(f"Error processing question: {error}")
        # Printed after every question, whether it succeeded or failed.
        print(separator)

## Hauptfunktion

In [19]:
def main():
    """Initialise the QA system and run the sample questions.

    Returns:
        0 when everything ran, 1 when an exception reached this function.
    """
    exit_code = 0
    try:
        system = initialize_system()

        print("\nStarting system validation tests...")
        run_system_tests(system)

    except Exception as error:
        print(f"System initialization failed: {error}")
        exit_code = 1

    return exit_code

if __name__ == "__main__":
    # The built-in ``exit`` is added by the ``site`` module for interactive
    # sessions and is not guaranteed to exist in every interpreter setup.
    # Raising SystemExit passes main()'s return value on as the exit status,
    # exactly as sys.exit would.
    raise SystemExit(main())

Initializing Book QA System...
Loading existing chunk database from heidi_faiss_chunks
Loading existing chapter database from heidi_faiss_chapters
QA System initialization complete!

Starting system validation tests...
Running System Tests

Test 1: Who is the author of the book?
Processing question: Who is the author of the book?
Detected question type: character
Searching for character information...
Answer: Based on Excerpt 4 [Page 1], it appears that the author of the book "Heidi" is Madame Spyri. The excerpt mentions her as the authoress, listing some of her other notable works.

Additionally, Excerpt 5 [Page 1] also confirms that Madame Spyri is the author of "Heidi", stating that she had a peculiar skill in writing simple histories for children and that her book has been a favorite among younger readers in Germany and America.
--------------------------------------------------

Test 2: What are the names of the goats in the story?
Processing question: What are the names of the go