In [1]:
import os
from tqdm import tqdm
import requests
import chromadb
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
import logging

In [2]:
# Paths
# Both locations can be overridden through environment variables so the
# notebook is not tied to one machine; the defaults are the original
# hard-coded locations, so behavior is unchanged when the variables are unset.
BASE_PATH = os.environ.get(
    "RAG_BASE_PATH",
    "C:/Users/linus/Universitaet/LAI/Praktikum/Textgrundlage/Archiv/Archiv2",
)
CHROMA_PATH = os.environ.get("RAG_CHROMA_PATH", "C:/Users/linus/chroma_db2")

In [3]:
# Configure logging: the root logger reports INFO and above.
logging.basicConfig(level=logging.INFO)

In [4]:
# 1. Load PDFs and extract documents
def load_documents_from_folder(folder_path):
    """Load documents from ``folder_path`` via ``PyPDFDirectoryLoader``.

    Args:
        folder_path: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call.
    """
    return PyPDFDirectoryLoader(folder_path).load()

In [5]:
# 2. Split the documents into chunks
def split_documents(documents, chunk_size=800, chunk_overlap=80):
    """Split ``documents`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter. Progress and the chunk count
        are printed to stdout.
    """
    print("Splitting documents into smaller chunks...")
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)
    print(f"Number of chunks: {len(pieces)}")
    return pieces

In [6]:
# 3. Generate embeddings with Ollama
def generate_embeddings_with_ollama(
    texts,
    model_name="deepseek-r1:1.5b",
    base_url="http://localhost:11434",
    timeout=120,
):
    """Request one embedding per text from an Ollama server.

    Args:
        texts: Iterable of strings. Each one is sent as the ``prompt`` of its
            own POST request to ``{base_url}/api/embeddings``.
        model_name: Name of the Ollama model asked for the embedding.
        base_url: Root URL of the Ollama server.
        timeout: Timeout in seconds handed to ``requests.post``, so a stalled
            server raises instead of blocking forever.

    Returns:
        list: The embeddings, in the same order as ``texts``.

    Raises:
        RuntimeError: If the server answers with a non-200 status or the
            response carries no (or an empty) ``embedding``.
    """
    endpoint = f"{base_url.rstrip('/')}/api/embeddings"
    embeddings = []
    for text in texts:
        response = requests.post(
            endpoint,
            json={"model": model_name, "prompt": text},
            timeout=timeout,
        )
        if response.status_code != 200:
            raise RuntimeError(f"Failed to generate embedding: {response.text}")

        embedding = response.json().get("embedding")
        # Fail here rather than handing an empty vector to the vector store,
        # where the cause would be much harder to trace.
        if not embedding:
            raise RuntimeError(
                f"Failed to generate embedding: empty embedding returned by model '{model_name}'"
            )
        embeddings.append(embedding)
    return embeddings

In [7]:

# 4. Create embeddings and store them in Chroma
def embed_and_store(chunks, chroma_path, model_name="deepseek-r1:1.5b", batch_size=64):
    """Embed document chunks and add them to the ``pdf_documents`` collection.

    Chunks are handled in batches: each batch is embedded and added to the
    collection before the next one starts, so batches that were already
    added are not lost when a long run is interrupted.

    Args:
        chunks: Iterable of objects exposing ``page_content`` (the text) and
            ``metadata`` (a mapping read for the ``'source'`` and ``'page'``
            keys).
        chroma_path: Directory passed to ``chromadb.PersistentClient``.
        model_name: Model name forwarded to ``generate_embeddings_with_ollama``.
        batch_size: Number of chunks embedded and stored per step; values
            below 1 are treated as 1.

    Returns:
        None. Progress messages are printed to stdout.
    """
    print("Initializing embedding model...")

    # Create the Chroma client and collection
    print("Connecting to Chroma database...")
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    collection = chroma_client.get_or_create_collection(name="pdf_documents")

    # Create the embeddings
    print("Creating embeddings...")
    chunks = list(chunks)
    batch_size = max(1, int(batch_size))

    # A page usually yields several chunks, so "<file>_page_<n>" alone is not
    # a unique id. A running per-page counter makes every id distinct.
    # NOTE(review): ids are built from the file's base name only, so files
    # with the same name in different folders would still collide - confirm
    # whether that can happen in the archive.
    chunks_seen_per_page = {}

    batch_starts = range(0, len(chunks), batch_size)
    for start in tqdm(batch_starts, desc="Embedding chunks", unit="batch", leave=False):
        ids, documents, metadatas = [], [], []
        for chunk in chunks[start:start + batch_size]:
            # Extract the document name and page number from the metadata
            source_filename = os.path.basename(chunk.metadata.get('source', 'unknown'))
            page_number = chunk.metadata.get('page', 0)

            page_key = (source_filename, page_number)
            chunk_index = chunks_seen_per_page.get(page_key, 0)
            chunks_seen_per_page[page_key] = chunk_index + 1

            ids.append(f"{source_filename}_page_{page_number}_chunk_{chunk_index}")
            documents.append(chunk.page_content)
            metadatas.append({
                "source": source_filename,
                "page": page_number
            })

        embeddings = generate_embeddings_with_ollama(documents, model_name)
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
        )

    print("Embeddings successfully stored in Chroma DB!")

In [8]:
# 5. Process all folders
def process_all_folders(base_path=None, chroma_path=None, first_folder=1, last_folder=105):
    """Load, split, embed and store the PDFs of every numbered sub-folder.

    For each number in ``first_folder..last_folder`` (inclusive) the folder
    ``<base_path>/<number>`` is processed. A missing folder is logged as a
    warning. An exception raised while processing one folder is logged with
    its traceback and does not stop the remaining folders.

    Args:
        base_path: Directory holding the numbered folders; defaults to the
            module-level ``BASE_PATH``.
        chroma_path: Target directory passed to ``embed_and_store``; defaults
            to the module-level ``CHROMA_PATH``.
        first_folder: First folder number to process.
        last_folder: Last folder number to process (inclusive).
    """
    # Resolve defaults at call time so later changes to the constants apply.
    if base_path is None:
        base_path = BASE_PATH
    if chroma_path is None:
        chroma_path = CHROMA_PATH

    folder_numbers = range(first_folder, last_folder + 1)
    for folder_number in tqdm(folder_numbers, desc="Processing folders", unit="folder"):
        folder_path = os.path.join(base_path, str(folder_number))

        if not os.path.isdir(folder_path):
            logging.warning(f"Folder {folder_number} does not exist.")
            continue

        try:
            # Load the documents
            documents = load_documents_from_folder(folder_path)

            # Split the documents into chunks
            chunks = split_documents(documents)

            # Create and store the embeddings
            embed_and_store(chunks, chroma_path)
        except Exception as error:
            # logging.exception keeps the traceback, which the plain message
            # alone does not provide.
            logging.exception("Error processing folder %s: %s", folder_number, error)

In [9]:

# 6. Prompt template
# query_rag() fills {context} with the retrieved document texts and
# {question} with the user's query before sending the prompt to the model.
PROMPT_TEMPLATE = """
Context: 
{context}

Question: {question}

Provide a detailed and precise answer based strictly on the provided context. Include all relevant information and explain the key points comprehensively. If no relevant information is found, say "I cannot find specific information about this in the given context."
"""

In [10]:
# 7. Answer generation with Ollama
def generate_answer_with_ollama(
    prompt,
    model_name="deepseek-r1:14b",
    base_url="http://localhost:11434",
    timeout=600,
):
    """Send ``prompt`` to an Ollama server and return the generated text.

    Args:
        prompt: Full prompt text sent to ``{base_url}/api/generate``.
        model_name: Name of the Ollama model used for generation.
        base_url: Root URL of the Ollama server.
        timeout: Timeout in seconds handed to ``requests.post``, so a stalled
            server raises instead of blocking forever.

    Returns:
        str: The ``response`` field of the server's JSON reply, or ``""`` if
        the field is missing.

    Raises:
        RuntimeError: If the server answers with a non-200 status.
    """
    response = requests.post(
        f"{base_url.rstrip('/')}/api/generate",
        # /api/generate streams newline-delimited JSON objects unless
        # streaming is disabled; response.json() below needs a single object.
        json={"model": model_name, "prompt": prompt, "stream": False},
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"Failed to generate answer: {response.text}")
    return response.json().get("response", "")

In [11]:
# 8. RAG query
def query_rag(query_text: str, top_k: int = 5):
    """Answer ``query_text`` using the ``pdf_documents`` Chroma collection.

    The query is embedded, the ``top_k`` nearest entries are fetched from the
    collection, their texts are inserted into ``PROMPT_TEMPLATE`` together
    with the question, and the prompt is sent to the Ollama model.

    Args:
        query_text: The question to answer.
        top_k: Number of results requested from the collection.

    Returns:
        The generated answer text. The answer and the ids of the retrieved
        entries are also printed.
    """
    # Open the collection in the persistent Chroma store
    collection = chromadb.PersistentClient(path=CHROMA_PATH).get_collection(
        name="pdf_documents"
    )

    # Embed the query and run the similarity search
    query_vector = generate_embeddings_with_ollama([query_text])[0]
    hits = collection.query(query_embeddings=[query_vector], n_results=top_k)

    # Build the context from the retrieved document texts
    retrieved_texts = hits['documents'][0]
    context_text = "\n\n---\n\n".join(retrieved_texts)

    # Fill the prompt template and call the LLM through Ollama
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text,
        question=query_text,
    )
    answer = generate_answer_with_ollama(prompt)

    # Report the answer together with the ids of the retrieved entries
    source_ids = hits['ids'][0]
    print(f"Response: {answer}\n\nSources: {source_ids}")
    return answer


# Main program
# Entry point: when executed as the main module, run the ingestion over all
# folders. query_rag() is not called here.
if __name__ == "__main__":
    process_all_folders()

Processing folders:   0%|          | 0/105 [00:00<?, ?folder/s]INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Splitting documents into smaller chunks...
Number of chunks: 2262
Initializing embedding model...
Connecting to Chroma database...
Creating embeddings...


Processing folders:   0%|          | 0/105 [39:36<?, ?folder/s]


KeyboardInterrupt: 