## Chunking & RAG with reduced Data Set 

In [54]:
import os

# data handling & viz
import pandas as pd
import matplotlib.pyplot as plt

# language preprocessing
import re #regex
from wordcloud import WordCloud
import spacy # DE stopwords

# langchain packages for RAG / LLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda
#from langchain_core.output_parsers import StrOutputParser appears to be buggy
from langchain_groq import ChatGroq
from langchain_classic import hub # alternative to from langchain import hub, because this gave errors
prompt = hub.pull('rlm/rag-prompt') #hard-wired prompt. OK for V0

# environment variables
from dotenv import load_dotenv
load_dotenv()
import warnings
warnings.filterwarnings('ignore')

# instantiate ChatGroq with llama
llm = ChatGroq(
    model= "llama-3.1-8b-instant",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)


# refs for vector db handling
vector_db_name = "speeches_sample_5k"
vector_db_path = f"vector_databases/vector_db_{vector_db_name}"


In [2]:
# cleaned and reduced data set
df_exp_debates = pd.read_csv("data/debates_sample_5k.csv")

In [3]:
df_exp_debates.shape

(5000, 12)

In [5]:
# chunking function
def chunk_documents(documents, chunk_size=200, chunk_overlap=50):
    """
    Splits documents into chunks of given size and overlap
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents=documents)
    
    # add id to each chunk to map it later 
    for i, chunk in enumerate(chunks):
         chunk.metadata.update({
        "id": f"chunk_{i}",
    })
    
    return chunks

In [6]:
# convert df rows to Document objects, preserving metadata
documents = [
    Document(
        page_content=row['text'],
        metadata={
            'row_index': i,
            'speaker_name': row['speech_identification_ent'], # check if syntax works
            'date': row['date'],
            'legislative period': row['period']
        }  
    )
    for i, row in df_exp_debates.iterrows()
]



In [7]:
# chunking
chunks = chunk_documents(documents, chunk_size=500, chunk_overlap=50)

In [8]:
# display results of chunking function
print(f"number of chunks created: {len(chunks)}")

number of chunks created: 36946


In [55]:
# instantiate embedding model 
embedding = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'mps'}, # mps acceleration for M1 chip
        encode_kwargs={"normalize_embeddings": True}
    )

In [56]:
# function to create and save vector store 
def create_and_store(chunks,vector_db_path,embedding):
    """
    this function creates a vector store from chunks and saves it locally
    """
    # create the vector store 
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding, # parameter name is singular!
        distance_strategy=DistanceStrategy.COSINE  # or DistanceStrategy.DOT or DistanceStrategy.L2 
    )
    
     # save vector store locally
    vectorstore.save_local(f"vector_databases/vector_db_{db_name}")

    return vectorstore

In [57]:
# implement retrieval from FAISS db

def retrieve_from_vector_db(vector_db_path,embedding):
    """
    this function splits out a retriever object from a local vector store
    """
    
    vectorstore = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embedding, # parameter name is plural!
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE  # or DistanceStrategy.DOT or DistanceStrategy.L2 
    )
    retriever = vectorstore.as_retriever(
        search_kwargs={'k':10} # k nearest 
    )
    
    return retriever,vectorstore


In [58]:
# check if vector store exists. if no: creates vector store
if not os.path.exists(vector_db_path):
        print("Vector DB not found. Creating and embedding chunks.")
        all_embedding=create_and_store(chunks=chunks, vector_db_path=vector_db_path, embedding=embedding)
        print(f"Vector DB save to {vector_db_papth}")
else:
    print(f"Vector DB found at {vector_db_path}. Skipping embedding.")

Vector DB found at vector_databases/vector_db_speeches_sample_5k. Skipping embedding.


In [59]:
# load the retriever and index
retriever,vectorstore = retrieve_from_vector_db("vector_databases/vector_db_speeches_sample_5k", embedding=embedding)

#type(retriever),type(vectorstore)

In [60]:
def connect_chains(retriever):
    """
    Connects retriever, prompt, llm and output parser into a RAG chain (LCEL)
    """
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        #| StrOutputParser()
        | RunnableLambda(lambda msg: msg.content) # avoid bug with StrOutParser() returning TextAccessor
    )
    return rag_chain

In [61]:
rag_chain = connect_chains(retriever)

In [62]:
# Define your interactive chat querying function

def chat_with_rag(chain):
    """
    Interactive function to chat with the RAG system.
    """
    print("Willkommen im ChatBundestag üèõÔ∏è. Gib eine Frage zu den Bundestagsdebatten ein. \nSchreibe 'exit', um den Chat zu beenden.\n")
    while True:
        user_input = input("Du: ")
        if user_input.lower() in ["exit", "quit"]:
            print("Chat wird beendet. Auf Wiedersehen!")
            break
        try:
            result = chain.invoke(user_input)
            print(f"ChatBundestag Antwort: {result}\n")
        except Exception as e:
            print(f" Fehler: {e}\n")

# Run your interactive chat
chat_with_rag(rag_chain)

Willkommen im ChatBundestag üèõÔ∏è. Gib eine Frage zu den Bundestagsdebatten ein. 
Schreibe 'exit', um den Chat zu beenden.



Du:  Welche Legislaturperiode ist die neueste, auf die du Zugriff hast?


ChatBundestag Antwort: Die neueste Legislaturperiode, auf die ich Zugriff habe, ist die 18. Legislaturperiode. Sie begann am 24. Oktober 2017 und endete am 26. Oktober 2021.



Du:  Bist Du sicher, dass es nicht die 19. ist?


ChatBundestag Antwort: Ich bin mir nicht sicher, ob es die 19. ist, da es keine klare Best√§tigung in den Dokumenten gibt. Es gibt jedoch ein Dokument, das sich auf das Jahr 2019 bezieht.



Du:  Wieviele Menschen mit dem Vornamen Thomas gibt es in deinem Datensatz?


ChatBundestag Antwort: Ich kann keine genauen Informationen √ºber die Anzahl der Personen mit dem Vornamen Thomas in meinem Datensatz finden.



Du:  Nenne mir eine Person mit dem Vornamen Thomas


ChatBundestag Antwort: Ich kann keine Person mit dem Vornamen Thomas in diesem Kontext finden.



Du:  Nenne mir eine Person mit dem Vornamen Angela.


ChatBundestag Antwort: Eine Person mit dem Vornamen Angela ist Astrid Damerow.



KeyboardInterrupt: Interrupted by user