## Chunking & RAG with reduced Data Set 

In [28]:
# data handling & viz
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# language preprocessing
import re #regex
from wordcloud import WordCloud
import spacy # DE stopwords

# langchain packages for RAG / LLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda
#from langchain_core.output_parsers import StrOutputParser appears to be buggy
from langchain_groq import ChatGroq
from langchain_classic import hub
import warnings

# environment variables
# Load .env BEFORE the first client is created / the first network call is made,
# so that any credentials it defines are already present in os.environ.
# (presumably the Groq API key, and optionally a LangSmith key for the hub — verify against .env)
load_dotenv()

# Pull the shared RAG prompt template from the LangChain Hub (network call).
prompt = hub.pull('rlm/rag-prompt')

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from langchain; narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Chat model used as the generator of the RAG chain: Llama 3.1 8B ("instant") served by Groq.
# temperature=0 asks for the least random sampling; no explicit token limit or
# timeout is set here, and failed requests are retried up to two times.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)


In [2]:
# Load the cleaned, down-sampled debates table (path is relative to the notebook's working directory).
df_exp_debates = pd.read_csv(filepath_or_buffer="data/debates_sample_5k.csv")

In [3]:
# Quick sanity check of the loaded frame: (number of rows, number of columns).
df_exp_debates.shape

(5000, 12)

In [5]:
# chunking function
def chunk_documents(documents, chunk_size=200, chunk_overlap=50):
    """
    Split documents into overlapping chunks and label every chunk with an id.

    documents:     the Document objects to split
    chunk_size:    forwarded to RecursiveCharacterTextSplitter as the chunk size
    chunk_overlap: forwarded to RecursiveCharacterTextSplitter as the overlap
                   between neighbouring chunks

    Returns the chunks produced by the splitter. Each chunk's metadata gains an
    "id" entry of the form "chunk_<n>", where n counts the chunks across all
    documents in output order, so a chunk can be mapped back later.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(documents=documents)

    # running number over the whole result, not per source document
    for position, piece in enumerate(pieces):
        piece.metadata["id"] = f"chunk_{position}"

    return pieces

In [6]:
# Wrap every DataFrame row in a Document: the speech text becomes the page
# content, a few descriptive columns travel along as metadata so that each
# chunk can later be traced back to its speech.
documents = [
    Document(
        page_content=speech['text'],
        metadata={
            'row_index': row_label,  # index label of the row in df_exp_debates
            'speaker_name': speech['speech_identification_ent'],
            'date': speech['date'],
            'legislative period': speech['period'],
        },
    )
    for row_label, speech in df_exp_debates.iterrows()
]



In [7]:
# chunking
# Overrides the function defaults (chunk_size=200, chunk_overlap=50) with larger
# chunks of size 500 while keeping the overlap at 50.
chunks = chunk_documents(documents, chunk_size=500, chunk_overlap=50)

In [8]:
# display results of chunking function: how many chunks the splitter produced in total
print(f"number of chunks created: {len(chunks)}")

number of chunks created: 36946


In [9]:
# Embedding model for the indexing step. It is created once at notebook level
# and handed to embed_and_store(), so the function itself never loads a model.
# Embeddings are normalised, which matches the cosine distance used by the index.
# NOTE(review): 'mps' targets Apple-silicon GPUs; other machines presumably need 'cpu' or 'cuda'.
# NOTE(review): all-MiniLM-L6-v2 is primarily an English model while the corpus looks
# German — consider a multilingual model (index and query side must then use the same one).
embedding = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    encode_kwargs={"normalize_embeddings": True},
    model_kwargs={'device': 'mps'},
)

# function for embeddings and storage
def embed_and_store(chunks, db_name, embedding):
    """
    Embed the chunks, build a FAISS index from them and save it to disk.

    chunks:    the chunked Document objects to index
    db_name:   suffix of the target folder; the index is written to
               "vector_databases/vector_db_<db_name>"
    embedding: the embedding model used to vectorise the chunks

    Returns the in-memory FAISS vector store (cosine distance strategy).
    """
    store = FAISS.from_documents(
        documents=chunks,
        embedding=embedding,
        # alternatives: DistanceStrategy.DOT or DistanceStrategy.L2
        distance_strategy=DistanceStrategy.COSINE,
    )

    # persist the index so it can be reloaded without re-embedding
    target_dir = f"vector_databases/vector_db_{db_name}"
    store.save_local(target_dir)

    return store

In [10]:
# Embed all chunks and persist the index as vector_databases/vector_db_speeches_sample_5k.
# NOTE: despite its name, `all_embedding` holds the vector store returned by embed_and_store().
all_embedding = embed_and_store(
    chunks=chunks,
    db_name="speeches_sample_5k",
    embedding=embedding,
)

In [11]:
# Implement retrieval logic from your FAISS database

# Embedding model for the query side; retrieve_from_vector_db() reads this
# notebook-level `embeddings` variable. The settings mirror the model that built
# the index, and they have to: queries must be embedded exactly like the stored chunks.
# NOTE(review): 'mps' targets Apple-silicon GPUs; other machines presumably need 'cpu' or 'cuda'.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    encode_kwargs={"normalize_embeddings": True},
    model_kwargs={'device': 'mps'},
)

def retrieve_from_vector_db(vector_db_path, k=3):
    """
    Load a locally saved FAISS vector store and build a retriever on top of it.

    vector_db_path: folder that contains the saved vector store
    k:              number of nearest neighbours the retriever returns per
                    query (default 3, the previously hard-coded value)

    Returns a tuple (retriever, vectorstore).

    Raises ValueError if k is smaller than 1.

    Queries are embedded with the notebook-level `embeddings` model, so that
    variable has to exist before this function is called.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # SECURITY: allow_dangerous_deserialization=True permits loading pickled
    # data from the folder. Only point this at vector stores you created yourself.
    vectorstore = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE,  # or DistanceStrategy.DOT or DistanceStrategy.L2
    )
    retriever = vectorstore.as_retriever(
        search_kwargs={'k': k}  # k nearest neighbors
    )

    return retriever, vectorstore

# Load the retriever and index from the folder written by the indexing step,
# then show both types as a quick check that loading worked.
retriever, vectorstore = retrieve_from_vector_db(
    vector_db_path="vector_databases/vector_db_speeches_sample_5k"
)
type(retriever), type(vectorstore)

(langchain_core.vectorstores.base.VectorStoreRetriever,
 langchain_community.vectorstores.faiss.FAISS)

In [None]:
# Peek at a few stored chunks instead of dumping the whole docstore: with tens of
# thousands of chunks the full dict would flood the notebook output.
# NOTE: `_dict` is a private attribute of the docstore and may change between library versions.
dict(list(retriever.vectorstore.docstore._dict.items())[:3])

In [29]:
def connect_chains(retriever):
    """
    Assemble the RAG chain (LCEL): retriever -> prompt -> llm -> plain text.

    retriever: supplies the "context" for a question; the question itself is
               passed through unchanged

    Uses the notebook-level `prompt` and `llm`. The returned chain takes the
    question and yields the `content` attribute of the model's reply.
    """
    question_and_context = {"context": retriever, "question": RunnablePassthrough()}

    # Unwrap the reply by hand; StrOutputParser is deliberately not used because
    # it was observed to return a TextAccessor instead of a string.
    answer_text = RunnableLambda(lambda message: message.content)

    return question_and_context | prompt | llm | answer_text

In [30]:
# Build the RAG chain once from the loaded retriever; `rag_chain` is what the chat loop invokes.
rag_chain = connect_chains(retriever)

In [None]:
# Define your interactive chat querying function

def chat_with_rag(chain):
    """
    Interactive function to chat with the RAG system.

    Reads questions from standard input, passes each one to ``chain.invoke``
    and prints the answer. The loop ends when the user types "exit" or "quit"
    (case-insensitive, surrounding whitespace ignored) or when the input
    stream is closed or interrupted. Blank input is skipped without calling
    the chain. Errors raised while answering are printed and the chat goes on.

    chain: any object with an ``invoke(question)`` method

    Returns None.
    """
    farewell = "Chat wird beendet. Auf Wiedersehen!"

    print("Willkommen im ChatBundestag 🏛️. Gib eine Frage zu den Bundestagsdebatten ein. \nSchreibe 'exit', um den Chat zu beenden.\n")
    while True:
        try:
            user_input = input("Du: ")
        except (EOFError, KeyboardInterrupt):
            # stdin closed or the cell was interrupted: end the chat cleanly
            # instead of surfacing a traceback
            print(farewell)
            break

        question = user_input.strip()
        if question.lower() in ["exit", "quit"]:
            print(farewell)
            break
        if not question:
            # nothing to ask: do not spend a retrieval + LLM call on an empty prompt
            continue

        try:
            result = chain.invoke(question)
            print(f"ChatBundestag Antwort: {result}\n")
        except Exception as e:
            # deliberate best effort: report the failure and keep the session alive
            print(f" Fehler: {e}\n")

# Run your interactive chat
# This call blocks on input() and keeps the cell running until the user types 'exit' or 'quit'.
chat_with_rag(rag_chain)