## Chunking & RAG with the Reduced Data Set

In [4]:
# data handling & viz
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# language preprocessing
import re #regex
from wordcloud import WordCloud
import spacy # DE stopwords

# langchain packages
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy

# environment variables
load_dotenv()
import warnings
warnings.filterwarnings('ignore')


In [5]:
# Load the cleaned and reduced debates data set into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df_exp_debates = pd.read_csv("data/debates_2017_2021.csv")

In [6]:
# Sanity check: (number of rows, number of columns) of the loaded data set
df_exp_debates.shape

(26902, 12)

In [None]:
# get 'text' from df_exp_debates to be used by chunking function 

In [None]:
# chunking function
# todo: hyperparameters = chunk_size and chunk_overlap
def chunk_documents(documents, chunk_size=200, chunk_overlap=50):
    """
    Splits documents into chunks of given size and overlap

    Args:
        documents: documents to split; they are passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` handed to the text splitter.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.

    Returns:
        The list of chunks. Every chunk gets an ``"id"`` metadata entry of
        the form ``chunk_<index>`` (index = position in the returned list).
    """
    # nothing to split: skip building the splitter for an empty list/tuple
    if isinstance(documents, (list, tuple)) and not documents:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(documents=documents)

    # add id to each chunk to map it later
    for i, chunk in enumerate(chunks):
        chunk.metadata["id"] = f"chunk_{i}"

    return chunks

In [None]:
# `docs` is not defined by any earlier cell, so build the LangChain documents
# from the DataFrame here: the 'text' column becomes the page content and the
# remaining columns are kept as metadata.
# NOTE(review): assumes the CSV has a 'text' column - confirm against the data.
from langchain_community.document_loaders import DataFrameLoader

docs = DataFrameLoader(df_exp_debates, page_content_column="text").load()
chunks = chunk_documents(docs)

In [None]:
# Display a summary of the chunking result.
# Only a short preview is printed: dumping the repr of every chunk would flood
# the notebook output, since the loaded data set has 26,902 rows to chunk.
preview_count = 5
print(f"number of chunks created: {len(chunks)}")
print(f"Type of the chunks : {type(chunks)}")
print()
print(f"first {preview_count} chunks:")
print(chunks[:preview_count])

In [None]:
# Example function for embeddings and storage
def embed_and_store(chunks, db_name):
    """
    Embeds the chunks, stores them in a FAISS vector store and saves it locally

    Args:
        chunks: documents to embed and index.
        db_name: suffix of the target folder; the store is saved to
            ``../vector_databases/vector_db_<db_name>`` (relative path).

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: if ``chunks`` is empty.
    """
    # materialise once so that any iterable is accepted and can be checked
    chunks = list(chunks)

    # fail early with a clear message, before the embedding model is loaded
    if not chunks:
        raise ValueError("chunks is empty: nothing to embed and store")

    # instantiate embedding model (embeddings are normalized on encode)
    embedding = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        encode_kwargs={"normalize_embeddings": True}
    )

    # create the vector store
    # other DistanceStrategy members include EUCLIDEAN_DISTANCE and
    # MAX_INNER_PRODUCT
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding,
        distance_strategy=DistanceStrategy.COSINE
    )

    # save VectorStore locally
    vectorstore.save_local(f"../vector_databases/vector_db_{db_name}")

    return vectorstore