### Libraries


In [29]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from pathlib import Path

### Split the doucment into Chunks & Store them in Vector Store

In [30]:
def enrich_chunks_with_metadata(chunks):
    enriched = []
    for chunk in chunks:
        meta = chunk.metadata.copy()
        source_path = Path(meta.get("source", ""))
        filename = source_path.name
        page_number = meta.get("page", -1)
        
        chunk.metadata.update({
            "filename": filename,
            "page_number": page_number
        })
        enriched.append(chunk)
    return enriched

In [31]:
def ingest():
    loader = DirectoryLoader(
        path="electoral_programs",
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True
    )
    pages = loader.load()
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    
    chunks = text_splitter.split_documents(pages)
    enriched_chunks = enrich_chunks_with_metadata(chunks)

    print(f"Split {len(pages)} documents into {len(enriched_chunks)} chunks.")
    
    embedding = FastEmbedEmbeddings()
    Chroma.from_documents(
        documents=enriched_chunks,
        embedding=embedding,
        persist_directory="./sql_chroma_db"
    )

In [32]:
# only run this once to generate vector store
ingest()

100%|██████████| 9/9 [00:47<00:00,  5.32s/it]


Split 1817 documents into 5081 chunks.


### Create a RAG chain that retreives relevent chunks and prepares a response

In [33]:
def rag_chain():
    model = ChatOllama(model="llama3")
    #
    prompt = PromptTemplate.from_template(
        """
        You are a helpful assistant. Use only the information in the context below to answer the question.

        If the context does not contain enough information, say:
        "No relevant context available to answer this question."

        Include the source (filename and page number) if relevant.

        Question: {input}

        Context:
        {context}

        Answer:
        """
    )
    #Load vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

    #Create chain
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 5,
            "score_threshold": 0.5,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    #
    return chain

In [34]:
def ask(query: str):
    #
    chain = rag_chain()
    # invoke chain
    result = chain.invoke({"input": query})
    # print results
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])
        print(f"→ From {doc.metadata['filename']} - Page {doc.metadata['page_number']}")
        print(doc.page_content[:200])

In [35]:
ask("Propostas para a saude do PS")

Propostas para a saúde do PS (Partido Socialista)?

Sim, há propostas para a saúde do PS presentes no contexto. Algumas dessas propostas incluem:

* Reforço do acesso a produtos de recolha menstrual através da sua distribuição gratuita em centros de saúde, escolas e outras instituições.
* Priorização da saúde preventiva, com promoção de hábitos de alimentação saudável e atividade física.
* Fortalecimento do papel das unidades de saúde, dotando-as com meios de diagnósticos e acompanhamento.

Essas propostas são apresentadas no contexto como parte da visão "Uma Só Saúde" do PS.
Source:  electoral_programs\Programa-Eleitoral-PAN_Legislativas25.pdf


KeyError: 'filename'