# Naive RAG example


To run this example you need a running PostgreSQL instance with the pgvector extension enabled.
There is a docker-compose file in the repository root, so you can simply run:

```bash
docker compose up -d 
```

In [None]:
# install required dependencies
! pip install --upgrade pip
#! pip install langchain_community tiktoken langchain-openai chromadb langchain unstructured "unstructured[pdf]" langchain_ollama langchain_postgres "psycopg2[binary]"
! pip install langchain
! pip install langchain_community 
! pip install langchain-openai 
#! pip install chromadb
! pip install unstructured "unstructured[pdf]"
#! pip install langchain_ollama 
! pip install "psycopg[binary]"
! pip install langchain_postgres 
! pip install pydantic

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): the OpenAI clients used later presumably read OPENAI_API_KEY
# from the environment - confirm your .env provides it.
# The value shown as cell output is load_dotenv()'s return value.
load_dotenv()


## Getting source documents

In [None]:

!rm -rf 'documents'
!mkdir -p 'documents'
!curl -L 'https://itau-fn8-fundosdocumentos.cloud.itau.com.br/52678_COMPE.pdf' -o 'documents/52678_COMPE.pdf'
!curl -L 'https://itau-fn8-fundosdocumentos.cloud.itau.com.br/55765_COMAG.pdf' -o 'documents/55765_COMAG.pdf'
 

## Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader

# Load every PDF found under ./documents (the glob also matches sub-folders).
loader = DirectoryLoader("./documents", glob="**/*.pdf")
docs = loader.load()

# Fail early with a clear message instead of silently indexing nothing when
# the download step did not produce any readable PDF.
assert docs, "No PDF documents were loaded from ./documents - check the download step"

# Split the documents into overlapping chunks (size 1000, overlap 200) so each
# chunk can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# A bare `len(docs)` in the middle of a cell is never displayed, so report the
# counts explicitly and preview only the first chunks instead of dumping all.
print(f"{len(docs)} documents -> {len(splits)} chunks")
splits[:3]

## Add metadata for filtering

In [None]:
from collections import Counter
from pathlib import PurePath

# Fund identifier for each source PDF, keyed by file name. Matching on the
# file name (not the full path) keeps the tagging independent of the OS path
# separator and of how the loader directory was written ("./documents", ...).
FUND_BY_FILENAME = {
    "55765_COMAG.pdf": "ITAU_FIC_FIM",
    "52678_COMPE.pdf": "DIFERENCIADO_CREDITO_PRIVADO_LONGO_PRAZO_RENDA_FIXA",
}

# Tag every chunk with the fund it belongs to; the 'fundo' key is what the
# retriever filters on later. Chunks from unknown files are left untagged.
for doc in splits:
    source_name = PurePath(doc.metadata.get("source") or "").name
    fundo = FUND_BY_FILENAME.get(source_name)
    if fundo is not None:
        doc.metadata["fundo"] = fundo

# Number of chunks per fund; a `None` key means some chunks were not tagged.
Counter(doc.metadata.get("fundo") for doc in splits)

## Create database (postgres)

In [None]:

import os
from urllib.parse import quote_plus

import psycopg
from psycopg import sql

# Connection settings. The defaults are the values this notebook has always
# used; each can be overridden through an environment variable (for example
# from the .env file) so credentials do not have to live in the notebook.
vector_db_name = "vector_db_rag"
admin_db_name = "postgres"
db_host = os.getenv("POSTGRES_HOST", "localhost")
db_user = os.getenv("POSTGRES_USER", "postgres")
db_password = os.getenv("POSTGRES_PASSWORD", "postgres")
db_port = os.getenv("POSTGRES_PORT", "5432")

# SQLAlchemy-style URL used by the vector store in a later cell. User and
# password are URL-quoted so special characters cannot break the URL.
connection_string = (
    f"postgresql+psycopg://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{vector_db_name}"
)

# Recreate the vector database from scratch through the admin database.
# autocommit is required because DROP/CREATE DATABASE cannot run inside a
# transaction; the `with` block closes the connection when it is done.
with psycopg.connect(
    dbname=admin_db_name,
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    autocommit=True,
) as conn:
    with conn.cursor() as c:
        # Terminate the other sessions connected to the vector database (never
        # our own, never those of other databases) so it can be dropped.
        c.execute(
            """
            SELECT pg_terminate_backend(pid)
            FROM pg_stat_activity
            WHERE pid <> pg_backend_pid()
              AND datname = %s
            """,
            (vector_db_name,),
        )

        # Database names are identifiers and cannot be passed as query
        # parameters, so they are composed with safe identifier quoting.
        db_identifier = sql.Identifier(vector_db_name)
        c.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(db_identifier))
        c.execute(sql.SQL("CREATE DATABASE {}").format(db_identifier))

## Generating and storing embeddings in the vector database

In [None]:
# vector store
from langchain_community.vectorstores import Chroma
from langchain_postgres import PGVector
#from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
#from sqlalchemy import make_url    

# Embedding model used both to index the chunks and to embed the queries.
# embedding_model = OllamaEmbeddings(model="all-minilm")  # local alternative
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# pre_delete_collection=True empties the collection before it is used, so
# re-running this cell does not append a second copy of every chunk.
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name="fundos_investimento",
    connection=connection_string,
    use_jsonb=True,
    pre_delete_collection=True,
)

# Embed every chunk and store it together with its metadata.
vector_store.add_documents(documents=splits)

# Unfiltered retriever over the whole collection.
retriever = vector_store.as_retriever()

In [None]:
# Display the store object's repr as a quick check that it was created.
vector_store

In [None]:
# Activate debug logging for the rest of the notebook.
from langchain_core.globals import set_debug, set_verbose
import logging

# Lower the root logger to DEBUG so records from every library are let through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Emits a test record. Per the standard library, the module-level
# logging.debug() also installs a default handler via basicConfig() when the
# root logger has none, which is what makes later records visible.
logging.debug("test")

# Turn on LangChain's global debug flag.
set_debug(True)
#set_verbose(False)

## Answering questions using RAG

In [None]:
# query the vector store
#from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Prompt

def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the prompt receives the Python repr of a list of
    Document objects (ids, metadata, escaped text) instead of plain text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


## filter by metadata
# Only chunks tagged with this fund are candidates for retrieval.
retriever = vector_store.as_retriever(search_kwargs={'filter': {'fundo':'ITAU_FIC_FIM'}})

# LLM
#llm = ChatOllama(model="llama3.1")
llm = ChatOpenAI(model="gpt-4.1")

# Prompt: {question} and {context} are filled in by the chain below.
system_prompt = """
     You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
     If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
     Question: {question} 
     Context: {context} 
     Answer:
     """

prompt_template = ChatPromptTemplate([("system", system_prompt)])

# Chain: the input question goes to the retriever (whose results are turned
# into plain text) and is also passed through unchanged as {question}.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

# Question
print(rag_chain.invoke("Qual o horário limite investir no fundo ?"))

---------------
### Bonus -> structured output challenge

In [None]:
## bonus - enrich query with a structured output
from typing import Literal

from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI


class SearchQuery(BaseModel):
    """Search to execute against a vector database."""

    # Free-text search extracted from the user's request.
    search: str = Field(description="The user search")
    # Restricted to the two fund identifiers used as 'fundo' metadata, so the
    # value can be used directly as a metadata filter without extra checks.
    fundo_investimento: Literal[
        "ITAU_FIC_FIM",
        "DIFERENCIADO_CREDITO_PRIVADO_LONGO_PRAZO_RENDA_FIXA",
    ] = Field(
        description="The investment fund name, "
        "there are only two ITAU_FIC_FIM or DIFERENCIADO_CREDITO_PRIVADO_LONGO_PRAZO_RENDA_FIXA "
    )


# Model whose answers are parsed into a SearchQuery instance.
llm_structured_output = ChatOpenAI(model="gpt-4o-mini")
structured_llm = llm_structured_output.with_structured_output(SearchQuery)

# Ask the model to turn the free-text user question into a SearchQuery.
user_query_prompt = "Please, convert the user query: Qual é o hórário limite do fundo fic fim ? "
response = structured_llm.invoke(user_query_prompt)

# Display the parsed result.
response