In [44]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore


In [None]:
import os
from getpass import getpass

# Use the key already present in the environment when there is one; otherwise
# prompt for it interactively. This keeps the secret out of the notebook and
# lets the cell run on a fresh kernel without relying on a variable defined
# elsewhere.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [31]:
# Model configuration: one model embeds the text chunks, the other writes the answers.
EMBEDDING_MODEL = "text-embedding-3-small"
CHAT_MODEL = "gpt-3.5-turbo"
MAX_ANSWER_TOKENS = 500  # upper bound passed to the chat model as max_tokens

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
llm = ChatOpenAI(
    model_name=CHAT_MODEL,
    max_tokens=MAX_ANSWER_TOKENS,
)

In [32]:
# Source PDF, given as a relative file path.
pdf_link = "DOC-SF238339076816-20230503.pdf"

# Text-only extraction: image extraction is switched off.
loader = PyPDFLoader(file_path=pdf_link, extract_images=False)

# Read the PDF and split it into Document chunks using the loader's default splitter.
pages = loader.load_and_split()

In [33]:
# Number of Document chunks returned by loader.load_and_split().
len(pages)

31

In [34]:
# Splitters
#
# The child splitter produces the small chunks handed to the retriever for
# indexing; the parent splitter produces the larger chunks the retriever keeps
# in its docstore.

# chunk_overlap is set explicitly: the splitter's default overlap is 200
# characters, which would equal the child chunk_size and turn neighbouring
# child chunks into near-duplicates of each other (more embeddings, no gain).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=200,
    length_function=len,   # chunk sizes are measured with len(), i.e. in characters
    add_start_index=True,  # ask the splitter to record each chunk's start offset
)

In [35]:
# Storages

CHILD_DB_DIR = "childVectorDB"

# Parent chunks are held in memory only, so they are rebuilt on every run.
store = InMemoryStore()

# Child-chunk embeddings are persisted on disk. Because the parent store above
# is not persisted, children left over from an earlier run reference parent ids
# that no longer exist: they would pile up as duplicates and crowd valid hits
# out of the similarity search. Drop the stale collection, then open a clean one.
Chroma(embedding_function=embeddings, persist_directory=CHILD_DB_DIR).delete_collection()
vectorstore = Chroma(embedding_function=embeddings, persist_directory=CHILD_DB_DIR)

In [36]:
# Wire the retriever together from the two stores and the two splitters
# defined in the previous cells.
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Index the loaded PDF chunks; no explicit ids are supplied.
parent_document_retriever.add_documents(documents=pages, ids=None)

In [37]:
# Inspect the raw records held in the retriever's vector store
# (called with no filter or limit arguments).
parent_document_retriever.vectorstore.get()

{'ids': ['f108b50b-d986-4805-aa18-5c491fb91935',
  'fa2c4f4c-a648-4dc8-ba2a-df3a7e6c32e3',
  '2c4c1fe1-b932-4f4b-9d81-7cca00cd6567',
  '9a12e259-7b65-4a09-b8a5-b79269e73d3e',
  '2e9e0734-62e1-4f82-a2ad-fb1fa34603ed',
  '63cd881d-fb70-4501-8a20-ad346adf853f',
  'abeb0df6-5e65-494f-a57d-4bc5a3d478d7',
  '52866a23-3869-4b17-9e69-38a404802679',
  'c950028b-95ff-45bc-ab41-262e7b146ca9',
  '3bcf5343-6760-4c9d-a6ea-857b705c7daa',
  '339f5e01-0751-47a1-83fa-dd961aca6978',
  '25fa6e72-02a9-4a36-becb-703b7c0ae640',
  '820c925d-8cb1-48be-81db-d9d50da9f595',
  '5440840c-4500-4127-a42e-a1ddbe5a75c7',
  'aca86cb3-098d-4e52-b82f-28c93d6e9830',
  '6206c0c8-c49e-42a7-acf2-87edf166cab6',
  '3ce85e50-9c43-432f-bbdf-b27dbffaeff3',
  '22ddec32-de7f-4e1e-8abc-be5a3420fb05',
  '84ee99af-6da1-4cec-bbd0-287870604217',
  '7bc7ecdb-91c5-49a3-a5e7-d08a5bee707b',
  '4de61a23-e3d3-41cb-987e-902135a0b5bb',
  '66757292-3f44-479c-bcf5-a54d92cf5a5b',
  '95bdb499-ae3a-43a5-8e71-d16afc3f590a',
  '240ea414-cd8f-4f57-acb3-

In [None]:
# Prompt text with two placeholders: {question} for the user's query and
# {context} for the retrieved material.
TEMPLATE = """
You are an expert in legislation and technology. Answer the question below using the provided context.

Query:
{question}

Context:
{context}
"""

# Build the chat prompt from the template string above.
rag_prompt = ChatPromptTemplate.from_template(template=TEMPLATE)

In [45]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The incoming question is passed through unchanged and, in parallel, sent to
# the retriever. Retrieved documents are flattened to plain text so the prompt
# receives their content instead of the repr of a list of Document objects.
setup_retriever = RunnableParallel(
    {
        "question": RunnablePassthrough(),
        "context": parent_document_retriever | format_docs,
    }
)

# Converts the chat model's message output into a plain string.
output_parser = StrOutputParser()

In [46]:
# Full pipeline: gather question + context -> fill the prompt -> call the chat
# model -> parse the reply to a string.
parent_chain_retrievel = setup_retriever.pipe(rag_prompt, llm, output_parser)

In [None]:
# Run the chain on a single question; the result is displayed as the cell output.
question = "What are the main risks of the AI legal framework?"
parent_chain_retrievel.invoke(question)


'Os principais riscos do marco legal de IA podem estar relacionados à dificuldade de implementação e fiscalização das normas estabelecidas, à falta de harmonização entre a proteção de direitos fundamentais e a inovação tecnológica, bem como à necessidade de calibrar a regulação de acordo com os potenciais riscos do contexto de aplicação da tecnologia. Além disso, a garantia de acesso à informação, a adequada compreensão das decisões tomadas por sistemas de IA, o direito de contestar decisões automatizadas e solicitar intervenção humana, assim como a não discriminação e a correção de vieses discriminatórios, são aspectos que podem gerar desafios na implementação e cumprimento do marco legal de IA.'