# 1 - Imports básicos

In [23]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# 2 - Carregando dados do Wikipedia

In [24]:
loader = WebBaseLoader(
    "https://en.wikipedia.org/wiki/Silvio_Santos"
)
docs = loader.load()
print(f"Número de documentos {len(docs)}")

Número de documentos 1


In [3]:
documento = docs[0]
print(f"Documento: \n\n {documento}")

Documento: 

 page_content='



Silvio Santos - Wikipedia




































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search















Donate








Appearance
















Create account

Log in








Personal tools





 Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Early life








2
Broadcasting career




Toggle Broadcasting career subsection





2.1
Baú da Felicidade, television work








2.2
TVS and SBT








2.3
Further projects








2.4
Departure from television work










3
Legacy








4
Death








5
See also








6
References








7
Further reading








8
External link

In [4]:
print(f"Tipo do objeto documento: {type(documento)}")

document_as_dict = vars(documento)
print(f"Propriedades do documento: {document_as_dict.keys()}")

Tipo do objeto documento: <class 'langchain_core.documents.base.Document'>
Propriedades do documento: dict_keys(['id', 'metadata', 'page_content', 'type'])


# 3 - Dividindo dados da página em chunks

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

print(f"Tipo do objeto chunks (acho que é uma lista): {type(chunks)}")
print(f"Número de chunks: {len(chunks)}")

Tipo do objeto chunks (acho que é uma lista): <class 'list'>
Número de chunks: 70


In [37]:
print("Vamos dar uma olhada nos chunks... \n\n")

for i in range(20, 25):
    print(f"Chunk #{i}: \n{chunks[i]}\n-------------------------------\n")
# print(chunks[40:45])

Vamos dar uma olhada nos chunks... 


Chunk #20: 
page_content='In January 1988, Santos traveled to Boston to undergo medical treatment for vocal cord issues that had caused him to lose his voice, and a tumor on one of his eyelids, which was found to be benign. As his spouse Maria Aparecida Vieira had died from cancer in 1977, the 57 year-old Santos had introspections between himself and his colleagues regarding his health and future, leading him to begin the process of naming a successor. During the Rio Carnival the following month, Santos personally intervened with Globo's president Roberto Marinho to keep Gugu Liberato—an SBT personality who had been introduced to television by Santos, and was approached by Globo to host a Sunday variety show—from defecting to the competitor. Santos would give Gugu prominent roles in SBT's Sunday lineup, and a major pay raise.[31][32][33][34][35]' metadata={'source': 'https://en.wikipedia.org/wiki/Silvio_Santos', 'title': 'Silvio Santos - Wikipedia'

# 4 - Criando embeddings para os chunks de texto

Aqui associamos cada pedaço de texto a um vetor multidimensional e criamos uma base de dados semântica, de onde podemos extrair trechos semanticamente relacionados por similaridade de cossenos.

In [12]:
embeddings = OllamaEmbeddings(model="nomic-embed-text")
Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./data",
)

<langchain_community.vectorstores.chroma.Chroma at 0x7c7c3537a410>

# 5 - Criando cadeia do LangChain para executar o RAG

In [14]:
vectorstore = Chroma(persist_directory="./data", embedding_function=embeddings)
retriever = vectorstore.as_retriever()

retriever

  vectorstore = Chroma(persist_directory="./data", embedding_function=embeddings)


VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c7c1eb8cc90>, search_kwargs={})

In [15]:
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a helpful assistant that answers the user's questions only based on the context below:

            <context>
                {context}
            </context>
            """,
        ),
        ("human", "{question}"),
    ]
)

prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are a helpful assistant that answers the user's questions only based on the context below:\n\n            <context>\n                {context}\n            </context>\n            "), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='{question}'), additional_kwargs={})])

In [16]:
llm = Ollama(
        model="llama3.2",
        temperature=0.1
    )

llm

Ollama(model='llama3.2', temperature=0.1)

In [17]:
parser = StrOutputParser()
parser

StrOutputParser()

In [25]:
# retriever.input_schema.schema()

# prompt.input_schema.schema()

chain = (
    {
        'question': RunnablePassthrough(),
        'context': retriever
    } |  
    prompt |
    llm | 
    parser
)

chain

{
  question: RunnablePassthrough(),
  context: VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c7c1eb8cc90>, search_kwargs={})
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are a helpful assistant that answers the user's questions only based on the context below:\n\n            <context>\n                {context}\n            </context>\n            "), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='{question}'), additional_kwargs={})])
| Ollama(model='llama3.2', temperature=0.1)
| StrOutputParser()

# 5 - Respondendo a uma pergunta

In [27]:
stream = chain.stream("What happened to Silvio Santos?")

for chunk in stream:
    print(chunk, end="")

print()

Silvio Santos died on August 17, 2024, due to bronchopneumonia, which he had been hospitalized for previously with H1N1. He was 93 years old at the time of his death.


# 6 - Debugando a cadeia

In [34]:
llm_chain = prompt | llm | parser

def debug_retriever(docs):
    for i, doc in enumerate(docs):
        print(f"Retrieved doc #{i+1}:")
        print(doc)

    print("\n\n--------------End of context------------\n\n")

    return docs

debug_chain = (
    {
        'question': RunnablePassthrough(),
        'context': retriever | debug_retriever
    } |  
    llm_chain
)

debug_chain

{
  question: RunnablePassthrough(),
  context: VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c7c1eb8cc90>, search_kwargs={})
           | RunnableLambda(debug_retriever)
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are a helpful assistant that answers the user's questions only based on the context below:\n\n            <context>\n                {context}\n            </context>\n            "), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='{question}'), additional_kwargs={})])
| Ollama(model='llama3.2', temperature=0.1)
| StrOutputParser()

In [35]:
stream = debug_chain.stream("What happened to Silvio Santos?")

for chunk in stream:
    print(chunk, end="")

print()

Retrieved doc #1:
page_content='Retrieved from "https://en.wikipedia.org/w/index.php?title=Silvio_Santos&oldid=1247243424"' metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Silvio_Santos', 'title': 'Silvio Santos - Wikipedia'}
Retrieved doc #2:
page_content='^ Oliveira, Fábia (17 August 2024). "Morre Silvio Santos; apresentador estava internado há 17 dias" [Silvio Santos dies; the presenter had been hospitalized for 17 days]. Metrópoles (in Brazilian Portuguese). Archived from the original on 17 August 2024. Retrieved 17 August 2024.' metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Silvio_Santos', 'title': 'Silvio Santos - Wikipedia'}
Retrieved doc #3:
page_content='Legacy[edit]
Santos in December 2020
Due to his extremely charismatic personality, Santos was one of the most influential and beloved people in Brazil, and was considered to be a "timeless" figure of Brazilian entertainment.[57] In an obituary after his death, Argentine newspaper Cla