# Usando LangChain e RAG com DeepSeek R1

## Instalação de pré-requisitos



In [None]:
print("Instalando ChromaDB")
!pip install chromadb | tail -1

print("Instalando Sentence Transformers")
!pip install sentence-transformers | tail -1

print("Instalando pypdf")
!pip install pypdf | tail -1

print("Instalando langchain")
!pip install langchain | tail -1

print("Instalando langchain-community")
!pip install langchain_community | tail -1

print("Instalando huggingface_hub")
!pip install huggingface_hub | tail -1

print("Instalando langchain-huggingface")
!pip install langchain_huggingface

print("Instalando transformers")
!pip install transformers | tail -1

Instalando ChromaDB
Instalando Sentence Transformers
Successfully installed nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127
Instalando pypdf
Successfully installed pypdf-5.3.0
Instalando langchain
Instalando langchain-community
Successfully installed dataclasses-json-0.6.7 httpx-sse-0.4.0 langchain_community-0.3.18 marshmallow-3.26.1 mypy-extensions-1.0.0 pydantic-settings-2.8.0 typing-inspect-0.9.0
Instalando huggingface_hub
Instalando transformers


In [None]:
from pypdf import PdfReader
import requests
import os
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

## Carregamento de dados

The Little Prince (pdf)

In [None]:
def download_file(url, filename, timeout=30):
    """Download `url` to `filename`, streaming the body in 8 KiB chunks.

    The body is first written to `<filename>.part` and only renamed to
    `filename` once the whole transfer succeeded, so an interrupted download
    never leaves a truncated file that a later "file already exists" check
    would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        filename: Destination path on the local filesystem.
        timeout: Seconds to wait for the server before giving up
            (passed to `requests.get`). Without it a stalled server would
            hang the notebook indefinitely.

    Returns:
        None. Success or failure is reported on stdout; request errors are
        caught and printed rather than raised.
    """
    partial_path = f"{filename}.part"
    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        # Publish the file only after every chunk was written.
        os.replace(partial_path, filename)
        print(f"File '{filename}' downloaded successfully.")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")

    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Remote location of the book and the local name it is stored under.
file_url = "https://blogs.ubc.ca/edcp508/files/2016/02/TheLittlePrince.pdf"
file_name = "TheLittlePrince.pdf"

# Only hit the network when there is no local copy yet.
if os.path.exists(file_name):
    print(f"File '{file_name}' already exists.")
else:
    download_file(file_url, file_name)

File 'TheLittlePrince.pdf' downloaded successfully.


## Processar o PDF como texto

In [None]:
def process_pdf(file_path, chunk_size=200):
    """Split the extracted text of every page of a PDF into fixed-size chunks.

    Chunks never span two pages and do not overlap; the last chunk of a page
    may be shorter than `chunk_size`. Pages with no extractable text
    contribute no chunks.

    Args:
        file_path: Path of the PDF file to read.
        chunk_size: Maximum number of characters per chunk. Defaults to 200.

    Returns:
        A list of dicts in page order, each with:
            'id':       "<file name>_page_<1-based page>_chunk_<0-based index>"
            'text':     the chunk's characters
            'metadata': {'file_name': <file name>, 'page': <1-based page>}

    Raises:
        ValueError: If `chunk_size` is not greater than zero (a zero step
            could never advance through the text).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    reader = PdfReader(file_path)
    # basename copes with the platform's path separator, not just '/'.
    file_name = os.path.basename(file_path)

    chunks = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Guard against a page yielding no text object at all.
        text = page.extract_text() or ""

        for chunk_index, start in enumerate(range(0, len(text), chunk_size)):
            chunks.append({
                'id': f"{file_name}_page_{page_number}_chunk_{chunk_index}",
                'text': text[start:start + chunk_size],
                'metadata': {
                    'file_name': file_name,
                    'page': page_number,
                },
            })

    return chunks

In [None]:
# Sentence-embedding model used to vectorise the text chunks.
# Imported from `langchain_huggingface` (installed above) rather than the
# deprecated `langchain.embeddings` path. The model is loaded without any
# API token, so no credential belongs in this cell.
from langchain_huggingface import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Split the PDF named by `file_name` into text chunks. Each entry of `chunks`
# is a dict with 'id', 'text' and 'metadata' keys, as built by process_pdf.
chunks = process_pdf(file_name)



In [None]:
# Initialize a ChromaDB client whose data lives in `persist_directory`.
# NOTE(review): "/content" looks like the Google Colab working directory —
# adjust the path when running somewhere else.
persist_directory = "/content/chromadb"
client = chromadb.PersistentClient(path=persist_directory)

from chromadb import Documents, EmbeddingFunction, Embeddings
class CustomEmbeddingFunction(EmbeddingFunction):
    """Adapter that lets ChromaDB use an embedder exposing `embed_documents`."""

    def __init__(self, embedder):
        # Any object with an `embed_documents(texts)` method can be wrapped.
        self.embedder = embedder

    def __call__(self, input: Documents) -> Embeddings:
        """Embed `input` by delegating to the wrapped embedder."""
        vectors = self.embedder.embed_documents(input)
        return vectors

# Wrap the embedder so ChromaDB can call it when documents are stored.
embedding_function = CustomEmbeddingFunction(embedder=embedder)

collection_name = "little_prince"
collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

# Upsert rather than add: the store is persistent and the chunk ids are
# deterministic, so re-running this cell must refresh the existing records
# instead of colliding with (or silently keeping) the ones from a previous run.
collection.upsert(
    ids=[chunk["id"] for chunk in chunks],
    documents=[chunk["text"] for chunk in chunks],
    metadatas=[chunk["metadata"] for chunk in chunks],
)

print(f"Chunks and metadata saved to collection '{collection_name}'.")




Chunks and metadata saved to collection 'little_prince'.


In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Re-open the persisted ChromaDB store and look up the collection that is
# assumed to have been created already.
persist_directory = "/content/chromadb"
client = chromadb.PersistentClient(path=persist_directory)
collection_name = "little_prince"
collection = client.get_collection(name=collection_name)

# Hand the same collection to LangChain as a vector store.
vectorstore = Chroma(
    client=client,
    collection_name=collection_name,
    embedding_function=embedder,
    persist_directory=persist_directory,
)

import getpass

from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint

# Credentials must never be committed with the notebook: take the token from
# the HUGGINGFACEHUB_API_TOKEN environment variable, or ask for it
# interactively (the prompt does not echo the typed value).
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass.getpass("Hugging Face API token: ")

llm = HuggingFaceEndpoint(
    task='text-generation',
    model="deepseek-ai/DeepSeek-R1",
    max_new_tokens=100,
    temperature=0.5,
    huggingfacehub_api_token=hf_api_token,
)

import warnings

# Suppress all warnings from this point on so the printed answers stay readable.
warnings.filterwarnings("ignore")

query = "Who is the prince?"

# Baseline: send the question straight to the model, without retrieved context.
direct_answer = llm.invoke(query)
print("DIRECT (((DeepSeek R1))) APPLIED:\nQuestion: {query}\nAnswer: {result}".format(query=query, result=direct_answer))

# Build the retrieval-augmented QA chain: the retriever pulls chunks from the
# vector store and the "stuff" chain type passes them to the LLM in one prompt.
qa_retriever = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

print("\n\n-------------------")

# `Chain.run` is deprecated; `invoke` takes the inputs as a dict and returns a
# dict whose "result" entry holds the answer text.
result = qa_retriever.invoke({"query": query})["result"]
print("LangChain/RAG APPLIED OVER (((DeepSeek R1))):\nQuestion: {query}\nAnswer: {result}".format(query=query, result=result))


DIRECT (((DeepSeek R1))) APPLIED:
Question: Who is the prince?
Answer:  The prince is a character in a fairy tale called "The Princess and the Pea." The prince is searching for a true princess, and he believes he has found her in a young woman who can feel the pea under twenty mattresses. However, it is later revealed that the young woman is not a princess, but a clever and resourceful commoner named Ella. The prince falls in love with Ella and they live happily ever after.


-------------------
LangChain/RAG APPLIED OVER (((DeepSeek R1))):
Question: Who is the prince?
Answer:  The prince is the little prince.
