# RAG PIPELINE

In [23]:
import getpass
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your token: ")

from langchain_huggingface import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/kunal/.cache/huggingface/token
Login successful


### Load blog

In [52]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://www.clearsignal.xyz/cpu-vs-gpu-vs-tpu/",),)
    # bs_kwargs=dict(
    #     parse_only=bs4.SoupStrainer(
    #         class_=("post-content", "post-title", "post-header")
    #     )
    # ),
# )
docs = loader.load()

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device":'cpu'}
encode_kwargs = {'normalize_embeddings':True}

hf = HuggingFaceEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)


### Setup vector Store

In [54]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=hf)

### Setup Retreiver

In [55]:
from loguru import logger

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Example
retrieved_docs = retriever.invoke("What are the approaches to Task Decomposition?")
logger.info(f"Number of retrieved docs: {len(retrieved_docs)}")
logger.info(f"Content of first doc: {retrieved_docs[0].page_content}")


[32m2024-09-24 15:09:46.683[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mNumber of retrieved docs: 6[0m
[32m2024-09-24 15:09:46.684[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mContent of first doc: Fig. 1. Overview of a LLM-powered autonomous agent system.
Component One: Planning#
A complicated task usually involves many steps. An agent needs to know what they are and plan ahead.
Task Decomposition#
Chain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.[0m


### Setup Prompt and chain

In [56]:
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



### Invoke Prompt

In [58]:
rag_chain.invoke("How does a GPU work?")

' A GPU (graphics processing unit) works by processing a large number of computations simultaneously, specifically designed for tasks such as rendering graphics and performing matrix operations. It is optimized for throughput over latency by running a large number of ALUs (arithmetic logic units) in parallel. This design allows GPUs to excel at tasks such as training deep learning models due to the high computational demands required. TPUs (tensor processing units) are a specialized type of processor designed specifically for machine learning and deep learning tasks. They are heavily optimized for ONLY these types of operations, compromising on the flexibility needed to perform other tasks. While CPUs (central processing units) can also perform these tasks, they are not optimized specifically for them, and their design requires them to execute instructions sequentially, making them less efficient for these tasks. In summary, CPUs are better suited for general-purpose computing, while G