# Environment

(1) Packages: pip install -r requirements.txt

(2) LangSmith:
https://docs.smith.langchain.com/

In [60]:
import os
import warnings

from dotenv import load_dotenv
load_dotenv()

# Read the API keys from the process environment. Values that load_dotenv()
# loaded from a .env file are visible through os.getenv as well.
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
NOMIC_API_KEY = os.getenv("NOMIC_API_KEY")

# os.getenv returns None for an unset variable, and assigning None into
# os.environ fails with an opaque "TypeError: str expected, not NoneType".
# The keys are therefore not written back (any value os.getenv returned is
# already in os.environ); missing ones are reported by name instead.
_missing_keys = [
    name
    for name, value in (
        ("LANGSMITH_API_KEY", LANGSMITH_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
        ("NOMIC_API_KEY", NOMIC_API_KEY),
    )
    if not value
]
if _missing_keys:
    warnings.warn(
        "Missing environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Add them to your .env file or export them before running this notebook."
    )

# LangSmith tracing configuration for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "My-RAG-Project"



In [72]:
import streamlit as st
import bs4
import re
from langchain import hub
from langchain_groq import ChatGroq
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_nomic.embeddings import NomicEmbeddings

## Part 1: Overview

### Web Scraping | Loading
- `web_path`: URL of the website to be scraped
- `bs_kwargs`: keyword arguments that are passed to BeautifulSoup
- `SoupStrainer`: filters the specific portion of the HTML file to be scraped
- The portion to be scraped is different for different websites: open the browser's inspector on the page, check the names of the classes under which the required content is defined, and pass those class names to `class_`


In [73]:
# Limit HTML parsing to elements carrying one of these CSS classes.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

# Loader for the blog post; the strainer is handed to BeautifulSoup via bs_kwargs.
loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs={"parse_only": post_strainer},
)

# Fetch the page and build the document list used by the following cells.
docs = loader.load()

### Splitting
- `chunk_size` is the number of characters in a chunk, not the number of words

In [74]:
# Chunking parameters, both measured in characters.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into overlapping chunks.
splits = text_splitter.split_documents(docs)

### Embeddings

In [64]:
# Embed every chunk with the Nomic embedding model and index the result in Chroma.
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1.5")
vectorstores = Chroma.from_documents(documents=splits, embedding=embedding_model)

In [65]:
# Building blocks of the RAG chain: a retriever over the vector store,
# the RAG prompt pulled from the LangChain hub, and the Groq chat model.
retriever = vectorstores.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
# NOTE(review): confirm this model id is still served by Groq before re-running.
llm = ChatGroq(model_name="llama3-8b-8192")
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between them;
        an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# RAG chain: the incoming question is routed to the retriever (whose documents
# are flattened by format_docs) and also passed through unchanged; both values
# fill the prompt, the LLM answers, and the reply is parsed to a plain string.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

rag_chain.invoke("What is task decomposition?")

'Task decomposition is the process of breaking down a complex task into smaller and simpler steps, allowing a model to "think step by step" and utilize more test-time computation. This technique, known as Chain of Thought (CoT), enhances model performance on complex tasks by transforming big tasks into multiple manageable tasks. It also provides insight into the model\'s thinking process.'

### Retrieval

In [66]:
# Query the retriever on its own to inspect what it returns for a question.
# `invoke` is the Runnable entry point, the same interface the RAG chain uses;
# `get_relevant_documents` is deprecated in recent langchain-core releases.
doc = retriever.invoke("what is task decomposition?")
len(doc)

4