# Retrieval-Augmented Generation (RAG) Demo

## Set up environment

In [None]:
import warnings

from dotenv import load_dotenv

# Suppress all warnings for the rest of the session.
warnings.simplefilter("ignore")

# Load the variables defined in the local `.env` file (API keys used by the
# cells below) into the process environment.
load_dotenv()

## Set up tracing
LangSmith tracing is optional.  
To use it, define `LANGCHAIN_API_KEY` in your `.env` file.

In [None]:
import getpass
import os

# To enter the key interactively instead of defining it in `.env`, uncomment:
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

# Tracing is optional, so turn it on only when an API key is actually present.
# Without a key the tracing client has nothing to authenticate with.
# `LANGSMITH_API_KEY` is checked as well because it is the alternative
# spelling of the same setting.
if os.environ.get("LANGCHAIN_API_KEY") or os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

## Set up LLM
Define `GROQ_API_KEY` in your `.env` file.

In [None]:
# Interactive alternative to defining the key in `.env`:
# import getpass
# import os
# os.environ["GROQ_API_KEY"] = getpass.getpass()

import sys

# Print the module search path and the interpreter version before the
# third-party import below (presumably kept to diagnose environment issues).
print(sys.path, sys.version, sep="\n")

from langchain_groq import ChatGroq

# Groq-hosted chat model, bound to the name `llm`.
llm = ChatGroq(model="llama3-8b-8192")

## Indexing: Load

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Parse only the elements carrying the post's title, header, and content
# classes; everything else in the page's HTML is skipped.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content"),
)

# Fetch the blog post and hand the strainer to BeautifulSoup via `bs_kwargs`.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

# Character count of the first loaded document (displayed as the cell output).
len(docs[0].page_content)

## Indexing: Split

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into chunks of size 1000 with an overlap of 200
# between consecutive chunks; `add_start_index=True` asks the splitter to
# record where each chunk starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

# Number of chunks produced (displayed as the cell output).
len(all_splits)

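## Indexing: Store
Define `OPENAI_API_KEY` in your `.env` file; `OpenAIEmbeddings` needs it to compute the embeddings.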

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed every chunk with OpenAI embeddings and index the results in a Chroma
# vector store.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(),
)