## Let's wrap it up
In this hands-on, we will combine all the concepts we've learned so far to create a Q&A agent.

Let's create a Q&A pipeline using the following steps:
1. Load the data
2. Preprocess the data: chunk the data into smaller parts
3. Prepare knowledge base: embed the data and store in a vector database
4. Prepare Q&A chain with three steps:
   1. Prepare the question
   2. Retrieve the most similar documents
   3. Answer the question based on the retrieved documents

## Load the data


In [None]:
from typing import List
import os

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.docstore.document import Document

# Folder that holds the knowledge files; fail fast when it is not there.
BASE_PATH = "./knowledge/t6"
if not os.path.exists(BASE_PATH):
    raise ValueError(f"Directory {BASE_PATH} does not exist")

# Load every file under BASE_PATH that matches the `**/*.rst` glob as plain
# text, leaving out paths matched by the "index.rst" exclude pattern.
loader = DirectoryLoader(
    BASE_PATH,
    glob="**/*.rst",
    exclude=["index.rst"],
    loader_cls=TextLoader,
)
documents: List[Document] = loader.load()


## Preprocess the data

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: chunks of at most 2000 characters (measured with
# `len`) with a 150-character overlap. The separators are literal strings
# (not regular expressions): blank lines, then single newlines.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=150,
    length_function=len,
    separators=["\n\n", "\n"],
    is_separator_regex=False,
)

# Chunked version of the loaded documents, used to build the knowledge base.
splits = text_splitter.split_documents(documents)

## Prepare knowledge base

In [None]:
from dotenv import load_dotenv
# Load the variables declared in the local `.env` file.
load_dotenv(".env")

# These variables are required to initialize the LangChain Azure OpenAI chat
# and embedding clients.
required_env_vars = [
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_MODEL",
    "AZURE_OPENAI_DEPLOYMENT_NAME",
    "AZURE_OPENAI_EMBEDDING_MODEL",
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME",
]

# Report every missing variable in one error so the configuration can be fixed
# in a single pass instead of one failed run per variable.
missing_env_vars = [
    var for var in required_env_vars if os.environ.get(var) is None
]
if missing_env_vars:
    raise EnvironmentError(
        "Missing environment variable(s): "
        + ", ".join(f"`{var}`" for var in missing_env_vars)
    )

# Every variable is known to be set at this point, so read the values directly.
# No fallback defaults are given: they could never be used after the check
# above, and they would hide a misconfiguration if the check were ever relaxed.
api_key = os.environ["AZURE_OPENAI_API_KEY"]
api_version = os.environ["AZURE_OPENAI_API_VERSION"]
azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
model = os.environ["AZURE_OPENAI_MODEL"]
embedding_model = os.environ["AZURE_OPENAI_EMBEDDING_MODEL"]
embedding_deployment_name = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"]

In [None]:
# Make sure the knowledge base is empty: delete whatever a previous run left
# under ./docs. `shutil.rmtree` is used instead of the `rm` shell command so
# the cell does not depend on a Unix shell; `ignore_errors=True` keeps the
# "no error when the folder is absent" behaviour of `rm -rf`.
import shutil

shutil.rmtree("./docs", ignore_errors=True)

In [None]:
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding client bound to the Azure OpenAI embedding deployment. The endpoint
# is passed explicitly, the same way the chat model is configured, instead of
# being picked up implicitly from the environment.
embedding = AzureOpenAIEmbeddings(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=azure_endpoint,
    azure_deployment=embedding_deployment_name,
)

# Folder where the Chroma collection is stored.
persist_directory = './docs/chroma/'

# Embed every chunk in `splits` and store the result in the vector database.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [None]:
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain_core.runnables import (
    RunnableSequence,
    RunnablePassthrough,
    RunnableParallel,
)
from langchain_core.output_parsers import StrOutputParser

# System prompt for the advisor agent. `{context}` is a template placeholder.
# A trailing backslash joins a line with the next one, so each continued line
# must end with ". " (sentence end plus a space) before the backslash;
# otherwise two sentences are glued together in the text sent to the model.
# NOTE: the constant name is misspelled ("ADVIRSOR"); it is kept as-is because
# other cells refer to it by this name.
ADVIRSOR_PROMPT = """
You are a helpful advisor, collaborating with other agents. \
Don't assume anything you don't know. Use the context below to answer the user question. \
Think carefully about the question and provide the best answer you can. \
If you are unable to fully answer, that's OK, another agent with different tools will help where you left off. \
If you or any of the other agents have the final answer or deliverable, \
prefix your response with FINAL ANSWER so the team knows to stop.
The context is: {context}.\n
"""

# Text of every loaded document, joined with newlines.
# NOTE(review): this variable does not appear to be used by the chain, which
# fills `{context}` from the retrieved documents instead - confirm whether it
# is still needed.
context = "\n".join(doc.page_content for doc in documents)

# Chat prompt: the advisor instructions as the system message, followed by the
# user's question.
chain_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", ADVIRSOR_PROMPT),
        ("user", "{input}"),
    ]
)

# Retrieval step: take the question text out of the input dict and pass it to
# the vector-store retriever.
retrieve_chain = RunnableSequence(
    lambda payload: payload["input"],
    vectordb.as_retriever(),
)


def format_docs(docs: List[Document]):
    """Return the page content of ``docs`` as one newline-separated string."""
    contents = (item.page_content for item in docs)
    return "\n".join(contents)


# Chat model bound to the Azure OpenAI deployment, configured with temperature=0.
llm = AzureChatOpenAI(
    temperature=0,
    deployment_name=deployment_name,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    api_key=api_key,
)

# Q&A pipeline:
#   1. add the retrieved documents to the input dict under "candidates";
#   2. from that dict, build two outputs:
#      - "answer": format the candidates into the prompt context, call the
#        model and parse its reply into a string;
#      - "references": the metadata of each candidate document.
chain = (
    RunnablePassthrough.assign(
        candidates=retrieve_chain.with_config(run_name="retrieve_chain"),
    )
    | RunnableParallel(
        answer=(
            RunnablePassthrough.assign(
                context=lambda state: format_docs(state["candidates"])
            )
            | chain_prompt
            | llm.with_config(run_name="llm")
            | StrOutputParser().with_config(run_name="parser")
        ),
        references=lambda state: [doc.metadata for doc in state["candidates"]],
    )
).with_config(run_name="naive_chain")

In [None]:
from openai import RateLimitError

# Sample question sent through the Q&A chain.
user_query = (
    "I have error `Fail to push image` while running cop_image:envoy-base step in "
    "pre stage while setting up t6 fabric pipeline, how to resolve it?"
)

# Only the chain call is guarded; the results are printed on the success path.
try:
    output = chain.invoke({"input": user_query})
except RateLimitError as err:
    print("Exceed rate limit")
    print(err)
else:
    print(output["answer"])
    for ref in output["references"]:
        print(ref)
