# Pre-requisites
- Python has been installed
- Jupyter environment set up in PyCharm or Visual Studio Code

# Setup environment
Install the required packages.


In [None]:
! pip install --upgrade pip
! pip install langchain langchain_community langchain_openai openai python-dotenv pypdf chromadb pysqlite3-binary

# Init variables
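Set `OPENAI_API_KEY` (provided by the training team) in the `.env` file. The next cell also reads `AZURE_OPENAI_DOMAIN`, `AZURE_OPENAI_API_VERSION`, `AZURE_OPENAI_EMBEDDING_MODEL` and `AZURE_OPENAI_DEPLOYMENT_NAME` from the environment, so define them in `.env` as well.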

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here gives a
            clear message instead of a malformed endpoint such as
            ``https://None.openai.azure.com`` or an obscure client error later.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; define it in the .env file."
        )
    return value


AZURE_OPENAI_DOMAIN = _require_env("AZURE_OPENAI_DOMAIN")
# The key is stored under the generic OPENAI_API_KEY name in the .env file.
AZURE_OPENAI_API_KEY = _require_env("OPENAI_API_KEY")
AZURE_OPENAI_API_ENDPOINT = f"https://{AZURE_OPENAI_DOMAIN}.openai.azure.com"
AZURE_OPENAI_API_VERSION = _require_env("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_EMBEDDING_MODEL = _require_env("AZURE_OPENAI_EMBEDDING_MODEL")
AZURE_OPENAI_DEPLOYMENT_NAME = _require_env("AZURE_OPENAI_DEPLOYMENT_NAME")

# Overview
The BonBon FAQ.pdf file contains frequently asked questions and answers for a customer support scenario. The topics cover troubleshooting of IT-related issues such as networking, software, and hardware. You are asked to build a chatbot with LangChain that can answer user questions based on this file.

## Assignment 1: Document Indexing (mandatory)
- The content of BonBon FAQ.pdf should be indexed into a local Chroma vector DB, where the chatbot can look up the appropriate information to answer questions.
- Use an embedding model such as Azure OpenAI text-embedding-ada-002 to create the vectors; feel free to use any other open-source embedding model if it works.

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Location of the FAQ document that will be indexed
file_path = "./data/BonBon FAQ.pdf"

# Read the PDF into a list of documents
loader = PyPDFLoader(file_path)
docs = loader.load()

# Splitter configuration:
# - chunk_size: upper bound on the length of one chunk (1000 characters)
# - chunk_overlap: characters shared by neighbouring chunks (200), so text that
#   straddles a chunk boundary still appears intact in at least one chunk
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Break the loaded documents into chunks; each chunk is a portion of an
# original document.
text_chunks = text_splitter.split_documents(docs)

In [None]:
# Run this to fix the "system has an unsupported version of sqlite3" issue.
# The swap is applied only when pysqlite3 is actually installed; otherwise the
# standard-library sqlite3 is left in place so the notebook can still run on
# systems whose built-in sqlite3 is already recent enough.
import sys

try:
    import pysqlite3  # noqa: F401  (imported only to register it in sys.modules)
except ImportError:
    # pysqlite3 is not available here: keep using the stdlib sqlite3 module.
    pass
else:
    # Make every later `import sqlite3` resolve to the bundled pysqlite3 build.
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

In [None]:
# Chroma is imported from langchain_community directly; the old
# `langchain.vectorstores` path is only a deprecated re-export.
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings

# Initialize the AzureOpenAIEmbeddings with the appropriate configuration
# - model: The Azure OpenAI embedding model to use (read from the environment).
# - api_version: The version of the Azure OpenAI API you're using.
# - api_key: Your Azure OpenAI API key for authentication.
# - azure_endpoint: The Azure endpoint where the OpenAI service is deployed.
embedding = AzureOpenAIEmbeddings(
    model=AZURE_OPENAI_EMBEDDING_MODEL,
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_API_ENDPOINT,
)

# Create a Chroma vector store using the document chunks and embeddings
# - documents: The list of text chunks generated from the PDF (previously split).
# - embedding: The embeddings object used to turn each chunk into a vector.
vectorstore = Chroma.from_documents(
    documents=text_chunks,
    embedding=embedding,
)

# Convert the Chroma vector store into a retriever object
# - The retriever runs similarity searches over the stored vectors, which is
#   what the chains and tools below use to query the document.
retriever = vectorstore.as_retriever()

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Initialize the Azure OpenAI chat model with specific configurations.
# The API key is passed explicitly, exactly as for the embeddings client, so
# both clients are configured from the same variables instead of one of them
# relying on an implicit environment-variable lookup.
llm = AzureChatOpenAI(
    azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,  # The name of your Azure OpenAI deployment
    api_version=AZURE_OPENAI_API_VERSION,           # Version of the Azure OpenAI API you are using
    api_key=AZURE_OPENAI_API_KEY,                   # Same key that the embeddings client uses
    azure_endpoint=AZURE_OPENAI_API_ENDPOINT,       # The API endpoint to interact with Azure OpenAI
    temperature=0,                                  # Temperature of 0 keeps answers as repeatable as possible for factual QA
    max_tokens=None,                                # No explicit limit on token generation (service defaults may still apply)
    timeout=None,                                   # No timeout explicitly set for API requests
    max_retries=2,                                  # Allows for up to 2 retries in case of API call failures
    streaming=False                                 # The response is returned in one piece, not streamed incrementally
)

# Define the prompt structure for the conversation
prompt = ChatPromptTemplate.from_messages(
    [
        # The system message defines the role of the model and context for the task.
        # Adjacent string literals are concatenated as-is, so every sentence ends
        # with an explicit space to keep the sentences from running together.
        (
            "system",
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of IT related issue troubleshooting such as networking, software, hardware to answer the question. "
            "If you don't know the answer, say that you don't know.\n\n{context}"
        ),
        # The human message provides the user's question input
        ("human", "{input}"),
    ]
)

# Chain that feeds the retrieved documents into the prompt's {context} slot
# and asks the LLM for an answer
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full Retrieval-Augmented Generation (RAG) pipeline: retrieve documents for
# the input first, then hand them to the question-answering chain
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Interactive question/answer loop; type "exit" to leave it
while True:
    question = input("Human: ")

    if question == "exit":
        break

    result = rag_chain.invoke({"input": question})
    answer = result["answer"]

    # Only cite a source when the retriever actually returned a document;
    # indexing an empty context list would raise IndexError.
    context = result.get("context") or []
    if not context:
        print(f"AI: {answer}")
        continue

    metadata = context[0].metadata
    # basename copes with the platform's path separator, unlike split('/')
    filename = os.path.basename(metadata.get("source", "unknown"))
    # PyPDFLoader stores zero-based page indices; readers expect 1-based numbers
    page_index = metadata.get("page")
    page = page_index + 1 if isinstance(page_index, int) else "unknown"

    print(f"AI: {answer}\nSource: {filename} (page {page})")

## Assignment 2: Building Chatbot (mandatory)
- You are requested to build a chatbot solution for a customer support scenario using the Conversational ReAct agent supported in LangChain.
- The chatbot should be able to answer users' questions based on the FAQs in the sample BonBon FAQ.pdf file.
- The chatbot should use the Azure OpenAI GPT-3.5 LLM as the reasoning engine.
- The chatbot should be context-aware, meaning that it should be able to chat with users in a conversational manner.
- The agent is equipped with the following tools:
  - Internet Search: Helps the chatbot automatically find out more about something using DuckDuckGo internet search
  - Knowledge Base Search: Helps the chatbot look up information in the private knowledge base
- If the user asks for information related to topics in the BonBon FAQ.pdf file, such as internet connection, printer, or malware issues, the chatbot must use the private knowledge base; otherwise it should search the internet to answer the question.
- In its answer, the chatbot should mention the source file and the page that the answer comes from, for example "BonBon FAQ.pdf (page 2)".

In [None]:
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.memory import ConversationBufferMemory
from langchain.tools.retriever import create_retriever_tool

from langchain_core.tools import tool

# Define the tools the agent can use


@tool("HelpDesk")
def help_desk(query: str) -> str:
    """Use this tool to answer the related issue troubleshooting such as networking, software, hardware.
    Each returned passage ends with its source file and page; mention them in the answer.
    """
    # A hand-written tool is used so that every passage carries its source file
    # and page: the agent can only cite "file (page N)" if the tool output
    # contains that information.
    passages = []
    for document in retriever.invoke(query):
        metadata = document.metadata
        filename = os.path.basename(metadata.get("source", "unknown"))
        # PyPDFLoader stores zero-based page indices; report 1-based page numbers
        page_index = metadata.get("page")
        page = page_index + 1 if isinstance(page_index, int) else "unknown"
        passages.append(f"{document.page_content}\nSource: {filename} (page {page})")
    return "\n\n".join(passages)


tools = [
    # Knowledge-base lookup backed by the existing retriever
    help_desk,

    # Adding DuckDuckGo search tool for web search functionality
    DuckDuckGoSearchRun(description="Use this tool to search information in the internet if don't have answer from another tools"),
]

# Define a prompt template for the agent's conversation.
# Adjacent string literals are concatenated as-is, so every sentence ends with an
# explicit space to keep the instructions from running together.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an assistant for question-answering tasks. "
            "You can answer question about IT related issues based on private documents. "
            "If you can't get information from the private documents, search for the information from the internet instead. "
            "If the retrieved passages include a source file and page, mention them in your answer. "
            "If you don't know the answer, say that you don't know."
        ),
        # Earlier turns of the conversation, filled in from memory
        ("placeholder", "{chat_history}"),
        # The user's current question
        ("human", "{input}"),
        # Working area for the agent's tool calls and their results
        ("placeholder", "{agent_scratchpad}")
    ]
)

# Conversation memory: keeps the chat history as message objects under the
# "chat_history" key that the prompt's placeholder reads
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Tool-calling agent built from the LLM, the available tools and the prompt
agent = create_tool_calling_agent(llm=llm, tools=tools, prompt=prompt)

# The executor drives the agent, runs the tools it asks for and wires in the
# memory; verbose=True prints the intermediate steps
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=memory,
    verbose=True,
)

In [None]:
# Chat with the agent until the user types "exit"
while (question := input("Human: ")) != "exit":
    result = agent_executor.invoke({"input": question})
    print(f"AI: {result['output']}")

## Assignment 3: Build a new assistant based on BonBon source code (optional)
The objective
- Run the code and index the sample BonBon FAQ.pdf file to Azure Cognitive Search
- Explore the code and implement a new assistant that has the same behavior as above
- Explore other features such as RBACs, features on admin portal

Please contact the training team in case you need to get the source code of BonBon.