# Pre-requisites
- WSL
- Miniconda3 

# Setup environment
- Create conda env `conda create langchain python=3.11`
- Set the "langchain" env that has been just created as the running env in VS code


Install langchain and openai package

In [None]:
! pip install langchain openai

# Init variables

You need to set value of `OPENAI_API_KEY` that you get from the training team in the .env file

In [11]:
import openai, os
from dotenv import load_dotenv

load_dotenv()

openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")
deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")


# Overviews
The BonBon FAQ.pdf file contains frequently asked questions and answers for customer support scenario. The topics are around IT related issue troubleshooting such as networking, software, hardware. You are requested to provide a solution to build a chat bot capable of answering the user questions with LangChain.

## Assignment 1: Document Indexing (mandatory)

- The content of BonBon FAQ.pdf should be indexed to the local Chroma vector DB from where the chatbot can lookup the appropriate information to answer questions.
- Should use some embedding model such as Azure Open AI text-embedding-3-small to create vectors, feel free to use any other open source embedding model if it works.

In [None]:
import os
import dotenv
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SimpleField, SearchField, SearchFieldDataType,
    SearchableField, VectorSearch, HnswAlgorithmConfiguration,
    VectorSearchProfile, VectorSearchAlgorithmKind, HnswParameters
)
from azure.search.documents.models import VectorizedQuery

# ===== Load ENV =====
dotenv.load_dotenv()

AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_API_KEY = os.getenv("AZURE_SEARCH_API_KEY")

AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"
AZURE_SEARCH_INDEX = "gptkbindex-pdf"

# ===== Load PDF file =====
def read_pdf_by_page(file_path):
    pages = []
    with fitz.open(file_path) as doc:
        for page in doc:
            text = page.get_text()
            pages.append(text)
    return pages

pages = read_pdf_by_page("./data/BonBon FAQ.pdf")

# ===== Split page into semantic chunks =====
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " ", ""]
)

sourcefile_name = "bonbon_faq.pdf"
chunks_with_metadata = []
for page_number, page_text in enumerate(pages, start=1):
    sub_chunks = splitter.split_text(page_text)
    for chunk in sub_chunks:
        chunks_with_metadata.append({
            "content": chunk,
            "page": page_number,
            "sourcefile": sourcefile_name
        })

print(f"Total chunks: {len(chunks_with_metadata)}")

# ===== Setup Azure OpenAI client =====
openai_client = openai.AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"
)

def get_embedding(text):
    response = openai_client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        input=text
    )
    return response.data[0].embedding

# ===== Generate embeddings & prepare documents =====
documents = []
for i, chunk in enumerate(chunks_with_metadata):
    embedding = get_embedding(chunk["content"])
    documents.append({
        "id": str(i),
        "content": chunk["content"],
        "embedding": embedding,
        "sourcefile": chunk["sourcefile"],
        "page": chunk["page"]
    })

# ===== Create index (once) =====
search_cred = AzureKeyCredential(AZURE_SEARCH_API_KEY)
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=search_cred)

index = SearchIndex(
    name=AZURE_SEARCH_INDEX,
    fields=[
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SimpleField(name="sourcefile", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="page", type=SearchFieldDataType.Int32, filterable=True),
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="embedding_profile"
        )
    ],
    vector_search=VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="hnsw_config",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(metric="cosine")
            )
        ],
        profiles=[
            VectorSearchProfile(
                name="embedding_profile",
                algorithm_configuration_name="hnsw_config"
            )
        ]
    )
)

# Tạo index nếu chưa tồn tại
try:
    index_client.get_index(AZURE_SEARCH_INDEX)
    print("Index already exists.")
except:
    print("Creating new index...")
    index_client.create_index(index)

# ===== Upload documents =====
search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name=AZURE_SEARCH_INDEX,
    credential=search_cred
)

print("Uploading documents...")
upload_result = search_client.upload_documents(documents=documents)
print(f"Upload result: {upload_result}")

Total chunks: 63
Index already exists.
Uploading documents...
Upload result: [<azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f2174f750>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f2174c650>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f2174e6d0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f2174cfd0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f2156a9d0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f21568250>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f217d81d0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f217d88d0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x7f0f217dbf90>, <azure.search.documents._generated.models._models_py3.Inde

In [39]:
# ===== Search thử bằng vector =====
search_query = "How do I access shared network drives?"
search_vector = get_embedding(search_query)

print("\n=== Search Results ===")
results = search_client.search(
    search_text=None,
    top=5,
    vector_queries=[
        VectorizedQuery(
            vector=search_vector,
            k_nearest_neighbors=5,
            fields="embedding"
        )
    ],
    filter=f"sourcefile eq '{sourcefile_name}'"
)

for doc in results:
    print(f"[Page {doc['page']}] Score: {doc['@search.score']:.4f}")
    print(doc["content"][:200])
    print("---")



=== Search Results ===
[Page 9] Score: 0.7684
2) Know the Shared Drive Path: 
• 
You should have the network path or UNC (Universal Naming Convention) of the shared 
drive. It typically looks like this: \\computername\sharename or \\IP_address\sh
---
[Page 9] Score: 0.7612
5) In File Explorer, go to "This PC." 
• 
Click on "Computer" in the top menu and select "Map network drive." 
• 
Choose a drive letter and enter the UNC path (e.g., \\server\share). 
• 
Check the box
---
[Page 9] Score: 0.7261
• 
If you haven't mapped the drive, you can directly access it by entering the UNC path in the 
address bar of File Explorer and pressing Enter. 
 
7) Provide Credentials (if required): 
• 
If the sha
---
[Page 10] Score: 0.7168
8) Access Files and Folders: 
• 
Once you're connected to the shared drive, you can browse, open, and manage files and 
folders just like you would on your local drive. 
Please note that the exact ste
---
[Page 9] Score: 0.6889
• 
If all else fails, uninstall the prin

## Assignment 2: Building Chatbot (mandatory)
- You are requested to build a chatbot solution for customer support scenario using Conversational ReAct agent supported in LangChain
- The chatbot is able to support user to answer FAQs in the sample BonBon FAQ.pdf file.
- The chatbot should use Azure Open AI GPT-4o LLM as the reasoning engine.
- The chatbot should be context aware, meaning that it should be able to chat with users in the conversation manner.
- The agent is equipped the following tools:
  - Internet Search: Help the chatbot automatically find out more about something using Duck Duck Go internet search
  - Knowledge Base Search: Help the chatbot to lookup information in the private knowledge base
- In case user asks for information related to topics in the BonBon FAQ.pdf file such as internet connection, printer, malware issues the chatbot must use the private knowledge base, otherwise it should search on the internet to answer the question.
- In the answer of chatbot, it should mention the source file and the page that the answer belongs to, for example the answer should mention "BonBon FQA.pdf (page 2)"

In [49]:
# from llm import get_langchain_azure_openai_llm
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.tools import DuckDuckGoSearchRun
# from langchain.chains import LLMathChain
from langchain.tools import Tool
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o",
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# llm_math = LLMathChain.from_llm(llm)

tools = [
    DuckDuckGoSearchRun(description="use this tool to search for information on internet"),
    # Tool(
    #     name="calculator",
    #     func=llm_math.run,
    #     description="use this tool for math calculating"
    # )
]

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant.",
        ),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

# Construct the Tools agent
agent = create_tool_calling_agent(llm=llm, tools=tools, prompt=prompt)

# Create an agent executor by passing in the agent and tools
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

while True:
    question = input("Human: ")
    if question == "exit":
        break
    
    result = agent_executor.invoke({"input": question})
    print(f"AI: {result['output']}")

NotImplementedError: 

## Assignment 3: Build a new assistant based on BonBon source code (optional)
The objective
- Run the code and index the sample BonBon FAQ.pdf file to Azure Cognitive Search
- Explore the code and implement a new assistant that has the same behavior as above
- Explore other features such as RBACs, features on admin portal

Please contact the training team in case you need to get the source code of BonBon.