# Chatbot v1.0

- Embedding model: nomic-embed-text
- LLM model: llama3:latest

## Requirements

In [40]:
# !pip install langchain
# !pip install chromadb
# !pip install pypdf
# !pip install pytest

## Embedding Function

In [41]:
# Directory passed as Chroma's persist_directory by add_to_chroma and query_rag,
# and removed by clear_database. Keep exactly one assignment active.
# NOTE(review): the commented alternatives presumably hold stores built with
# other embedding backends — confirm before switching.
# chromaPath= 'chroma/open'
chromaPath= 'chroma/data'
# chromaPath = 'chroma/mxbai'
# chromaPath = 'chroma/bedrock'

In [42]:
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings

# def get_embedding_function():
#     embeddings=BedrockEmbeddings(
#         credentials_profile_name='default',region_name='us-east-1'
#     )
#     return embeddings

def get_embedding_function():
    """Return a new OllamaEmbeddings instance for the "nomic-embed-text" model."""
    return OllamaEmbeddings(model="nomic-embed-text")

# def get_embedding_function():
#     embeddings=OllamaEmbeddings(model="mxbai-embed-large")
#     return embeddings

# def get_embedding_function():
#     embeddings = HuggingFaceInferenceAPIEmbeddings(
#         api_key='',
#         model_name="amazon/Titan-text-embeddings-v2"
#         )
#     return embeddings


In [43]:
# Smoke test: embed a short sample string with the configured embedding function.
embeddings=get_embedding_function()
text = "This is a test document."
query_result = embeddings.embed_query(text)
# Notebook display: show only the first three entries of the result.
query_result[:3]

[0.7909193634986877, 0.6172391176223755, -3.102604389190674]

## Chroma-DB

In [44]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
# from get_embedding_function import get_embedding_function
from langchain.vectorstores.chroma import Chroma

In [45]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

def load_documents():
    """Load the PDFs found under the 'Data' directory.

    Returns:
        Whatever PyPDFDirectoryLoader.load() yields for that directory.
    """
    pdf_loader = PyPDFDirectoryLoader('Data')
    loaded = pdf_loader.load()
    return loaded


In [46]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_document(documents: list[Document]):
    """Split the given documents into overlapping text chunks.

    Args:
        documents: Documents to split.

    Returns:
        The result of RecursiveCharacterTextSplitter.split_documents, using
        800-character chunks with an 80-character overlap measured by len().
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [47]:
# Load the source PDFs, split them into chunks, and print the first chunk
# as a quick sanity check of the loader/splitter output.
documents=load_documents()
chunks=split_document(documents)
print(chunks[0])

page_content='How update a certificate into WAF ?\nCurrently below endpoints are protected by WAF.\nProduction:\nhttps://aero-suite-prod-airarabia.accelaero.com/\nhttps://aero-pay-prod-airarabia.accelaero.com\nhttps://aero-pay-callbackapi-prod-airarabia.accelaero.com\nStagingX\nhttps://aero-suite-stage1-airarabia.isaaviation.net/ \nWhat is the procedure to upload certificate into WAF ?\n \n1. Log into Oracle Console and navigate to Edge Policy Resources\n   Oracle Console => Security and Identity => Web Application Firewall => OCI Edge Policy Resources\n    Example: Oracle \n2. Create a Certificate by providing SSL Certificate and Private Key.\nWe need to do below additional step in our use cases as Oracle is unable to identify the encrypted version.' metadata={'source': 'Data/AVN-How update a certificate into WAF _-020524-111955.pdf', 'page': 0}


In [48]:
def calculate_chunk_ids(chunks):
    """Tag every chunk with an ID of the form "<source>:<page>:<index>".

    The index restarts at 0 whenever the "<source>:<page>" key differs from
    that of the previous chunk, and increases by one for each consecutive
    chunk sharing the same key (e.g. "data/monopoly.pdf:6:2").

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict;
            "source" and "page" are read from it and "id" is written to it.

    Returns:
        The same ``chunks`` object, with each chunk's metadata updated in place.
    """
    previous_page_key = None
    index_on_page = 0

    for chunk in chunks:
        meta = chunk.metadata
        page_key = f"{meta.get('source')}:{meta.get('page')}"

        # Consecutive chunks of one page count upward; a new page resets to 0.
        index_on_page = index_on_page + 1 if page_key == previous_page_key else 0

        meta["id"] = f"{page_key}:{index_on_page}"
        previous_page_key = page_key

    return chunks

In [49]:
from langchain.vectorstores.chroma import Chroma

def add_to_chroma(chunks: list[Document]):
    """Add chunks whose IDs are not yet in the Chroma store at ``chromaPath``.

    Each chunk is first tagged via calculate_chunk_ids; only chunks whose
    metadata "id" is absent from the store's existing IDs are added, and the
    store is persisted afterwards. Progress is reported with print().

    Args:
        chunks: Document chunks to index.
    """
    store = Chroma(
        persist_directory=chromaPath, embedding_function=get_embedding_function()
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # include=[] asks for no extra fields; the "ids" entry is still returned.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    pending = [
        chunk for chunk in tagged_chunks if chunk.metadata["id"] not in known_ids
    ]

    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids = [chunk.metadata["id"] for chunk in pending]
    store.add_documents(pending, ids=pending_ids)
    store.persist()

In [50]:
def clear_database():
    """Delete the directory at ``chromaPath`` (and its contents) if it exists."""
    if not os.path.exists(chromaPath):
        return
    shutil.rmtree(chromaPath)

In [51]:
def main(f):
    """Build or update the Chroma store from the source documents.

    Args:
        f: Mode flag. When equal to the string 'reset', the existing database
            is cleared first; any other value leaves it in place.
    """
    wants_reset = f == 'reset'
    if wants_reset:
        print("✨ Clearing Database")
        clear_database()

    # Load -> split -> index, in that order.
    add_to_chroma(split_document(load_documents()))

In [52]:
# Any argument other than the string 'reset' skips clearing the database,
# so this call only adds chunks that are not already stored.
main(1)

Number of existing documents in DB: 65
✅ No new documents to add


## Query Data

In [53]:
import argparse
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama


In [54]:
# Prompt skeleton used by query_rag. It has two placeholders, {context} and
# {question}, which query_rag fills with the joined page contents of the
# retrieved chunks and with the user's query text.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [55]:
def query_rag(query_text: str):
    """Answer a question using chunks retrieved from the Chroma store.

    Runs a similarity search (k=5) for ``query_text``, fills PROMPT_TEMPLATE
    with the retrieved page contents and the question, sends the prompt to the
    "llama3:latest" Ollama model, and prints the prompt, the response, and the
    metadata "id" of each retrieved chunk.

    Args:
        query_text: The user's question.

    Returns:
        The text returned by the model's invoke() call.
    """
    # Open the store with the embedding function from get_embedding_function().
    store = Chroma(
        persist_directory=chromaPath,
        embedding_function=get_embedding_function(),
    )

    # Scores come back with each document but are not used further.
    matches = store.similarity_search_with_score(query_text, k=5)
    matched_docs = [doc for doc, _score in matches]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in matched_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )
    print(prompt)

    llm = Ollama(model="llama3:latest")
    response_text = llm.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc in matched_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [None]:
# Example query against the indexed documents.
# NOTE(review): the "gpt4" label below does not match the model used by
# query_rag ("llama3:latest") — confirm what it refers to.
# gpt4
query_rag("What is Grafana")