# Chatbot v1.0

## Requirements

In [2]:
# !pip install langchain
# !pip install chromadb
# !pip install pypdf
# !pip install pytest

## Embedding Function

In [1]:
# Directory where the Chroma vector store is persisted. It is read by
# add_to_chroma, clear_database and query_rag further down.
chromaPath = 'chroma/mxbai'

In [2]:
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings

# def get_embedding_function():
#     embeddings=BedrockEmbeddings(
#         credentials_profile_name='default',region_name='us-east-1'
#     )
#     return embeddings

def get_embedding_function():
    """Build the embedding model shared by indexing and querying.

    Returns:
        An ``OllamaEmbeddings`` instance configured for the
        ``mxbai-embed-large`` model.
    """
    return OllamaEmbeddings(model="mxbai-embed-large")


## Chroma-DB

In [3]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
# from get_embedding_function import get_embedding_function
from langchain.vectorstores.chroma import Chroma

In [4]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

def load_documents(data_path: str = 'Data'):
    """Load the PDFs found under ``data_path``.

    Args:
        data_path: Directory handed to ``PyPDFDirectoryLoader``. Defaults to
            ``'Data'``, the folder that used to be hard-coded here, so
            existing zero-argument calls behave exactly as before.

    Returns:
        Whatever ``PyPDFDirectoryLoader(data_path).load()`` returns.
    """
    return PyPDFDirectoryLoader(data_path).load()


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_document(documents: list[Document]):
    """Break the loaded documents into overlapping text chunks.

    The splitter is configured for a chunk size of 800 with an overlap of 80,
    both measured with ``len`` (i.e. in characters), and treats separators as
    plain strings rather than regular expressions.

    Args:
        documents: Documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = dict(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [6]:
# Load the source documents, split them into chunks, and print the first
# chunk as a quick sanity check of the content and its metadata.
documents=load_documents()
chunks=split_document(documents)
print(chunks[0])

page_content='Docker Vulnerability Scan - Anchore\nSecurity vulnerability scans on docker images can be done using Anchore. Anchore is an opensource tool to scan docker images. It has a \nAnchore engine which scans the docker image and creates a scan report. Anchore also has a Anchore Jenkins plugin that can be used to \nintegrate Anchore to the Jenkins. This Jenkins plugin can be used in the build pipelines.\nThe plugin will send a list of docker images to the Anchore engine. The Anchore engine will pull the docker image and scans the image \nbased on the configured policies. Once the scan is complete the result is send back to the Jenkins plugin. Based on the result the Jenkins \nplugin will make the build stage failed/passed.' metadata={'source': 'Data/AVN-Docker Vulnerability Scan - Anchore-020524-074749.pdf', 'page': 0}


In [8]:
def calculate_chunk_ids(chunks):
    """Assign a unique ``source:page:index`` ID to every chunk, in place.

    IDs look like ``"data/monopoly.pdf:6:2"``
    (page source : page number : chunk index), where the index counts the
    chunks already seen for that page.

    The counter is tracked per page instead of being reset whenever the page
    changes, so chunks of one page that are not adjacent in ``chunks`` still
    receive distinct IDs. When each page's chunks are contiguous, the IDs are
    the same as a reset-on-page-change scheme would produce.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. The
            ``"source"`` and ``"page"`` entries are read; a missing entry is
            formatted as ``None``.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on each item.
    """
    # Next free chunk index for each "source:page" key.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk's metadata so callers can use it as the DB key.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [9]:
from langchain.vectorstores.chroma import Chroma

def add_to_chroma(chunks: list[Document]):
    """Insert chunks that are not yet in the persisted Chroma store.

    Each chunk is given an ID via ``calculate_chunk_ids``; chunks whose ID is
    already present in the store are skipped, so re-running this on the same
    input adds nothing.

    Args:
        chunks: Chunks to index. Their ``metadata`` gains an ``"id"`` entry.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Open the store persisted at chromaPath.
    store = Chroma(
        persist_directory=chromaPath, embedding_function=get_embedding_function()
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # An empty ``include`` list is enough here: IDs are always returned.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only the chunks the store has not seen yet.
    fresh_chunks = [
        chunk for chunk in tagged_chunks if chunk.metadata["id"] not in known_ids
    ]

    if not fresh_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(fresh_chunks)}")
    fresh_ids = [chunk.metadata["id"] for chunk in fresh_chunks]
    store.add_documents(fresh_chunks, ids=fresh_ids)
    store.persist()

In [10]:
def clear_database():
    """Remove the persisted Chroma directory at ``chromaPath``, if present."""
    if not os.path.exists(chromaPath):
        return
    shutil.rmtree(chromaPath)

In [11]:
def main(f):
    """Build or refresh the vector store from the source documents.

    Args:
        f: Mode flag. When equal to ``'reset'`` the existing store is deleted
            first; any other value leaves it in place.
    """
    wants_reset = f == 'reset'
    if wants_reset:
        print("✨ Clearing Database")
        clear_database()

    # Load -> split -> index (only chunks missing from the store are added).
    add_to_chroma(split_document(load_documents()))

In [12]:
# Any argument other than 'reset' keeps the existing store and only adds
# chunks whose IDs are not in it yet.
main(1)

Number of existing documents in DB: 0
👉 Adding new documents: 18


  warn_deprecated(


## Query Data

In [13]:
import argparse
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [14]:
# Prompt sent to the LLM by query_rag. {context} is filled with the retrieved
# chunk texts and {question} with the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [15]:
def query_rag(query_text: str):
    """Answer a question using chunks retrieved from the Chroma store.

    The five most similar chunks are fetched, joined into a context block,
    inserted into ``PROMPT_TEMPLATE`` together with the question, and sent to
    the ``llama3:latest`` Ollama model. The answer and the IDs of the chunks
    used are printed.

    Args:
        query_text: The user's question.

    Returns:
        The model's response text.
    """
    # Open the persisted store with the same embedding function used for indexing.
    embedder = get_embedding_function()
    store = Chroma(persist_directory=chromaPath, embedding_function=embedder)

    # Retrieve the closest chunks; the similarity scores themselves are unused.
    hits = store.similarity_search_with_score(query_text, k=5)
    matched_docs = [doc for doc, _score in hits]

    # Assemble the prompt from the retrieved text and the question.
    context_text = "\n\n---\n\n".join(doc.page_content for doc in matched_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    llm = Ollama(model="llama3:latest")
    response_text = llm.invoke(prompt)

    # Report which chunk IDs backed the answer (None if a chunk has no ID).
    sources = [doc.metadata.get("id", None) for doc in matched_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [27]:
# NOTE(review): this cell's execution count (27) is out of sequence with the
# cells above; the saved output presumably predates the current index.
# Re-run with Restart & Run All to confirm.
query_rag("What is Grafana")

Response: According to the given context, Grafana is an analytics and interactive data visualization tool that is multi-platform and open-source. It allows users to create dashboards with customizable graphs, charts, stats, and more, using data from various sources such as Prometheus, Graphite, ElasticSearch, MySQL, etc., shown as time-series analytics with timestamps.
Sources: ['Data/AVN-Grafana-020524-074903.pdf:1:0', 'Data/AVN-Grafana-020524-074903.pdf:0:1', 'Data/AVN-Grafana-020524-074903.pdf:0:0', 'Data/AVN-Grafana-020524-074903.pdf:5:0', 'Data/AVN-Grafana-020524-074903.pdf:3:0']


'According to the given context, Grafana is an analytics and interactive data visualization tool that is multi-platform and open-source. It allows users to create dashboards with customizable graphs, charts, stats, and more, using data from various sources such as Prometheus, Graphite, ElasticSearch, MySQL, etc., shown as time-series analytics with timestamps.'

In [28]:
# Second sample question against the same store.
query_rag("What is Prometheus")

Response: Based on the given context, Prometheus is a multi-dimensional data model with time series data identified by metric name and key/value pairs. It also comes with a flexible query language called PromQL to leverage this dimensionality.
Sources: ['Data/AVN-Grafana-020524-074903.pdf:2:0', 'Data/AVN-Prometheus-020524-074948.pdf:0:1', 'Data/AVN-Grafana-020524-074903.pdf:6:0', 'Data/AVN-Prometheus-020524-074948.pdf:2:0', 'Data/AVN-Prometheus-020524-074948.pdf:3:0']


'Based on the given context, Prometheus is a multi-dimensional data model with time series data identified by metric name and key/value pairs. It also comes with a flexible query language called PromQL to leverage this dimensionality.'

In [16]:
# Same question as above, run with the mxbai-embed-large embeddings
# (store at chroma/mxbai), for comparison with the earlier answer.
query_rag("What is Grafana")

Response: Based on the provided context, it can be inferred that:

Grafana is a third-party tool used to display metrics collected by Prometheus. It is mentioned in Step 5 as an example Node Exporter dashboard for Grafana. This implies that Grafana is a visualization platform for time series data, allowing users to create dashboards and charts based on the metrics scraped from Prometheus targets.
Sources: ['Data/AVN-Docker Vulnerability Scan - Anchore-020524-074749.pdf:0:1', 'Data/AVN-Prometheus-020524-074948.pdf:0:0', 'Data/AVN-Docker Vulnerability Scan - Anchore-020524-074749.pdf:0:0', 'Data/AVN-Grafana-020524-074903.pdf:0:1', 'Data/AVN-Grafana-020524-074903.pdf:4:0']


'Based on the provided context, it can be inferred that:\n\nGrafana is a third-party tool used to display metrics collected by Prometheus. It is mentioned in Step 5 as an example Node Exporter dashboard for Grafana. This implies that Grafana is a visualization platform for time series data, allowing users to create dashboards and charts based on the metrics scraped from Prometheus targets.'