# Building an ITSG-33 RAG chatbot with LangChain, Hugging Face, Chroma, and Ollama

In [22]:
%%sh
pip install --upgrade langchain langchain-core langchain_experimental chromadb

Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.1-py3-none-any.whl.metadata (4.3 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl.metadata (25 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.1-py3-none-any.whl.metadata (6.3 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.25.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentel

In [2]:
import json
from typing import Dict
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.llms.sagemaker_endpoint import LLMContentHandler


## Connect to the LLM on Ollama

In [3]:
from langchain_experimental.llms.ollama_functions import OllamaFunctions

# Chat model served by the Ollama instance at `base_url`, wrapped in
# OllamaFunctions. Every later chain in this notebook is built on this object.
llm = OllamaFunctions(
    model="llama3:latest",
    base_url='http://ollama-server-2:11434',
    format="json",
    temperature=0,
    # NOTE(review): top_p=0 together with top_k=40 is unusual — confirm how the
    # Ollama server interprets top_p=0.
    top_p=0,
    top_k=40
)  

# Quick check that the model responds before building any chains.
llm.invoke("Tell me a joke")

AIMessage(content="Why don't scientists trust atoms? Because they make up everything!", id='run-7e9b7481-fc54-4396-8e61-8e1689c1a993-0')

## Configure the prompt in LangChain

In [4]:
from langchain_core.prompts import ChatPromptTemplate

# System message template. The literal is split at sentence boundaries; the
# {context} placeholder is filled with the retrieved ITSG-33 snippets.
system_prompt = (
    "You're a helpful AI assistant. "
    "Given a user question and some ITSG-33 article snippets, answer the user question. "
    "If none of the articles answer the question, just say you don't know."
    "\n\n"
    "Here are the ITSG-33 articles: {context}"
)

# Two-message chat prompt: the system message carries the instructions plus the
# {context} placeholder, the human message carries the {input} question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
# Print the template so the placeholders can be checked visually.
prompt.pretty_print()


You're a helpful AI assistant. Given a user question and some ITSG-33 article snippets, answer the user question. If none of the articles answer the question, just say you don't know.

Here are the ITSG-33 articles: [33;1m[1;3m{context}[0m


[33;1m[1;3m{input}[0m


## Load the ITSG-33 Control Catalog PDF

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import RetrievalQA

In [6]:
import os

# Location of the ITSG-33 Annex 3A PDF. Set the ITSG33_CATALOG_PDF environment
# variable to point at your own copy; the original author's path is kept as the
# fallback so existing setups keep working unchanged.
control_catalog = os.environ.get(
    "ITSG33_CATALOG_PDF",
    '/home/ec2-user2/Downloads/itsg33-ann3a-eng.pdf',
)

### Load the PDF and split it into pages

In [17]:
%%time

# Read the catalogue PDF into LangChain documents. The documents printed later
# in this notebook each carry 'page' and 'source' metadata.
loader = PyPDFLoader(control_catalog)
data = loader.load()

CPU times: user 10.1 s, sys: 17 ms, total: 10.1 s
Wall time: 10.1 s


In [19]:
%%time
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into smaller chunks for embedding.
# chunk_size=500 / chunk_overlap=20 — presumably measured in characters (the
# splitter's default length function); confirm against the splitter docs.
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
all_splits = text_splitter.split_documents(data)

CPU times: user 40.8 ms, sys: 371 µs, total: 41.2 ms
Wall time: 40.2 ms


In [20]:
%%time
# Define the embedding model used to vectorise the document chunks.
# See https://huggingface.co/spaces/mteb/leaderboard for the leaderboard this
# choice refers to.

# Hugging Face model ID handed to HuggingFaceEmbeddings below.
embedding_model_id = "BAAI/bge-small-en-v1.5"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
)



CPU times: user 760 ms, sys: 88.9 ms, total: 848 ms
Wall time: 629 ms


In [23]:
%%time
# Import Chroma from langchain_community (the package this notebook already
# uses for vector stores) instead of the legacy `langchain.vectorstores` path.
from langchain_community.vectorstores import Chroma

# Embed every chunk and index it in a Chroma store. This is the slow step of
# the notebook (about 2 min 20 s wall time in the recorded run).
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

CPU times: user 8min 30s, sys: 31.2 s, total: 9min 1s
Wall time: 2min 20s


In [24]:
%%time
# Retriever over the Chroma store, created with default search settings. The
# commented-out arguments below show a disabled alternative: a
# similarity-score-threshold search with a 0.8 cut-off.
doc_retriever = vectorstore.as_retriever(
#    search_type="similarity_score_threshold",
#    search_kwargs={'score_threshold': 0.8}
)

CPU times: user 69 µs, sys: 5 µs, total: 74 µs
Wall time: 76.8 µs


In [25]:
%%time
from typing import List

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: List[Document]):
    """Concatenate the text of the given documents into a single string.

    Each document contributes its ``page_content``; consecutive documents are
    separated by a blank line. An empty ``docs`` yields an empty string.
    """
    separator = "\n\n"
    texts = [document.page_content for document in docs]
    return separator.join(texts)

def passthrough_debug(x):
    """Print ``x`` under a "Received input:" banner and return it unchanged.

    Meant to be spliced between two chain steps to inspect the value that
    flows from one to the next.
    """
    banner = "\n\nReceived input:"
    # One print call, newline-separated: same output as printing banner then x.
    print(banner, x, sep="\n")
    return x
    
# Answer-generation sub-chain: replace the retrieved documents under "context"
# with their formatted text, fill the prompt, call the LLM, and parse the reply
# with StrOutputParser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    ##| passthrough_debug
    | prompt
    ##| passthrough_debug
    | llm
    # Active debug tap: prints the raw LLM message before it is parsed.
    | passthrough_debug
    | StrOutputParser()
)

# Retrieval step: look up documents for the question stored under "input".
retrieve_docs = (lambda x: x["input"]) | doc_retriever

# Full chain: adds "context" (retrieved documents) and then "answer" to the
# input dict, so the result exposes the keys input / context / answer.
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

CPU times: user 317 µs, sys: 21 µs, total: 338 µs
Wall time: 341 µs


In [26]:
%%time

# Run retrieval and answer generation for a sample question.
result = chain.invoke({"input": "What is Media Protection?"})



Received input:
content='Media protection refers to the safeguards put in place to protect digital and non-digital media, including information system media, during transport, storage, or disposal. This includes measures such as locking containers, using cryptography, and controlling access to ensure the confidentiality, integrity, and availability of the media.' id='run-3a3c5d0c-0f4a-454e-915d-4c84ab163548-0'
CPU times: user 784 ms, sys: 183 ms, total: 967 ms
Wall time: 4.76 s


In [27]:
# List the keys available in the chain result.
print(result.keys())

dict_keys(['input', 'context', 'answer'])


In [28]:
# Show the first retrieved document, including its metadata.
print(result["context"][0])

page_content='UNCLASSIFIED  
 
IT Security Risk Management: A Lifecycle Approach  (ITSG -33) 
Annex 3A – Security Control Catalogue  
 
  December 2014  131 
 (B) The organization protects information system media until the media are dest royed or sanitized using 
approved equipment, techniques, and procedures . 
Supplemental Guidance:  Information system media includes both digital and non -digital media. Digital media' metadata={'page': 143, 'source': '/home/ec2-user2/Downloads/itsg33-ann3a-eng.pdf'}


In [29]:
# Show the generated answer text.
print(result["answer"])

Media protection refers to the safeguards put in place to protect digital and non-digital media, including information system media, during transport, storage, or disposal. This includes measures such as locking containers, using cryptography, and controlling access to ensure the confidentiality, integrity, and availability of the media.


In [30]:
from langchain_core.pydantic_v1 import BaseModel, Field


# Structured-output schema handed to llm.with_structured_output(): the answer
# text plus the integer IDs of the sources that back it.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as part of the schema — treat them as prompt text and
# edit them deliberately.
class CitedAnswer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    # Free-text answer.
    answer: str = Field(
        ...,
        description="The answer to the user question, which is based only on the given sources.",
    )
    # IDs match the "Source ID" numbers produced by format_docs_with_id.
    citations: List[int] = Field(
        ...,
        description="The integer IDs of the SPECIFIC sources which justify the answer.",
    )



In [31]:
# LLM variant whose replies are returned as CitedAnswer objects.
structured_llm = llm.with_structured_output(CitedAnswer)

In [52]:
def format_docs_with_id(docs: List[Document]) -> str:
    """Render documents as numbered source entries for the prompt context.

    Each entry carries a zero-based ``Source ID`` (the document's position in
    ``docs``), the ``title`` from the document metadata (the literal string
    ``"none"`` when no title is recorded) and the document text.

    Args:
        docs: Documents to render, in the order their IDs should be assigned.

    Returns:
        The entries separated by blank lines and preceded by one blank line.
        For an empty ``docs`` the result is just that leading blank line
        (two newline characters).
    """
    # NOTE(review): the PDF documents printed in this notebook only carry
    # 'page' and 'source' metadata, so their title always renders as "none".
    formatted = []
    for i, doc in enumerate(docs):
        # dict.get replaces the former try/except KeyError lookup.
        title = doc.metadata.get("title", "none")
        formatted.append(
            f"Source ID: {i}\nArticle Title: {title}\nArticle Snippet: {doc.page_content}"
        )
    return "\n\n" + "\n\n".join(formatted)


# Rebuild the answer sub-chain for cited answers: sources are numbered via
# format_docs_with_id and the structured LLM returns a CitedAnswer.
# This rebinds the names used by the earlier plain-text chain.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs_with_id(x["context"])))
    | prompt
    | structured_llm
)

# Retrieval step: look up documents for the question stored under "input".
retrieve_docs = (lambda x: x["input"]) | doc_retriever

# Full chain: adds "context" (retrieved documents) and then "answer".
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

In [54]:
%%time


# Ask the sample question again; "answer" is now a CitedAnswer whose
# citations are indexes into result["context"].
result = chain.invoke({"input": "What is Media Protection?"})
print(result["answer"])

answer='Media protection refers to the safeguards implemented to protect information system media, including both digital and non-digital media, during transport, storage, and disposal. This includes measures such as locked containers, cryptography, and access controls to prevent unauthorized access, modification, or destruction of the media.' citations=[0, 1]
CPU times: user 751 ms, sys: 113 ms, total: 864 ms
Wall time: 1.79 s


In [56]:
%%time

# Inspect the document behind citation ID 0.
print(result["context"][0])

page_content='UNCLASSIFIED  
 
IT Security Risk Management: A Lifecycle Approach  (ITSG -33) 
Annex 3A – Security Control Catalogue  
 
  December 2014  131 
 (B) The organization protects information system media until the media are dest royed or sanitized using 
approved equipment, techniques, and procedures . 
Supplemental Guidance:  Information system media includes both digital and non -digital media. Digital media' metadata={'page': 143, 'source': '/home/ec2-user2/Downloads/itsg33-ann3a-eng.pdf'}
CPU times: user 41 µs, sys: 3 µs, total: 44 µs
Wall time: 47.2 µs


In [58]:
%%time
# Inspect the document behind citation ID 1.
print(result["context"][1])

page_content='information residing on the media  and consistent with GC legislation and TBS policies, directives  and standards . 
Safeguards to  protect media during transport include, for example, locked containers and cryptography. 
Cryptographic mechanisms can provide confidentiality and integrity protections depending upon the mechanisms 
used. Activities associated with transport include the ac tual transport as well as those activities such as releasing' metadata={'page': 144, 'source': '/home/ec2-user2/Downloads/itsg33-ann3a-eng.pdf'}
CPU times: user 41 µs, sys: 3 µs, total: 44 µs
Wall time: 47 µs


In [59]:
# One citation: the ID of a source plus the quote taken from it. Used as the
# element type of QuotedAnswer.citations.
# NOTE(review): documented with comments rather than a docstring because a
# class docstring would presumably become part of the schema sent to the model.
class Citation(BaseModel):
    # Matches the "Source ID" numbers produced by format_docs_with_id.
    source_id: int = Field(
        ...,
        description="The integer ID of a SPECIFIC source which justifies the answer.",
    )
    # Text the model claims to have copied from that source.
    quote: str = Field(
        ...,
        description="The VERBATIM quote from the specified source that justifies the answer.",
    )


# Structured-output schema for answers backed by quotes: the answer text plus a
# list of Citation objects (source ID and quote).
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as part of the schema — treat them as prompt text.
class QuotedAnswer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    # Free-text answer.
    answer: str = Field(
        ...,
        description="The answer to the user question, which is based only on the given sources.",
    )
    # One Citation per supporting quote.
    citations: List[Citation] = Field(
        ..., description="Citations from the given sources that justify the answer."
    )

In [61]:
# Rebuild the answer sub-chain once more, this time asking the model for a
# QuotedAnswer (answer plus per-source quotes) instead of bare source IDs.
# This rebinds the names used by the previous chain.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs_with_id(x["context"])))
    | prompt
    | llm.with_structured_output(QuotedAnswer)
)

# Retrieval step: look up documents for the question stored under "input".
retrieve_docs = (lambda x: x["input"]) | doc_retriever

# Full chain: adds "context" (retrieved documents) and then "answer".
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

In [63]:
# Ask the sample question a third time; the bare last expression displays the
# resulting QuotedAnswer.
# NOTE(review): in the recorded output the quotes are not strictly verbatim
# (e.g. "destroyed" where the source text reads "dest royed"), so quotes
# should be checked against the source before being trusted.
result = chain.invoke({"input": "What is Media Protection?"})
result["answer"]

QuotedAnswer(answer='Media protection refers to the safeguards implemented by an organization to protect information system media, including both digital and non-digital media, during transport, storage, and disposal. This includes measures such as locking containers, using cryptography, and applying access controls for mobile devices.', citations=[Citation(source_id=0, quote='(B) The organization protects information system media until the media are destroyed or sanitized using approved equipment, techniques, and procedures.'), Citation(source_id=1, quote='Safeguards to protect media during transport include, for example, locked containers and cryptography.')])