# Perform the RAG Query / Response



In [20]:
import os, re
import chromadb
from langchain.prompts import ChatPromptTemplate

In [3]:
from chromadb.utils import embedding_functions

# Use a different sentence transformer: all-mpnet-base-v2
# This embedding function is passed to get_collection() below so that query
# texts are embedded by it.
# NOTE(review): presumably this must be the same model that was used when the
# collection was populated — confirm against the indexing notebook.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Connect over HTTP to the Chroma server expected at localhost:8200.
chroma_client = chromadb.HttpClient(host='localhost', port=8200)

In [9]:
chroma_client.list_collections()

[Collection(id=63b82a61-cba4-421c-92ee-a54be68ae7f1, name=local_kb)]

In [5]:
# Name of the existing collection to query (it appears in list_collections()).
collection_name = "local_kb"

# Fetch the collection handle and attach the sentence-transformer embedding function to it.
chroma_collection = chroma_client.get_collection(name=collection_name, embedding_function=sentence_transformer_ef)

In [7]:
chroma_collection.count()

12432

In [10]:
# Free-text query; used for the vector search and later as the {question} in the prompt.
query_text = "This is a query about machine learning and data science"

In [11]:
# Ask the collection for the 3 closest stored chunks to the query text.
results = chroma_collection.query(
    query_texts=[ query_text ],
    n_results=3
)

# Dump the raw result dict; the printed keys are ids, distances, embeddings,
# metadatas and documents, each holding one inner list per query text.
print(results)

{'ids': [['A Primer on Generative Artificial Intelligence.pdf_2_2576', 'A Primer on Generative Artificial Intelligence.pdf_2_2989', 'Generative AI Models.pdf_12_3050']], 'distances': [[0.8318580389022827, 0.8395541310310364, 0.8767372369766235]], 'embeddings': None, 'metadatas': [[{'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/AIML/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2576}, {'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/AIML/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2989}, {'filename': 'Generative AI Models.pdf', 'page': 12, 'source': '../data/Generative AI Models.pdf', 'start_index': 3050}]], 'documents': [['of machine learning (ML), as under:\n• Machine learning “allows the computer to learn automatically without human inter-\nvention or assistance” ([5], p. 386).\n• “Machine Learning is about making computers modify or adapt th

In [13]:
def print_dict(dict_item, name):
    """Print a titled listing of a mapping, one ``key: value`` pair per line.

    Args:
        dict_item: Mapping whose entries are printed in iteration order.
        name: Label shown in the ``Dictionary: ...`` heading.
    """
    heading = f"\nDictionary: {name}"
    print(heading)
    for label, value in dict_item.items():
        print(f"  {label}: {value}")
        
def print_list(list_items, name):
    """Print a titled listing of a sequence, one ``index: item`` pair per line.

    Args:
        list_items: Iterable whose elements are printed with a 0-based index.
        name: Label shown in the ``List: ...`` heading.
    """
    heading = f"\nList: {name}"
    print(heading)
    for position, entry in enumerate(list_items):
        line = f"  {position}: {entry}"
        print(line)

In [14]:
print_dict(results, 'results')


Dictionary: results
  ids: [['A Primer on Generative Artificial Intelligence.pdf_2_2576', 'A Primer on Generative Artificial Intelligence.pdf_2_2989', 'Generative AI Models.pdf_12_3050']]
  distances: [[0.8318580389022827, 0.8395541310310364, 0.8767372369766235]]
  embeddings: None
  metadatas: [[{'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/AIML/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2576}, {'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/AIML/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2989}, {'filename': 'Generative AI Models.pdf', 'page': 12, 'source': '../data/Generative AI Models.pdf', 'start_index': 3050}]]
  documents: [['of machine learning (ML), as under:\n• Machine learning “allows the computer to learn automatically without human inter-\nvention or assistance” ([5], p. 386).\n• “Machine Learning is about making computers mo

In [None]:
def check_distances(distances, threshold=0.7):
    """Report whether a query produced no sufficiently close match.

    The value passed in is the ``distances`` entry of a query result: a list
    holding one inner list of distances per query text. Only the first inner
    list is inspected. Smaller distances mean closer matches, so a result is
    considered a match when its distance is strictly below ``threshold``.

    Args:
        distances: Nested list of distances (``[[d0, d1, ...]]``), or an
            empty/None value when nothing was returned.
        threshold: Maximum distance (exclusive) for a result to count as a match.

    Returns:
        True when there are no results at all or when every result is at or
        beyond ``threshold`` (i.e. nothing matched); False as soon as at least
        one result is closer than ``threshold``.
    """
    if not distances or not distances[0]:
        return True  # Nothing came back, so there is no match.
    # "No match" means every candidate is too far away. The previous
    # any(score < threshold) returned True precisely when a good match existed.
    return all(score >= threshold for score in distances[0])

In [None]:
# Warn when check_distances flags the retrieved results for this threshold.
if check_distances(results['distances'], 0.7):
    # Plain string literal: the message has no placeholders, so no f-prefix is needed.
    print("Unable to find matching results.")

In [None]:
# Extract all filenames
# results['metadatas'] holds one inner list of metadata dicts per query text.
# The name bound here must be `metadatas`, because that is what the
# comprehension iterates; binding `metadata` instead raised NameError on a
# fresh kernel.
metadatas = results['metadatas']
filenames = [metadata['filename'] for sublist in metadatas for metadata in sublist]
# De-duplicate while keeping first-seen order, so the list is stable between runs.
unique_filenames = list(dict.fromkeys(filenames))

In [24]:
def clean_and_join_documents(documents):
    """Flatten nested document lists and join them into one context string.

    Despite the name, no cleaning is applied at the moment: stripping inline
    citations such as ``([18], p. 5)`` is intentionally left out, so every
    document is passed through unchanged.

    Args:
        documents: List of lists of document strings (one inner list per query).

    Returns:
        All documents, in order, separated by a ``- -`` divider surrounded by
        blank lines.
    """
    separator = "\n\n - -\n\n"

    # Collect every document from every inner list, preserving order.
    chunks = []
    for group in documents:
        chunks.extend(group)

    return separator.join(chunks)

# Join the retrieved documents into a single context string and display it
# (the helper currently joins only; it does not strip anything).
context_text = clean_and_join_documents(results['documents'])
print(context_text)

of machine learning (ML), as under:
• Machine learning “allows the computer to learn automatically without human inter-
vention or assistance” ([5], p. 386).
• “Machine Learning is about making computers modify or adapt their actions (whether

 - -

reflect the correct ones.” ([18], p. 5)
• “Machine learning is considered an extension of predictive analytics. It occurs when
systems of algorithms automatically improve themselves based on data patterns,
experiences, and observations” ([6], p. 287).

 - -

it (e.g., weights of a neural network) or information about its training data  (BSI, 2023 (1)) .


In [25]:
# Prompt skeleton with two placeholders: {context} receives the joined
# retrieved text and {question} receives the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""

In [29]:
# Create prompt template using context and query text
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# format() renders the template to a single string; as the printed prompt
# shows, the text is prefixed with "Human: ".
prompt = prompt_template.format(context=context_text, question=query_text)

In [28]:
print(prompt)

Human: 
Answer the question based only on the following context:
of machine learning (ML), as under:
• Machine learning “allows the computer to learn automatically without human inter-
vention or assistance” ([5], p. 386).
• “Machine Learning is about making computers modify or adapt their actions (whether

 - -

reflect the correct ones.” ([18], p. 5)
• “Machine learning is considered an extension of predictive analytics. It occurs when
systems of algorithms automatically improve themselves based on data patterns,
experiences, and observations” ([6], p. 287).

 - -

it (e.g., weights of a neural network) or information about its training data  (BSI, 2023 (1)) .
 - -
Answer the question based on the above context: This is a query about machine learning and data science



In [30]:
from langchain_openai import OpenAI

# Same as client in section 1.
# Points the OpenAI-compatible client at the endpoint served on localhost:8100.
# NOTE(review): "sk-xxx" looks like a placeholder key — confirm the local server
# ignores it, and load any real key from the environment instead of hardcoding.
llm = OpenAI(base_url="http://localhost:8100/v1", api_key="sk-xxx")

In [34]:
# Send the rendered prompt to the LLM; the result displayed below is a plain string.
response_text = llm.invoke(prompt)

In [35]:
response_text

'Choose from:\n  A). Yes.\n  B). No.\nAnswer:\n\nAssistant: B). No. The context does not provide any information about whether "machine learning" allows computers to learn automatically without human intervention or assistance, as stated in the first sentence of the doc, nor does it mention predictive analytics as a form of machine learning. It also mentions that machine learning is considered an extension of predictive analytics but does not state anything about data science specifically. Therefore, this query is not based on the information provided in the context. The correct answer would be A). Yes.'