In [34]:
## Initialize a connection to the running Chroma DB server
import chromadb
import os

## Select how this notebook reaches the Chroma DB server.
## Exactly one client is constructed, so `chroma_client` is never silently
## overwritten by a second assignment.
##   True  -> connect from within CML
##   False -> connect from outside CML through the Application URL in HOST
CONNECT_FROM_WITHIN_CML = False

## Only used when connecting from outside CML
## USER TO DO: Set host with the URL to the application where the server is hosted CML--> Applications
## USER TO DO: Enable unauthenticated app access for the Application
HOST = 'https://cml-chroma-server-ysovdh.ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/'

if CONNECT_FROM_WITHIN_CML:
    chroma_client = chromadb.Client()
else:
    chroma_client = chromadb.HttpClient(host=HOST)

In [35]:
from chromadb.utils import embedding_functions

# Embedding model used both when documents are stored and when queries are embedded
EMBEDDING_MODEL_REPO = "sentence-transformers/all-mpnet-base-v2"
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
EMBEDDING_FUNCTION = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

# The collection name is supplied through the environment; fail early with a
# clear message instead of passing None on to the Chroma client.
COLLECTION_NAME = os.getenv('COLLECTION_NAME')
if not COLLECTION_NAME:
    raise ValueError("Environment variable COLLECTION_NAME must be set to the name of the Chroma collection")

print("initialising Chroma DB connection...")

print(f"Getting '{COLLECTION_NAME}' as object...")
try:
    # Single lookup: the returned object is kept rather than fetched a second time
    collection = chroma_client.get_collection(name=COLLECTION_NAME, embedding_function=EMBEDDING_FUNCTION)
    print("Success")
except Exception:
    # Any lookup failure is treated as "collection does not exist yet".
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
    print("Creating new collection...")
    collection = chroma_client.create_collection(name=COLLECTION_NAME, embedding_function=EMBEDDING_FUNCTION)
    print("Success")

# Get latest statistics from index
current_collection_stats = collection.count()
print('Total number of embeddings in Chroma DB index is ' + str(current_collection_stats))


initialising Chroma DB connection...
Getting 'cml-default' as object...
Success
Total number of embeddings in Chroma DB index is 14


In [36]:
## Sample add to Chroma vector DB
## The file path doubles as the document id. `upsert` is used instead of `add`
## so that re-running this cell (e.g. Restart & Run All) updates the existing
## entry rather than attempting to insert the same id a second time.
file_path = '/example/of/file/path/to/doc.txt'
classification = "public"
text = "This is a sample document which would represent content for a semantic search."

collection.upsert(
    documents=[text],
    metadatas=[{"classification": classification}],
    ids=[file_path]
)

In [37]:
## Semantic search against the Chroma vector DB
## Asks for the two entries closest to the question text.
## Optional filters that can be passed to `query` as extra keyword arguments:
##   where={"metadata_field": "is_equal_to_this"}
##   where_document={"$contains":"search_string"}
results = collection.query(query_texts=["What is Apache Iceberg?"], n_results=2)
print(results)

{'ids': [['/home/cdsw/3_job-populate-sample-vectors/sample-data/iceberg/iceberg-snippet.txt', '/home/cdsw/3_job-populate-sample-vectors/sample-data/ozone/ozone-snippet.txt']], 'distances': [[1.1674057912671203, 1.400256050073526]], 'embeddings': None, 'metadatas': [[{'classification': 'public'}, {'classification': 'public'}]], 'documents': [["Apache Iceberg is an open table format for huge analytic datasets. Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table.\n\nUser experience\nIceberg avoids unpleasant surprises. Schema evolution works and won't inadvertently un-delete data. Users don't need to know about partitioning to get fast queries.\n\nSchema evolution supports add, drop, update, or rename, and has no side-effects\nHidden partitioning prevents user mistakes that cause silently incorrect results or extremely slow queries\nPartition layout evolution can update the l

In [38]:
# Helper function to return the Knowledge Base doc based on Knowledge Base ID (relative file path)
def load_context_chunk_from_data(id_path):
    """Return the full text of the knowledge-base document stored at `id_path`.

    Args:
        id_path: Path of the file to read; the notebook uses the Chroma
            document id, which is the file path of the source document.

    Returns:
        The entire file content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's default locale encoding
    with open(id_path, "r", encoding="utf-8") as f:
        return f.read()
    
## Clean up output and display full file
## `results` holds one inner list per query text; only one query was issued,
## so index 0 selects its hits.
result_ids = results['ids'][0]
result_metadatas = results['metadatas'][0]
result_documents = results['documents'][0]

for i, (file_path, metadata, document) in enumerate(zip(result_ids, result_metadatas, result_documents)):
    classification = metadata['classification']

    # The id is used as a file path; it may not exist on this machine
    # (e.g. the sample entry with a placeholder path), so a failed read is
    # reported for that result instead of aborting the whole cell.
    try:
        full_document = load_context_chunk_from_data(file_path)
    except OSError as read_error:
        full_document = f"<could not read file: {read_error}>"

    print("------------- RESULT " + str(i+1) + " ----------------\n")
    print(f"FILE PATH: {file_path}")
    print(f"CLASSIFICATION: {classification}")
    print(f"DOCUMENT: {document}\n")
    print(f"FULL DOCUMENT (FROM FILE): {full_document}\n")


------------- RESULT 1 ----------------

FILE PATH: /home/cdsw/3_job-populate-sample-vectors/sample-data/iceberg/iceberg-snippet.txt
CLASSIFICATION: public
DOCUMENT: Apache Iceberg is an open table format for huge analytic datasets. Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table.

User experience
Iceberg avoids unpleasant surprises. Schema evolution works and won't inadvertently un-delete data. Users don't need to know about partitioning to get fast queries.

Schema evolution supports add, drop, update, or rename, and has no side-effects
Hidden partitioning prevents user mistakes that cause silently incorrect results or extremely slow queries
Partition layout evolution can update the layout of a table as data volume or query patterns change
Time travel enables reproducible queries that use exactly the same table snapshot, or lets users easily examine changes
Version r