In [3]:
from chromadb import PersistentClient
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

CHROMA_PATH = "./chroma"
COLLECTION_NAME = "data"

embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="qwen3"
)

persistent_client = PersistentClient(
    path=CHROMA_PATH,
)

collection = persistent_client.get_or_create_collection(name=COLLECTION_NAME)

vector_store = Chroma(
    client=persistent_client,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)

# Add documents to the vector store
def add_documents_to_vector_store(documents):
    vector_store.add_documents(
        documents=documents,
        ids=[doc.metadata['source_type'] + "_" + doc.metadata['source'] + "_" + str(doc.metadata['chunk_number']) for doc in documents],
    )
    print(f"Added {len(documents)} documents to the vector store.")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [1]:
import glob
from langchain_community.document_loaders import UnstructuredMarkdownLoader

def load_documents(path):
    documents = []

    for file in glob.glob(path, recursive=True):
        # Load markdown files from the specified directory
        loader = UnstructuredMarkdownLoader(
            file_path=file,
            mode='single',
            source_type='markdown'
        )
    
        markdown_docs = loader.load()

        # check frontmatter. see if draft = true. if true, skip.
        if markdown_docs[0].page_content.startswith('+++'):
            frontmatter_end = markdown_docs[0].page_content.find('+++', 3)
            if frontmatter_end != -1:
                frontmatter = markdown_docs[0].page_content[3:frontmatter_end].strip()
                if 'draft = true' in frontmatter:
                    print(f"Skipping draft document: {file}")
                    continue

        documents.extend(markdown_docs)

    # if page_content for any document is non-unique, remove duplicates
    unique_documents = {doc.page_content: doc for doc in documents}
    documents = list(unique_documents.values())

    for doc in documents:
        doc.metadata['source_type'] = 'markdown'
        doc.metadata['source'] = doc.metadata.get('source', 'unknown')
        doc.metadata['chunk_number'] = 1  # Assuming single chunk for markdown files
    
    return documents

markdown_documents = load_documents('./md_content/**/*.md')

Skipping draft document: ./md_content/web_content/resources.md
Skipping draft document: ./md_content/web_content/name-contest.md
Skipping draft document: ./md_content/web_content/post/2020-dcos-maintenance.md
Skipping draft document: ./md_content/web_content/post/2023-04-computational-scientist-position.md
Skipping draft document: ./md_content/web_content/post/2019-september-maintenance-notes.md
Skipping draft document: ./md_content/web_content/post/2019-september-scratch-notes.md
Skipping draft document: ./md_content/web_content/post/2018-fall-workshops.md
Skipping draft document: ./md_content/web_content/post/2018-spring-workshops.md
Skipping draft document: ./md_content/web_content/userinfo/resources.md
Skipping draft document: ./md_content/web_content/userinfo/use-cases.md
Skipping draft document: ./md_content/web_content/userinfo/lab-computing.md
Skipping draft document: ./md_content/web_content/userinfo/secure-computing.md
Skipping draft document: ./md_content/web_content/userinf

In [None]:
# USE THIS BLOCK TO REMOVE DRAFTS AND DUPLICATES

print(f"Loaded {len(markdown_documents)} markdown documents.")

# remove documents not in markdown_documents from collection

all_valid_documents = [doc.metadata['source_type'] + "_" + doc.metadata['source'] + "_" + str(doc.metadata['chunk_number']) for doc in markdown_documents]
all_valid_documents_set = set(all_valid_documents)

documents = vector_store.get(include=["documents"])

to_remove = []
for doc, id in zip(documents["documents"], documents["ids"]):
    if id not in all_valid_documents_set:
        to_remove.append(id)
        print(f"Marked document with id {id} for removal from the vector store.")

print(f"Total documents to remove: {len(to_remove)}")
print(len(markdown_documents))
vector_store.delete(ids=to_remove)

Failed to send telemetry event CollectionDeleteEvent: capture() takes 1 positional argument but 3 were given


Loaded 937 markdown documents.
Marked document with id markdown_./md_content/web_content/resources.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/support.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/search.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/name-contest.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/thank-you.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/signup.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/post/2026-hpc-maintenance-dates.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/post/2024-september-17-open-house.md_1 for removal from the vector store.
Marked document with id markdown_./md_content/web_content/post/2

In [6]:
print('Number of Documents:', len(markdown_documents))

def doc_to_id(doc):
    return f"{doc.metadata['source_type']}_{doc.metadata['source']}_{doc.metadata['chunk_number']}"

def add_documents_to_vector_store(documents):
    existing_ids = vector_store.get()["ids"]
    existing_ids_set = set(existing_ids)
    print(f"Found {len(existing_ids_set)} existing document IDs in the vector store.")
    documents = [doc for doc in documents if doc_to_id(doc) not in existing_ids_set]
    print(f"Filtered {len(documents)} documents to add to the vector store.")
    if not documents:
        print("No new documents to add.")
        return
    print("Documents to add:", documents)
    vector_store.add_documents(
        documents=documents,
        ids=[doc_to_id(doc) for doc in documents],
    )
    print(f"Added {len(documents)} documents to the vector store")

# Add documents to the vector store in batches
BATCH_SIZE = 1
for i in range(0, len(markdown_documents), BATCH_SIZE):
    print(f"[Batch {i // BATCH_SIZE + 1} of {len(markdown_documents) // BATCH_SIZE + 1}]")
    batch = markdown_documents[i:i + BATCH_SIZE]
    add_documents_to_vector_store(batch)
    print()

Number of Documents: 1476
[Batch 1207 of 1477]
Found 1206 existing document IDs in the vector store.
Filtered 1 documents to add to the vector store.
Added 1 documents to the vector store

[Batch 1208 of 1477]
Found 1207 existing document IDs in the vector store.
Filtered 1 documents to add to the vector store.
Added 1 documents to the vector store

[Batch 1209 of 1477]
Found 1208 existing document IDs in the vector store.
Filtered 1 documents to add to the vector store.
Added 1 documents to the vector store

[Batch 1210 of 1477]
Found 1209 existing document IDs in the vector store.
Filtered 1 documents to add to the vector store.
Documents to add: [Document(metadata={'source': './md_content/learning_content/notes/hpc-from-terminal/section1.md', 'source_type': 'markdown', 'chunk_number': 1}, page_content='title: I - Intro to Unix and Bash Shell date: 2023-12-11T00:00:00-05:00 type: docs weight: 10 toc: true menu: hpc-from-terminal:\n\nUNIX\n\nUNIX is a text-oriented operating system (O