In [1]:
import ollama
import chromadb
import os
import ollama
from bertopic import BERTopic
import itertools
from dataclasses import dataclass
from collections.abc import Generator

In [2]:
# Ollama model tag passed to ollama.embed by generate_embeddings and qq.
# Earlier candidate, kept for reference:
# MODEL = "llama3.2:1b nomic-embed-text"
MODEL = "nomic-embed-text:latest"

In [3]:
@dataclass
class Doc:
    """A single Markdown note loaded from disk.

    Attributes:
        id: Unique identifier, stored as a string (read_md_files builds it
            from a running counter via ``str(...)`` and it is later passed
            to the Chroma collection as the record id).
        name: File name of the note, including the ``.md`` extension.
        path: Path to the file, joined from the walked directory and ``name``.
        contents: Text of the file with leading/trailing whitespace stripped.
    """

    id: str
    name: str
    path: str
    contents: str

In [4]:
def read_md_files(directory: str) -> Generator[Doc]:
    """Yield a ``Doc`` for every non-empty Markdown file under ``directory``.

    The tree is traversed with ``os.walk``. Any directory whose path contains
    one of the ignore markers (``.trash``, ``.archive``, ``.obsidian``) is
    skipped, and matching subdirectories are pruned so the walk never
    descends into them.

    Args:
        directory: Root folder to scan.

    Yields:
        One ``Doc`` per ``.md`` file whose stripped contents are non-empty.
        Ids are consecutive integers rendered as strings ("0", "1", ...),
        assigned in the order the files are encountered.

    Raises:
        UnicodeDecodeError: If a Markdown file is not valid UTF-8.
    """
    ignore = {".trash", ".archive", ".obsidian"}
    counter = itertools.count()

    def _is_ignored(path: str) -> bool:
        # Substring match: "old.archive" is ignored just like ".archive".
        return any(marker in path for marker in ignore)

    for root, dirs, files in os.walk(directory):
        # os.walk is top-down by default, so trimming `dirs` in place stops
        # it from descending into ignored trees at all.
        dirs[:] = [d for d in dirs if not _is_ignored(d)]

        # NOTE(review): `root.startswith(".")` tests the whole walked path, so
        # it only triggers when `directory` itself starts with "." (e.g. "."
        # or "./notes"), in which case nothing is yielded. Kept as-is to
        # preserve behaviour -- confirm whether hidden-folder skipping was
        # the intent.
        if _is_ignored(root) or root.startswith("."):
            continue

        for file in files:
            if not file.endswith(".md"):
                continue
            path = os.path.join(root, file)
            # Explicit encoding so results do not depend on the platform's
            # default locale; the file is closed before yielding.
            with open(path, "r", encoding="utf-8") as f:
                contents = f.read().strip()
            if contents != "":
                yield Doc(str(next(counter)), file, path, contents)


def generate_embeddings(docs: list[Doc]) -> list[list[float]]:
    """Embed the contents of every document with the ``MODEL`` Ollama model.

    All contents are sent in a single ``ollama.embed`` request.

    Args:
        docs: Documents to embed; only ``contents`` is used.

    Returns:
        The ``embeddings`` field of the Ollama response: one vector per
        document. The notebook pairs these positionally with ``docs``, so
        it assumes they come back in input order -- TODO confirm against
        the Ollama API. An empty ``docs`` list returns ``[]`` without
        contacting Ollama.
    """
    if not docs:
        return []
    return ollama.embed(model=MODEL, input=[d.contents for d in docs]).embeddings

In [5]:
# Root folder of the Markdown notes to index. Set the NOTES_DIR environment
# variable to point the notebook at a different folder; the fallback is the
# path this notebook was originally run against.
NOTES_DIR = os.environ.get("NOTES_DIR", "/Users/j/Notes")

docs = list(read_md_files(NOTES_DIR))
len(docs)

1060

In [6]:
# Embed the contents of every loaded document; the cell's displayed value is
# the number of vectors returned.
embeddings = generate_embeddings(docs)
len(embeddings)

1060

In [7]:
client = chromadb.Client()

# Start from a clean collection when this cell is re-run. The existence check
# is explicit rather than catching an exception from delete_collection,
# because the exception type raised for a missing collection is not the same
# in every chromadb release. list_collections() returns Collection objects in
# some releases and plain names in others, hence the getattr fallback.
if "docs" in {getattr(c, "name", c) for c in client.list_collections()}:
    client.delete_collection("docs")

collection = client.create_collection(name="docs")

In [8]:
# Store one record per document. `ids` and `documents` are built from `docs`
# in order, and this relies on `embeddings` being in that same order.
# The stored text is the file path, a newline, then the file contents.
collection.add(
    ids=[d.id for d in docs],
    embeddings=embeddings,
    documents=[f"{d.path}\n{d.contents}" for d in docs],
)

In [9]:
# Sanity check: the collection holds exactly as many records as there are embeddings.
assert collection.count() == len(embeddings)

In [10]:
# Number of distinct embedding vectors. A value below len(embeddings) means
# at least two documents produced identical vectors.
len({tuple(vector) for vector in embeddings})

1059

In [11]:
def qq(input: str, n_results: int = 1):
    """Embed a query string and look it up in ``collection``.

    Args:
        input: Query text to embed with the ``MODEL`` Ollama model.
        n_results: Number of results to request from the collection.

    Returns:
        The raw result of ``collection.query``. Its ``"distances"`` entry is
        printed as a side effect.
    """
    response = ollama.embed(model=MODEL, input=input)
    query_vectors = response.embeddings
    assert len(query_vectors) == 1  # One question, one response
    matches = collection.query(query_embeddings=query_vectors, n_results=n_results)
    print(matches["distances"], "\n")
    return matches


# Smoke test: request the three closest stored documents for a sample query.
r = qq("Hello world", 3)

# Uncomment to preview the first 200 characters of each returned document.
# for d in r["documents"][0]:
#     print(f"{d[0:200]} \n")

[[0.7730928659439087, 0.8901185989379883, 0.8913306593894958]] 

