# Chromadb

In [3]:
import chromadb
import os

# Opt in to destructive resets; intended for the `clientlocal.reset()` cell further down.
# NOTE(review): presumably has to be set before any Chroma client is created — confirm.
os.environ["ALLOW_RESET"] = "TRUE"

## 1. 内存建库

In [3]:
# In-memory client (see the section heading): used for the quick-start cells below.
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when "my_collection" already exists, which happens as soon as the cell is
# executed a second time in the same kernel.
collection = client.get_or_create_collection(name="my_collection")

### 1.1 自动保存文本、向量化、词语切分、建立索引

会自动下载默认的文本向量化模型 all-MiniLM-L6-v2（保存在 `~/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx`）。

In [5]:
# Store two documents with their metadata. No embeddings are passed, so the
# text is vectorized automatically (see the section heading).
collection.add(
    ids=["id1", "id2"],
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
)

### 1.2 内容检索

In [6]:
# Ask for the two stored documents closest to the query text, then dump the raw result dict.
results = collection.query(
    n_results=2,
    query_texts=["This is a query document"],
)
print(results)

{'ids': [['id1', 'id2']], 'distances': [[0.7111214399337769, 1.0109773874282837]], 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]], 'embeddings': None, 'documents': [['This is a document', 'This is another document']], 'uris': None, 'data': None}


### 1.3 打印库信息

In [8]:
# Number of items currently stored, followed by the collection's own description.
count = collection.count()
print(count, collection, sep="\n")

2
name='my_collection' id=UUID('384e7665-e423-428a-b384-1d4b43334b68') metadata=None tenant='default_tenant' database='default_database'


## 2. 本地文件持久化建库

### 2.1 创建或者加载本地持久化库

In [5]:
# Open (or create) the on-disk database. The location can be overridden through the
# CHROMA_DB_PATH environment variable so the notebook is not tied to one machine's
# directory layout; the default is the path this notebook originally hard-coded.
CHROMA_DB_PATH = os.environ.get("CHROMA_DB_PATH", "/opt/Data/DBdata/chromadb.localfile")
clientlocal = chromadb.PersistentClient(path=CHROMA_DB_PATH)

### 2.2 数据集的创建、选择和删除

In [13]:
# Create a new collection named "my_collection" in the persistent database.
# Note: this rebinds `collection`, which previously referred to the in-memory collection.
collection = clientlocal.create_collection(name="my_collection")

In [14]:
# Look up an already existing collection by name.
collection = clientlocal.get_collection(name="my_collection")

In [11]:
# Delete the collection by name. The `collection` variable is not rebound here,
# so it keeps referring to the handle obtained before the deletion.
clientlocal.delete_collection(name="my_collection")

In [7]:
# Returns a heartbeat value in nanoseconds; useful for checking that the client is still connected.
clientlocal.heartbeat()

1712818013109857176

### 2.3 文档导入到一个数据集中

In [7]:
# Re-acquire the collection first: an earlier cell deletes "my_collection", so when
# the notebook is run top to bottom the old `collection` handle would refer to a
# collection that no longer exists and the add below would fail.
collection = clientlocal.get_or_create_collection(name="my_collection")

# No embeddings are supplied, so they are computed from the document text.
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)

Add of existing embedding ID: id1
Add of existing embedding ID: id2
Insert of existing embedding ID: id1
Insert of existing embedding ID: id2


In [10]:
# Retrieve the two nearest documents for the query text from the persistent collection
# and print the raw result dict.
results = collection.query(
    n_results=2,
    query_texts=["This is a query document"],
)
print(results)

{'ids': [['id1', 'id2']], 'distances': [[0.711121446165086, 1.010977382542355]], 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]], 'embeddings': None, 'documents': [['This is a document', 'This is another document']], 'uris': None, 'data': None}


<b>导入自带向量值的文本</b>

In [None]:
# Template (never executed): pass precomputed `embeddings` together with the documents.
# The literal `...` entries are placeholders for further items; replace them before running.
collection.add(
    documents=["doc1", "doc2", "doc3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    ids=["id1", "id2", "id3", ...]
)

### 2.4 重置库

In [4]:
# Empties and completely resets the database. ⚠️ This is destructive and not reversible.
# The first cell sets ALLOW_RESET=TRUE for this call (presumably required — confirm).
clientlocal.reset()

True

## 3 数据集查询

### 3.1 指定查询条件

In [None]:
# Template (never executed): return the `n_results` closest matches to each query embedding,
# narrowed by a metadata filter (`where`) and a document-content filter (`where_document`).
# The literal `...` is a placeholder for further query vectors.
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...],
    n_results=10,
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains":"search_string"}
)

In [None]:
# Template (never executed): each query text is first embedded with the collection's
# embedding function, then the query is performed with the generated embeddings.
# The literal `...` is a placeholder for further query texts.
collection.query(
    query_texts=["doc10", "thus spake zarathustra", ...],
    n_results=10,
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains":"search_string"}
)

In [None]:
# Template (never executed): retrieve items from a collection by id using .get,
# here combined with a `where` metadata filter. `...` is a placeholder for further ids.
collection.get(
    ids=["id1", "id2", "id3", ...],
    where={"style": "style1"}
)

In [None]:
# .get also supports the `where` and `where_document` filters; no ids are passed here.
collection.get(
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains":"search_string"}
)

### 3.2 指定返回数据项

<b>可选返回项：embeddings、documents、metadatas、distances</b>

In [None]:
# Restrict the returned fields with `include`: only documents (plus the ids) are requested.
collection.get(
    include=[ "documents" ]
)

In [None]:
# Template (never executed): the same `include` restriction applied to a query.
# The literal `...` is a placeholder for further query vectors.
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...],
    include=[ "documents" ]
)

### 3.3 使用过滤器

In [None]:
# Template (never executed): `$nin` keeps items whose metadata value is NOT in the
# given list (list values may be string, int, float, or bool).
# The literal `...` is a placeholder for further query texts.

collection.query(
    query_texts=["doc10", "thus spake zarathustra", ...],
    n_results=10,
    where={
            "metadata_field":{
                "$nin": ["value1", "value2", "value3"]
            }
          },
    where_document={"$contains":"search_string"}
)

## 4 数据集更新

以 ids 为条件进行更新

In [None]:
# Template (never executed): update existing items, selected by id.
# If an id is not found in the collection, an error will be logged and the update will be ignored.
# The literal `...` entries are placeholders for further items.

collection.update(
    ids=["id1", "id2", "id3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    documents=["doc1", "doc2", "doc3", ...],
)

In [None]:
# Template (never executed): upsert takes the same arguments as update; per the Chroma
# docs it updates items whose ids exist and adds the ones that do not.
# The literal `...` entries are placeholders for further items.
collection.upsert(
    ids=["id1", "id2", "id3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    documents=["doc1", "doc2", "doc3", ...],
)

## 5 数据集的其它操作

In [15]:
# Returns the first items in the collection (10 by default) as a dict with
# 'ids', 'embeddings', 'metadatas', 'documents', 'uris' and 'data' entries.
collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

### 5.1 查询数据项数目

In [17]:
# Returns the number of items in the collection (shown as the cell output).
collection.count()

0

In [20]:
# Rename the collection, printing it before and after the change.
# Once the rename has been applied, re-running this cell prints "new_name" both times.
print(collection)
collection.modify(name="new_name")
print(collection)

name='new_name' id=UUID('a38ebd7f-95cb-4223-99f6-e6d7a2e29bde') metadata=None tenant='default_tenant' database='default_database'
name='new_name' id=UUID('a38ebd7f-95cb-4223-99f6-e6d7a2e29bde') metadata=None tenant='default_tenant' database='default_database'


<b>指定向量距离计算函数</b>

In [21]:
# Valid options for "hnsw:space" are "l2", "ip", or "cosine":
#   "l2"     - squared L2 (the default)
#   "ip"     - inner product
#   "cosine" - cosine similarity
#
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once "collection_name" already exists in the persistent database.
collectionex = clientlocal.get_or_create_collection(
    name="collection_name",
    metadata={"hnsw:space": "cosine"},  # l2 is the default
)