In [83]:
import chromadb

from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction, DefaultEmbeddingFunction


In [2]:

chroma_client=chromadb.Client()


# client创建/删除/提取collection

## creat_collection

如果已存在这个name的collection，会报错。


In [3]:
collection1=chroma_client.create_collection(name='test1')
collection2=chroma_client.create_collection(name='test2')
collection3=chroma_client.create_collection(name='test3')


## list_collections()


client的所有collection，并输出他们的names。collection的结构。

In [10]:
collection_total = chroma_client.list_collections()
print(collection_total)

print(collection_total[0])
print(collection_total[0].name)
print(collection_total[0].id)
print(collection_total[0].metadata)

collection_names = [collection.name for collection in collection_total]
print(collection_names)

[Collection(name=test2), Collection(name=test1), Collection(name=test3)]
name='test2' id=UUID('a814bafe-9094-410e-8d01-a7add573df73') metadata=None tenant='default_tenant' database='default_database'
test2
a814bafe-9094-410e-8d01-a7add573df73
None
['test2', 'test1', 'test3']


## get/get_or_create/delete


In [11]:
collection_test1 = chroma_client.get_collection(name='test1')
print(collection_test1)

name='test1' id=UUID('b6c8a2f1-d71f-40ed-8dc8-d093b6d81853') metadata=None tenant='default_tenant' database='default_database'


担心用create_collection回创建重复的collection；或者用get_collection回提取不存在的collection，则可以用get_or_create_collection。

In [12]:
collection_test4 = chroma_client.get_or_create_collection(name='test4')
print(collection_test4)


name='test4' id=UUID('5f79e41b-0a5e-44ba-a9b7-a00e38c4d530') metadata=None tenant='default_tenant' database='default_database'


删除collection，但如果这个collection本来就不存在，delete会报错。

In [23]:
coll = chroma_client.get_or_create_collection(name='test4') 
chroma_client.delete_collection(name='test4')

print(chroma_client.list_collections())

[Collection(name=test2), Collection(name=test1), Collection(name=test3)]


## 设置collection的embedding_function


In [33]:

emb_func = SentenceTransformerEmbeddingFunction()


collection=chroma_client.get_or_create_collection(name='test2',embedding_function=emb_func)

print(collection)

name='test2' id=UUID('a814bafe-9094-410e-8d01-a7add573df73') metadata=None tenant='default_tenant' database='default_database'


In [40]:
list_coll=chroma_client.list_collections()
names=[coll.name for coll in list_coll]
print(names)

if 'test4' in names:
    chroma_client.delete_collection(name='test4')
                                    
chroma_client.create_collection(name='test4',embedding_function=emb_func)

coll=chroma_client.get_collection(name='test4')
print(coll)


['test4', 'test2', 'test1', 'test3']
name='test4' id=UUID('ddeabe61-28a6-44c1-aa7b-2f90641b7473') metadata=None tenant='default_tenant' database='default_database'


## 修改coll的距离函数

collection允许用户自行切换距离计算函数，方法是通过设置cellection的metadata中的“hnsw:space”：

chroma默认的距离计算是squared L2公式：（用’l2‘调用）
$d = \sum\left(A_i-B_i\right)^2$

Inner product公式：（用‘ip’调用）
$d = 1.0 - \sum\left(A_i \times B_i\right) $

Cosine similarity的公式：（用‘consine’参数调用）
$d = 1.0 - \frac{\sum\left(A_i \times B_i\right)}{\sqrt{\sum\left(A_i^2\right)} \cdot \sqrt{\sum\left(B_i^2\right)}}$


In [51]:

list_coll=chroma_client.list_collections()
names=[coll.name for coll in list_coll]
if 'test5' in names:
    chroma_client.delete_collection(name='test5')
print(chroma_client.list_collections())

collection = chroma_client.create_collection(
      name="test5",
      # metadata={"hnsw:space": "cosine"} # l2 is the default
      # metadata={"hnsw:space": "ip"} # l2 is the default
      metadata={"hnsw:space": "l2"} # l2 is the default
  )

print(chroma_client.list_collections())
print(collection)

[Collection(name=test2), Collection(name=test1), Collection(name=test3), Collection(name=test4)]
[Collection(name=test5), Collection(name=test2), Collection(name=test1), Collection(name=test3), Collection(name=test4)]
name='test5' id=UUID('80f5877c-cb3d-4d2d-9cd6-55be890d15e5') metadata={'hnsw:space': 'l2'} tenant='default_tenant' database='default_database'


# collection基本操作

## add
collection.add() 添加documents，ids是文档的唯一ID，不能重复。

embeddings（可选）: 如果不传该参数，将根据Collection设置的embedding_function进行计算。  
metadatas（可选）：要与嵌入关联的元数据。在查询时，您可以根据这些元数据进行过滤。  
documents（可选）：与该嵌入相关联的文档，甚至可以不放文档。  


In [54]:
emb_func = SentenceTransformerEmbeddingFunction()


collection=chroma_client.get_or_create_collection(name='test1',embedding_function=emb_func)
print(collection)

collection.add(
    documents=["This is a document about engineer", "This is a document about steak"],
    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
    ids=["id1", "id2"]
)

collection.metadata={"hnsw:space": "ip"}
print(collection)


Add of existing embedding ID: id1
Add of existing embedding ID: id2
Insert of existing embedding ID: id1
Insert of existing embedding ID: id2


name='test1' id=UUID('b6c8a2f1-d71f-40ed-8dc8-d093b6d81853') metadata={'hnsw:space': 'cosine'} tenant='default_tenant' database='default_database'
name='test1' id=UUID('b6c8a2f1-d71f-40ed-8dc8-d093b6d81853') metadata={'hnsw:space': 'ip'} tenant='default_tenant' database='default_database'


## get

In [55]:
collection.get(ids=['id1'])


{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [{'source': 'doc1'}],
 'documents': ['This is a document about engineer'],
 'uris': None,
 'data': None}

In [None]:
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id3", "id4"]
)

## update修改items

如果update的id不存在，对coll不更新。

In [72]:
collection.update(
        documents=["id3id3id3", "id4id4id4"],
    metadatas=[{"source": "id3id3"}, {"source": "id4id4"}],
    ids=["id6", "id7"]
)
collection.get()
    

Update of nonexisting embedding ID: id6
Update of nonexisting embedding ID: id7
Update of nonexisting embedding ID: id6
Update of nonexisting embedding ID: id7


{'ids': ['id1', 'id2', 'id3', 'id4'],
 'embeddings': None,
 'metadatas': [{'source': 'id1'},
  {'source': 'doc2'},
  {'source': 'id3id3'},
  {'source': 'id4id4'}],
 'documents': ['id1',
  'This is a document about steak',
  'id3id3id3',
  'id4id4id4'],
 'uris': None,
 'data': None}

In [64]:
collection.update(
    documents=["id3id3id3", "id4id4id4"],
    metadatas=[{"source": "id3id3"}, {"source": "id4id4"}],
    ids=["id3", "id4"]
)

In [65]:
collection.get()


{'ids': ['id1', 'id2', 'id3', 'id4'],
 'embeddings': None,
 'metadatas': [{'source': 'doc1'},
  {'source': 'doc2'},
  {'source': 'id3id3'},
  {'source': 'id4id4'}],
 'documents': ['This is a document about engineer',
  'This is a document about steak',
  'id3id3id3',
  'id4id4id4'],
 'uris': None,
 'data': None}

## delete



In [74]:
collection.delete(ids=['id1'])


In [75]:
collection.get()


{'ids': ['id2', 'id3', 'id4'],
 'embeddings': None,
 'metadatas': [{'source': 'doc2'}, {'source': 'id3id3'}, {'source': 'id4id4'}],
 'documents': ['This is a document about steak', 'id3id3id3', 'id4id4id4'],
 'uris': None,
 'data': None}

## upsert

更新item，如果不存在就创建。


In [76]:
collection.upsert(
    documents=["id1"],
    metadatas=[{"source": "id1"}],
    ids=["id1"]
)
collection.get()

{'ids': ['id1', 'id2', 'id3', 'id4'],
 'embeddings': None,
 'metadatas': [{'source': 'id1'},
  {'source': 'doc2'},
  {'source': 'id3id3'},
  {'source': 'id4id4'}],
 'documents': ['id1',
  'This is a document about steak',
  'id3id3id3',
  'id4id4id4'],
 'uris': None,
 'data': None}

## embedding_func



In [None]:

default_emb_func=DefaultEmbeddingFunction()
embedding_function = SentenceTransformerEmbeddingFunction()


ans=default_emb_func(['hi'])
print(type(ans))
print(ans)
print(len(ans[0])) # 384


## query查询


In [80]:
results = collection.query(
    query_texts=["今天天气好吗？"],
    n_results=2
)

print(results)


{'ids': [['id2', 'id3']], 'distances': [[1.7640920877456665, 1.9583094120025635]], 'metadatas': [[{'source': 'doc2'}, {'source': 'id3id3'}]], 'embeddings': None, 'documents': [['This is a document about steak', 'id3id3id3']], 'uris': None, 'data': None}


或者使用query_embeddings查询
