In [1]:
import chromadb
from chromadb import Collection
from typing import Dict
import json
from os import path
client=chromadb.Client()    # in-memory vector database
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# if a collection named "test" already exists in the running kernel.
collection: Collection=client.get_or_create_collection("test")
print(collection)

def print_dict(data:Dict, title:str=""):
    """Print an optional title line, then `data` as indented JSON, then a blank line.

    Args:
        data: JSON-serialisable mapping to display.
        title: Heading printed before the JSON; a falsy value prints an empty line.
    """
    print(title or "")
    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable instead of \uXXXX escapes.
    pretty = json.dumps(data, ensure_ascii=False, indent=4)
    print(pretty)
    print()

name='test' id=UUID('ca88283c-8e82-4cc2-8c12-0629c9c0df7b') metadata=None tenant='default_tenant' database='default_database'


# 最简单的例子

In [None]:
# Simplest possible insert: only raw text is supplied, so tokenization, embedding and
# indexing are handled automatically (you could pass your own embeddings instead).
collection.add(
    # ids must be unique for each document
    ids=["doc1", "doc2"],
    documents=["This is document1", "This is document2"],
    # metadata can be filtered on later
    metadatas=[{"source": "notion"}, {"source": "google-docs"}],
)

In [2]:
# Insert the two Chinese news snippets that the query examples search over.
# Both share category "knife" and differ only in their "source" metadata.
collection.add(
    ids=["knife1", "knife2_3"],
    documents=["小李飞刀在2023年2月掷出了第一把绝命飞刀", "小李飞刀在2023年5月又掷出了第二把和第三把飞刀"],
    metadatas=[{"source": "news1", "category":"knife"}, {"source": "news2", "category":"knife"}],
)

# 演示了Collection.query
1. 演示了where查询条件
2. 演示了where_document的查询条件
3. 演示了逻辑表达式的查询条件

In [None]:
# NOTE(review): nothing in this cell uses `include` and the module is unrelated to chromadb;
# this looks like an accidental IDE auto-import. Kept so the namespace is unchanged --
# confirm no later cell needs it, then delete.
from xml.etree.ElementInclude import include

# The same question is used for every query so the effect of each filter is comparable.
query_text = "小李飞刀总共扔了几把飞刀"

# 1) Plain similarity search without any filter.
results1:Dict = collection.query(
    query_texts=[query_text],
    n_results=3,
)
print_dict(results1, "演示query的例子")

# 2) Metadata filter, short form: keep only records whose metadata field category == "knife".
results2:Dict = collection.query(
    query_texts=[query_text],
    n_results=2,
    where={"category": "knife"},
)
print_dict(results2, "演示where的简写模式")

# 3) Metadata filter, explicit-operator form of the same condition.
results3:Dict = collection.query(
    query_texts=[query_text],
    n_results=2,
    where={"category": {"$eq": "knife"}},
)
print_dict(results3, "演示where的繁写模式（返回结果同简写模式），操作符包含$eq, $ne, $gt, $gte, $lt, $lte")

# 4) Document-content filter: keep only records whose text contains "飞刀".
results4:Dict = collection.query(
    query_texts=[query_text],
    n_results=2,
    where_document={"$contains": "飞刀"},
)
# The parameter is `where_document` and its operators are $contains / $not_contains;
# the printed title now matches the real names.
print_dict(results4, "演示where_document的例子，where_document的可用操作符包含 $contains, $not_contains")

# 5) Logical combination of metadata conditions: source is news1 OR news2.
results5:Dict = collection.query(
    query_texts=[query_text],
    n_results=2,
    where={
        "$or": [
            {"source": "news1"},
            {"source": "news2"},
        ]
    },
)
# $and / $or are the logical operators; $in / $nin are inclusion operators.
print_dict(results5, "演示逻辑运算符（逻辑运算符包含 $and, $or；$in, $nin 属于包含运算符）")


# 更新文档的例子

In [None]:
# Overwrite the stored text of record 'knife1' with a longer version of the story.
collection.update(
    documents=['小李飞刀在2023年2月掷出了第一把绝命飞刀，之后因为刀被人捡走导致陷入了无刀可用的尴尬'],
    ids=['knife1'],
)

# Query again and print the best match to show the updated content.
result6:Dict=collection.query(
    n_results=1,
    query_texts="小李飞刀的第一把刀去哪里了？",
)
print_dict(result6, "更新后的新内容")

In [None]:
# Example of get(): fetch a record directly by its id instead of by a query text.
result7:Dict=collection.get(
    ids=['knife1'],
)
print_dict(result7, "get的例子")

# 数据持久化
1. 只要把类从Client改成PersistentClient就可以实现向量数据的持久化
2. 只有在创建实例时通过Settings(allow_reset=True)开启，才可以执行数据库reset（reset后的数据库会被清空）

In [5]:
# Show that a UUID rendered as text is a plain str (usable as a document id).
from uuid import uuid4
a = f"{uuid4()}"
print(type(a))

<class 'str'>


In [6]:
import chromadb
from chromadb import Collection
from chromadb.config import Settings
from typing import Optional, Sequence, List, Dict
from uuid import uuid4
_COLLECTION_NAME:str="test2"

# PersistentClient stores its data under the "chroma_db1" directory so it survives kernel
# restarts; allow_reset=True is what permits calling client2.reset() later.
client2 = chromadb.PersistentClient(path="chroma_db1", settings=Settings(allow_reset=True))

# Show which collections already exist on disk before this cell creates anything.
collection_list:Sequence[Collection]=client2.list_collections()
for c in collection_list:
    # NOTE(review): some chromadb releases return plain names here instead of
    # Collection objects -- confirm for the installed version. getattr covers both.
    print(getattr(c, "name", c))

# Fetch the collection, creating it on the first run.
collection2:Collection=client2.get_or_create_collection(_COLLECTION_NAME)

# Seed the sample documents only while the collection is empty, so re-running the cell
# (ids are fresh random UUIDs) never inserts duplicates.
if collection2.count() == 0:
    collection2.add(
        documents=["每年的4月份北方的天气都会迎来沙尘暴", "每年的4月份南方都会迎来回南天"],
        # The second document is about the south; its source was previously mistyped as "source".
        metadatas=[{"source": "north"}, {"source": "south"}], # filter on these!
        ids=[str(uuid4()), str(uuid4())], # unique for each doc
    )

ret:Dict = collection2.query(query_texts=["北方4月份的天气怎么样？"],n_results=1)
print(ret)


test2
{'ids': [['9890ff29-a97a-405d-a104-0749e8cbf884']], 'distances': [[0.415538629507901]], 'metadatas': [[{'source': 'north'}]], 'embeddings': None, 'documents': [['每年的4月份北方的天气都会迎来沙尘暴']], 'uris': None, 'data': None}


# 嵌入函数 embedding_functions 和自定义嵌入函数 EmbeddingFunction
1. 演示了如何直接把一个字符串向量化，向量化之后会返回一个浮点数列表
2. 演示了使用EmbeddingFunction的子类，自定义了一个随机数的字符串向量化类
3. 演示了使用EmbeddingFunction的子类，自定义了一个返回第一个例子向量结果的向量化类

In [29]:
from chromadb.utils import embedding_functions
from chromadb.api.types import Documents, EmbeddingFunction, Documents, Image, Document, Embeddings
from typing import List, cast
import numpy as np

# Example 1: embed a string directly; the result is a list of float vectors.
ef = embedding_functions.DefaultEmbeddingFunction()
# Embedding functions take Documents, i.e. a list of strings. Wrap the single sentence
# in a list: a bare str is itself a sequence of characters and may not be treated as
# one document.
values=ef(["张三丰是一个白胡子老头"])
print(values)

# Example 2: a custom EmbeddingFunction subclass that "embeds" text as random numbers.
class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    """Toy embedding function returning random vectors (demo only, no semantic meaning)."""

    def random_embeddings(self, count: int = 10, dim: int = 10) -> Embeddings:
        """Return `count` random vectors of length `dim`, with values drawn from [0, 1).

        The defaults reproduce the previous fixed 10 x 10 result for existing callers.
        """
        return cast(Embeddings, np.random.random(size=(count, dim)).tolist())

    def __call__(self, input: Documents) -> Embeddings:
        """Return one random vector per input document."""
        # The number of vectors must follow the number of documents; a fixed count
        # would not line up with the documents being embedded.
        return self.random_embeddings(count=len(input))

mef=MyEmbeddingFunction()
# Pass a list of documents, matching the Documents type of __call__.
temp=mef(["张三丰是一个白胡子老头"])
print(temp)

# Example 3: a custom EmbeddingFunction subclass that returns the vectors from example 1.
class MyEmbeddingFunction2(EmbeddingFunction[Documents]):
    """Embedding function that ignores its input and replays the module-level `values`.

    NOTE(review): the result does not depend on `input`, so it only lines up with the
    input when `values` holds exactly one vector per document passed -- demo use only.
    """

    def my_embedings_func(self)->Embeddings:
        """Return the global `values` typed as Embeddings."""
        return cast(Embeddings, values)

    def __call__(self, input: Documents) -> Embeddings:
        """Return the precomputed vectors regardless of `input`."""
        return self.my_embedings_func()

mef2=MyEmbeddingFunction2()
# Pass a list of documents, matching the Documents type of __call__.
temp2=mef2(["张三丰是一个白胡子老头"])
print(temp2)


[[-0.041596341878175735, -0.02468179352581501, 0.07789314538240433, -0.006149571388959885, 0.017001433297991753, 0.0012621058849617839, 0.12373962253332138, -0.001971534453332424, 0.05898107588291168, -0.04216563329100609, 0.059197306632995605, -0.07129895687103271, 0.12670378386974335, 0.02515036053955555, -0.013801413588225842, -0.06401193886995316, -0.021355368196964264, -0.0236787311732769, -0.045675911009311676, 0.0074907648377120495, -0.08093090355396271, 0.016425346955657005, 0.040202002972364426, 0.0340726412832737, -0.0914752408862114, 0.04133275896310806, -0.0028537397738546133, -0.005185537971556187, 0.062337495386600494, -0.03986141458153725, 0.0168918427079916, 0.01918470300734043, 0.005939332768321037, -0.08973685652017593, 0.018257759511470795, 0.004333149641752243, 0.017571888864040375, -0.06444104015827179, -0.06925512850284576, 0.042006965726614, -0.060051560401916504, -0.03753312677145004, 0.056896988302469254, -0.06448101252317429, 0.05673789978027344, 0.01103919465