In [23]:
import chromadb
from transformers import AutoTokenizer, AutoModel
import torch

In [26]:
class AlibabaEmbeddingFunction:
    def __init__(self, model_name="Alibaba-NLP/gte-multilingual-base"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def __call__(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.cpu().numpy()

chroma_client = chromadb.Client()

In [27]:
embedding_fn = AlibabaEmbeddingFunction()
client = ChromaClient(embedding_function=embedding_fn)
collection = client.create_collection("example_collection")

# Добавление данных
collection.add(
    documents=["Сергей решил уехать из страны", "Билл Клинтнон хотел эту женщину"],
    ids=["id1", "id2"]
)

ValueError: Loading Alibaba-NLP/gte-multilingual-base requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

In [17]:
results = collection.query(
    query_texts=["Найди документ про Россию"], # Chroma will embed this for you
    n_results=2 # how many results to return
)
print(results)


{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['Сергея посадили в тюрьму потому что он гей', 'Алиса в стане чудес']], 'uris': None, 'data': None, 'metadatas': [[None, None]], 'distances': [[0.6008076667785645, 0.6890753507614136]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


collection = chroma_client.create_collection(name="my_collection")

results = collection.query(
    query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
    n_results=2 # how many results to return
)
print(results)
