#### A - Primeros pasos

In [1]:
!pip install chromadb -q


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import chromadb

In [6]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [45]:
chroma_client.delete_collection(name='taller')
chroma_client.delete_collection(name='PRUEBA')


In [36]:
chroma_client.list_collections()

[]

In [46]:
collection = chroma_client.create_collection(name='taller')

In [38]:
chroma_client.get_collection(name='taller')
chroma_client.get_collection(name='PRUEBA')


Exception: {"error":"ValueError('Collection PRUEBA does not exist.')"}

#### B - Cargar Texto

In [39]:
collection.add(
    documents=["Primer documento", "Segundo documento", "El Málaga CF ganó la copa en el 2002"],
    metadatas=[{"doc": "teatro"}, {"doc": "cine"}, {"doc": "futbol"}],
    ids=["id1", "id2", "id3"]
)

In [40]:
print(collection.peek()) # returns a list of the first 10 items in the collection
print(collection.count()) # returns the number of items in the collection
print(collection.modify(name="taller")) # Rename the collection
print(chroma_client.get_collection(name='taller'))

{'ids': ['id1', 'id2', 'id3'], 'embeddings': [[-0.07100208848714828, 0.0864335149526596, -0.03650020435452461, -0.028054093942046165, 0.0598457008600235, -0.015228969044983387, -0.003504471853375435, 0.1234140619635582, -0.02542947418987751, 0.06278148293495178, 0.010045631788671017, 0.07129251211881638, -0.019688034430146217, -0.012209534645080566, -0.0069046118296682835, 0.018238356336951256, -0.0030092583037912846, 0.0042021917179226875, -0.0394665002822876, 0.0026571250054985285, 0.021848032251000404, 0.0558757409453392, -0.02697009965777397, 0.05611589178442955, -0.01291851606220007, -0.0002063655701931566, -0.04772914573550224, 0.017662392929196358, 0.021142028272151947, -0.06309319287538528, 0.08110305666923523, 0.0796845480799675, 0.09326440095901489, 0.015424535609781742, 0.09473825246095657, -0.014563458040356636, 0.013671713881194592, -0.014901211485266685, 0.005987653974443674, 0.008646626956760883, 0.0774231031537056, -0.11306338757276535, -0.08445456624031067, -0.06457619

In [41]:
collection

Collection(name=taller)

In [42]:
results_2 = collection.query(
    query_texts=["estoy triste"],
    n_results=3
)
results_2

{'ids': [['id2', 'id3', 'id1']],
 'distances': [[1.2441047027564396, 1.2855822376881763, 1.5828072451572652]],
 'embeddings': None,
 'metadatas': [[{'doc': 'cine'}, {'doc': 'futbol'}, {'doc': 'teatro'}]],
 'documents': [['Segundo documento',
   'El Málaga CF ganó la copa en el 2002',
   'Primer documento']]}

In [47]:
collection_prueba = chroma_client.create_collection(
        name="PRUEBA",
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )
collection_prueba.add(
    documents=["Primer documento", "Segundo documento", "El Málaga CF ganó la copa en el 2002"],
    metadatas=[{"doc": "teatro", "tema": "documnenmts"}, {"doc": "cine","tema": "documnents"}, {"doc": "futbol","tema": "historia"}],
    ids=["id1", "id2", "id3"]
)

ValueError: Number of metadatas 4 must match number of ids 3

In [15]:
results_2 = collection_prueba.query(
    query_texts=["p documento"],
    n_results=3
)
results_2

{'ids': [['id1', 'id2', 'id3']],
 'distances': [[0.2938711451220084, 0.4104362479047716, 0.8085735880457704]],
 'embeddings': None,
 'metadatas': [[{'doc': 'teatro'}, {'doc': 'cine'}, {'doc': 'futbol'}]],
 'documents': [['Primer documento',
   'Segundo documento',
   'El Málaga CF ganó la copa en el 2002']]}

In [32]:
# query con where 
results_2 = collection_prueba.query(
    query_texts=["estoy triste", "me gusta el futbol"],
    n_results=3,
    where={"doc": "futbol"},
    where={"id": "id1"},
    where_document={"$contains":"p"}
)
results_2

SyntaxError: keyword argument repeated: where (1915477663.py, line 6)

In [21]:
results_2 = collection_prueba.query(
    query_texts=["estoy triste"],
    n_results=3
)
results_2

{'ids': [['id2', 'id3', 'id1']],
 'distances': [[0.6220523502670283, 0.6427911175924699, 0.791403622744148]],
 'embeddings': None,
 'metadatas': [[{'doc': 'cine'}, {'doc': 'futbol'}, {'doc': 'teatro'}]],
 'documents': [['Segundo documento',
   'El Málaga CF ganó la copa en el 2002',
   'Primer documento']]}

#### C - Buscar Texto

In [28]:
results = collection.query(
    query_texts=["p documento"],
    n_results=3
)
results

{'ids': [['id1', 'id2', 'id3']],
 'distances': [[0.5877422917882167, 0.8208724996042835, 1.6171471838273976]],
 'embeddings': None,
 'metadatas': [[{'doc': 'teatro'}, {'doc': 'cine'}, {'doc': 'futbol'}]],
 'documents': [['Primer documento',
   'Segundo documento',
   'El Málaga CF ganó la copa en el 2002']]}

#### D - Embedding

In [12]:
from chromadb.utils import embedding_functions

In [13]:
embedding_functions.DefaultEmbeddingFunction()

<chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 at 0x7f49f8d55930>

In [15]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from transformers<5.0.0,>=4.6.0->sentence_transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━

In [None]:
sentence_embedding = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

In [17]:
embedding_vector = sentence_embedding(["La plaza del Zócalo es la plaza principal de la ciudad"])

In [21]:
collection.add(
    embeddings=embedding_vector,
    documents=["La plaza del Zócalo es la plaza principal de la ciudad"],
    metadatas=[{"doc": "turismo"}],
    ids=["id4"]
)

In [22]:
new_collection = chroma_client.create_collection(
    name="prueba-embedding",
    embedding_function=sentence_embedding
)

In [25]:
chroma_client.list_collections()

[Collection(name=prueba-embedding)]

#### C - Borrar

In [13]:
chroma_client.delete_collection(name='taller')