In [1]:
pip install chromadb

Collecting chromadb
  Using cached chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-macosx_10_9_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Using cached fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting numpy>=1.22.5 (from chromadb)
  Using cached numpy-2.2.3-cp311-cp311-macosx_14_0_x86_64.whl.metadata (62 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-3.18.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-

In [None]:
# EXAMPLE: minimal end-to-end demo (Ollama embeddings + ChromaDB upsert and query)

import chromadb
import requests
import json
from chromadb.api.types import EmbeddingFunction

# Embedding function that delegates the actual embedding work to an Ollama server
class OllamaEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function that obtains vectors from Ollama over HTTP.

    Every input text is sent in its own POST request, and the value under the
    ``"embedding"`` key (singular) of the JSON reply is collected.

    Args:
        url: Embeddings endpoint to POST to.
        model: Model name sent in the request payload.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``; without
            it a stalled server would block the call indefinitely.
    """

    def __init__(
        self,
        url="http://localhost:11434/api/embeddings",
        model="nomic-embed-text",
        timeout=60,
    ):
        self.url = url
        self.model = model
        self.timeout = timeout

    def __call__(self, input):
        """Return one embedding per text in ``input``, in the same order.

        Raises:
            ValueError: if a reply does not contain an ``"embedding"`` key.
        """
        embeddings = []
        for text in input:  # one request per text
            response = requests.post(
                self.url,
                json={"model": self.model, "prompt": text},
                timeout=self.timeout,
            )
            response_json = response.json()

            # The reply key is 'embedding', not 'embeddings'.
            if "embedding" not in response_json:
                raise ValueError(f"Error en la respuesta de Ollama: {response_json}")
            embeddings.append(response_json["embedding"])

        return embeddings

# Build the Ollama-backed embedding function
embedding_function = OllamaEmbeddingFunction()

# Open the ChromaDB client stored at ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Get (or create) the demo collection, wired to the custom embedding function
collection = chroma_client.get_or_create_collection(
    embedding_function=embedding_function,
    name="my_collection",
)

# Sample documents keyed by id; dicts keep insertion order, so the ids and
# documents lists built below stay aligned
example_documents = {
    "id1": "This is a document about pineapple",
    "id2": "This is a document about oranges",
    "id3": "This is a query document about Florida",
}
collection.upsert(
    ids=list(example_documents),
    documents=list(example_documents.values()),
)

# Ask for the single closest document to the query text
query_text = "This is a query document about Florida"
results = collection.query(n_results=1, query_texts=[query_text])

print(results)


In [17]:
import chromadb
import requests
import json
from chromadb.api.types import EmbeddingFunction

# Embedding function that delegates the actual embedding work to an Ollama server
class OllamaEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function that obtains vectors from Ollama over HTTP.

    Every input text is sent in its own POST request, and the value under the
    ``"embedding"`` key of the JSON reply is collected.

    Args:
        url: Embeddings endpoint to POST to.
        model: Model name sent in the request payload.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``; without
            it a stalled server would block the call indefinitely.
    """

    def __init__(
        self,
        url="http://localhost:11434/api/embeddings",
        model="nomic-embed-text",
        timeout=60,
    ):
        self.url = url
        self.model = model
        self.timeout = timeout

    def __call__(self, input):
        """Return one embedding per text in ``input``, in the same order.

        Raises:
            ValueError: if a reply does not contain an ``"embedding"`` key.
        """
        embeddings = []
        for text in input:  # one request per text
            response = requests.post(
                self.url,
                json={"model": self.model, "prompt": text},
                timeout=self.timeout,
            )
            response_json = response.json()

            if "embedding" not in response_json:
                raise ValueError(f"Error en la respuesta de Ollama: {response_json}")
            embeddings.append(response_json["embedding"])

        return embeddings

# Instantiate the Ollama-backed embedding function with its default settings
embedding_function = OllamaEmbeddingFunction()


In [18]:
# Open the ChromaDB client stored at ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Get (or create) the Markdown collection; the custom embedding function is
# passed in so documents and queries are embedded through Ollama
collection = chroma_client.get_or_create_collection(
    embedding_function=embedding_function,
    name="markdown_collection",
)


In [19]:
import re

def load_markdown_file(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return its full contents."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def chunk_text(text, chunk_size=300, separator=" "):
    """Group the paragraphs of ``text`` into chunks of at least ``chunk_size`` characters.

    The text is split on runs of two or more newlines. Paragraphs are
    accumulated in order, and a chunk is emitted as soon as the accumulated
    paragraph length reaches ``chunk_size``. Whatever remains at the end becomes
    a final, possibly shorter, chunk. Paragraphs are never split, so one long
    paragraph yields one long chunk.

    Args:
        text: Source text to split.
        chunk_size: Accumulated character count at which a chunk is closed.
            The joining separators are not counted.
        separator: String placed between the paragraphs of a chunk.

    Returns:
        List of chunk strings. Empty or whitespace-only input yields ``[]``.
    """
    # Drop empty or whitespace-only pieces (produced by leading/trailing blank
    # lines or empty input) so they never end up as empty documents.
    paragraphs = [piece for piece in re.split(r'\n\n+', text) if piece.strip()]

    chunks = []
    current_chunk = []
    current_length = 0

    for paragraph in paragraphs:
        current_chunk.append(paragraph)
        current_length += len(paragraph)

        if current_length >= chunk_size:
            chunks.append(separator.join(current_chunk))
            current_chunk = []
            current_length = 0

    # Flush the trailing paragraphs that never reached chunk_size
    if current_chunk:
        chunks.append(separator.join(current_chunk))

    return chunks


In [22]:
# Load the Markdown page to index.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
file_path = "/Users/jiabowang/Desktop/dgsi/DGSI/reto2/pagina.md"
text = load_markdown_file(file_path)

# Split the page into paragraph-based chunks that close at 300 characters
chunks = chunk_text(text, chunk_size=300)

# One id per chunk, derived from its position in the list
chunk_ids = [f"chunk_{position}" for position, _ in enumerate(chunks)]

# Store the chunks in ChromaDB (existing ids are overwritten by upsert)
collection.upsert(ids=chunk_ids, documents=chunks)

print(f"Se han insertado {len(chunks)} chunks en ChromaDB.")


Se han insertado 90 chunks en ChromaDB.


In [25]:
# Retrieve the single stored chunk closest to the query text
query_text = "FIB"
results = collection.query(
    n_results=1,
    query_texts=[query_text],
)

print(results)


{'ids': [['chunk_88']], 'embeddings': None, 'documents': [['[ ![](/sites/fib/files/images/banner-suport-fib.jpg)\n](https://peticions.utgcntic.upc.edu/tiquetspeticions/control/main?idEmpresa=103958) ##  Contacta amb la FIB El vostre nom  * La vostra adreça electrònica  * Assumpte  * Categoria  *  \\- Trieu -  Aules, equips i serveis informàtics  Bústia de\nsuggeriments  Felicitacions  Informació Acadèmica  Informació de mobilitat\nInformació dels màsters  Informació general de la FIB  Notícies al web de la\nFIB  Queixes']], 'uris': None, 'data': None, 'metadatas': [[None]], 'distances': [[352.882555181602]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
