In [1]:
import os
import json
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder for every artefact: whatever directory the kernel was started in.
BASE_DIR = os.getcwd()

# The persistent Chroma store and both JSON inputs sit directly under
# BASE_DIR, so resolve the three paths in a single pass.
DB_PATH, POLICIES_FILE, PRODUCTS_FILE = (
    os.path.join(BASE_DIR, leaf)
    for leaf in ("chroma_db", "policies_atomi.json", "products_atomi.json")
)

# Name of the single collection that receives policies and products.
COLLECTION_NAME = "atomi_knowledge"

# Echo one resolved path as the cell output for a quick sanity check.
PRODUCTS_FILE

'/Users/juandiego/Documents/study/keepcoding/Final Proyect/chromadb/products_atomi.json'

In [3]:
# Sentence-embedding model; its encode() output is what gets stored alongside
# each document in the Chroma collection further down.
embed_model = SentenceTransformer(model_name_or_path="BAAI/bge-m3")

Loading weights: 100%|██████████| 391/391 [00:00<00:00, 2528.70it/s, Materializing param=pooler.dense.weight]                               


In [4]:
# Open the persistent Chroma store located at DB_PATH.
client = chromadb.PersistentClient(path=DB_PATH)

# Rebuild the collection from scratch on every run so re-executing the
# notebook starts from an empty collection. Existence is checked explicitly
# instead of wrapping the delete in a bare `except: pass`, so that a genuine
# failure (or a KeyboardInterrupt) surfaces here rather than being silently
# swallowed.
# NOTE(review): some chromadb releases return plain names from
# list_collections() instead of objects with a `.name`; getattr covers both.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if COLLECTION_NAME in existing_names:
    client.delete_collection(COLLECTION_NAME)

collection = client.get_or_create_collection(COLLECTION_NAME)

In [5]:
# Load the policy records from the JSON file.
with open(POLICIES_FILE, "r", encoding="utf-8") as f:
    policies = json.load(f)

# Parallel lists, one entry per policy, in the shape collection.add() takes.
policy_texts = []
policy_ids = []
policy_metadata = []

for policy in policies:
    # A record with no (or a null) "metadata" object falls back to the same
    # defaults already applied to missing individual keys.
    meta = policy.get("metadata") or {}
    category = meta.get("category", "general")
    country = meta.get("country", "Colombia")

    # Build the document line by line. A triple-quoted f-string would carry
    # the source-code indentation into every line after the first, and that
    # whitespace would end up in the stored document and the embedded text.
    text = "\n".join([
        "Tipo: Política",
        f"Título: {policy['title']}",
        f"Contenido: {policy['content']}",
        f"Categoría: {category}",
        f"País: {country}",
    ])

    policy_texts.append(text.strip())
    policy_ids.append(policy["id"])
    policy_metadata.append({
        "type": "policy",
        "category": category
    })

# Embed all policy documents in one batch and convert to plain lists.
policy_embeddings = embed_model.encode(policy_texts).tolist()

collection.add(
    documents=policy_texts,
    embeddings=policy_embeddings,
    ids=policy_ids,
    metadatas=policy_metadata
)

In [None]:
# Load the product records from the JSON file.
with open(PRODUCTS_FILE, "r", encoding="utf-8") as f:
    products = json.load(f)

# Parallel lists, one entry per product, in the shape collection.add() takes.
product_texts = []
product_ids = []
product_metadata = []

for i, product in enumerate(products):
    # NOTE(review): the literal ".000" suffix suggests prices are stored in
    # thousands of COP -- confirm against the data file.
    # Group with "." throughout so a price of 1500 renders as
    # "1.500.000 COP" instead of the mixed "1,500.000 COP"; values below
    # 1000 render exactly as before.
    price_label = f"{product['price']:,.0f}".replace(",", ".")

    # Build the document line by line. A triple-quoted f-string would carry
    # the source-code indentation into every line after the first, and that
    # whitespace would end up in the stored document and the embedded text.
    text = "\n".join([
        "Tipo: Producto",
        f"Nombre: {product['name']}",
        f"Precio: {price_label}.000 COP",
        f"Descripción: {product['description']}",
    ])

    product_texts.append(text.strip())
    # Products get a positional id derived from their index in the file.
    product_ids.append(f"product_{i}")
    product_metadata.append({
        "type": "product",
        "price": product["price"]
    })

# Embed all product documents in one batch and convert to plain lists.
product_embeddings = embed_model.encode(product_texts).tolist()

collection.add(
    documents=product_texts,
    embeddings=product_embeddings,
    ids=product_ids,
    metadatas=product_metadata
)

In [7]:

# Sanity check after ingestion: where the store lives, which collections
# exist, and how many documents the target collection now holds.
print(
    DB_PATH,
    f"Colecciones: {[col.name for col in client.list_collections()]}",
    f"Total de documentos en '{COLLECTION_NAME}': {collection.count()}",
    sep="\n",
)

/Users/juandiego/Documents/study/keepcoding/Final Proyect/chromadb/chroma_db
Colecciones: ['atomi_knowledge']
Total de documentos en 'atomi_knowledge': 59
