In [None]:
from databricks.sdk import WorkspaceClient
from tqdm.auto import tqdm
import json
import time

# Databricks workspace client, built from the configuration profile named "FREE".
# Used by embed_one() to query the model serving endpoint.
_w = WorkspaceClient(profile="FREE")

# Fully qualified name of the table that is both read (rows whose `embedding`
# is NULL) and updated (MERGE of the computed embeddings) further down.
TABLE_FQN = "saas_billing_analytics.prod.billing_tenant_monthly_text"


def embed_one(text: str) -> list[float]:
    """Return the embedding vector for *text* from the ``hack-embedder`` endpoint.

    The endpoint is queried with a single-element ``input`` list and the first
    vector of the response is returned. Several response layouts are accepted,
    probed in this order:

    * ``{"embeddings": [[...], ...]}``
    * ``{"data": [{"embedding": [...]}, ...]}``
    * ``{"vectors": [[...], ...]}``
    * a bare list of vectors ``[[...], ...]``

    Args:
        text: The text to embed.

    Returns:
        The first embedding vector found in the response.

    Raises:
        ValueError: If the response matches none of the layouts above (this
            includes a recognised key that holds an empty list). The raw
            body is printed first to help diagnose the actual format.
    """
    resp = _w.serving_endpoints.query(
        name="hack-embedder",
        input=[text],
    )
    # SDK response objects expose as_dict(); anything else is used as returned.
    body = resp.as_dict() if hasattr(resp, "as_dict") else resp

    if isinstance(body, dict):
        # Key probes only make sense on a mapping; on any other type
        # `"key" in body` would either raise or test element membership.
        if body.get("embeddings"):
            return body["embeddings"][0]
        data = body.get("data")
        if data and isinstance(data[0], dict) and "embedding" in data[0]:
            return data[0]["embedding"]
        if body.get("vectors"):
            return body["vectors"][0]
    elif isinstance(body, list) and body and isinstance(body[0], (list, tuple)):
        return body[0]

    # default=repr keeps the diagnostic dump from raising TypeError on bodies
    # that are not JSON-serializable, which would hide the ValueError below.
    print("Unexpected:", json.dumps(body, indent=2, default=repr))
    raise ValueError("Unexpected response format")


# Select only the chunks that do not have an embedding yet.
df = (
    spark.table(TABLE_FQN)
         .where("embedding IS NULL")
         .select("chunk_id", "text")
)

# Collect to the driver: the endpoint is called once per row from Python.
pdf = df.toPandas()
print(f"rows to embed: {len(pdf)}")

rows = []
# Walk the two needed columns in lockstep instead of building a Series per
# row with iterrows().
for chunk_id, text in tqdm(
    zip(pdf["chunk_id"], pdf["text"]),
    total=len(pdf),
    desc="Embedding rows",
):
    # int() turns the pandas/numpy integer into a plain Python int so Spark
    # can infer the column type.
    rows.append({"chunk_id": int(chunk_id), "embedding": embed_one(text)})
    time.sleep(0.1)  # small delay for rate limiting / stability

# createDataFrame cannot infer a schema from an empty list, so skip the
# write-back entirely when there was nothing to embed.
if rows:
    sdf = spark.createDataFrame(rows)
    sdf.createOrReplaceTempView("billing_tenant_embeds_onebyone")

    # Write the new embeddings back onto the matching chunk_id rows.
    spark.sql(f"""
MERGE INTO {TABLE_FQN} AS t
USING billing_tenant_embeds_onebyone AS s
ON t.chunk_id = s.chunk_id
WHEN MATCHED THEN UPDATE SET t.embedding = s.embedding
""")

print("✅ Done")
