In [None]:
# Move the working directory up one level. A later cell imports `src.db`, so
# this presumably puts the project root on the import path — TODO confirm.
# NOTE(review): `%cd ..` is relative, so re-running this cell in the same
# kernel moves up one more level each time; run it once per kernel.
%cd ..
# Enable the autoreload extension in mode 2 for this kernel.
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv

# Load environment variables from a .env file (presumably the OpenAI API key
# used by `OpenAI()` in later cells — verify against the project's .env).
load_dotenv()

In [None]:
from src.db import AISafetyGraph

# Graph handle; it is reused at the end of the notebook to merge duplicate nodes.
g = AISafetyGraph()
# Fetch the nodes once. Later cells read node["text"] and node["id"], and use
# each node's position in this list as the request / embedding index, so the
# list must not be reordered or re-fetched between cells.
nodes = g.get_nodes()

In [None]:
import json
from openai import OpenAI

# The client created here is reused by the upload and batch-creation cell.
client = OpenAI()
input_jsonl = "batchinput.jsonl"

# Write one embeddings request per node, one JSON object per line. The
# custom_id carries the node's position in `nodes` ("req-<position>").
with open(input_jsonl, "w", encoding="utf-8") as f:
    for i, node in enumerate(nodes):
        body = {
            "model": "text-embedding-3-small",
            "input": node["text"],
            "encoding_format": "float",
        }
        request = dict(
            custom_id=f"req-{i}",
            method="POST",
            url="/v1/embeddings",
            body=body,
        )
        # print() appends the newline that terminates each JSONL record.
        print(json.dumps(request), file=f)

In [None]:
import os

create_out = "embeddings/batch_create.json"
# Create the output directory up front so a missing folder cannot make the
# metadata write fail after the batch has already been submitted.
os.makedirs(os.path.dirname(create_out), exist_ok=True)

# Upload the request file. The context manager closes the handle as soon as
# the upload call returns instead of leaving it open for the kernel's lifetime.
with open(input_jsonl, "rb") as batch_input:
    uploaded = client.files.create(file=batch_input, purpose="batch")

batch = client.batches.create(
    input_file_id=uploaded.id,
    endpoint="/v1/embeddings",
    completion_window="24h",
)
print(
    json.dumps(
        {"batch_id": batch.id, "status": batch.status, "input_file_id": uploaded.id}
    )
)

# Persist the full batch object so its id survives outside this kernel session.
payload = batch.model_dump()
with open(create_out, "w", encoding="utf-8") as f:
    json.dump(payload, f)

In [None]:
import json

from openai import OpenAI

client = OpenAI()

# Read the batch id from the metadata file written when the batch was created,
# not from the in-memory `batch` object: the batch may take up to 24h, and the
# variable is lost if the kernel is restarted in the meantime.
with open("embeddings/batch_create.json", encoding="utf-8") as f:
    batch_id = json.load(f)["id"]

# Re-run this cell to refresh `batch`; the download cell reads its output_file_id.
batch = client.batches.retrieve(batch_id)
print(batch.status)

In [None]:
from openai import OpenAI

client = OpenAI()

# Fail with a clear message if the batch has not produced an output file yet,
# instead of passing None to the files API.
if batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {batch.id} has no output file yet (status: {batch.status}); "
        "re-run the status cell once the batch has completed."
    )

file_response = client.files.content(batch.output_file_id)
# Write as UTF-8 explicitly: the reader cell opens this file with
# encoding="utf-8", so the writer must not depend on the platform default.
with open("embeddings/batch_out.jsonl", "w", encoding="utf-8") as f:
    f.write(file_response.text)

In [None]:
from pathlib import Path
import json
import gzip
from typing import Iterator


def iter_jsonl(path: str | Path) -> Iterator[dict]:
    """Yield one dict per line. Supports .gz files."""
    path = Path(path)
    opener = gzip.open if path.suffix == ".gz" else open
    mode = "rt" if path.suffix == ".gz" else "r"
    with opener(path, mode, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"{path}:{i}: {e}") from e


# The batch output file is not guaranteed to list results in request order, so
# place each embedding by the position encoded in its custom_id ("req-<i>",
# where <i> is the node's index in `nodes`) rather than by line order.
# Appending in file order could attach embeddings to the wrong nodes.
emb_by_position = {}
for item in iter_jsonl("embeddings/batch_out.jsonl"):
    position = int(item["custom_id"].removeprefix("req-"))
    emb_by_position[position] = item["response"]["body"]["data"][0]["embedding"]

# Downstream cells use the list index as the node index, so a gap (e.g. a
# request that did not produce output) would silently shift every later
# embedding. Fail loudly instead.
missing = [p for p in range(len(emb_by_position)) if p not in emb_by_position]
if missing:
    raise ValueError(
        f"batch output has gaps in custom_id positions "
        f"(first missing: req-{missing[0]})"
    )

embs = [emb_by_position[p] for p in range(len(emb_by_position))]
# Show only the count; displaying the full list of vectors floods the output.
len(embs)

In [None]:
import numpy as np
from usearch.index import Index

# Quick sanity check: index every embedding and look up the neighbours of the
# first one.
embs = np.array(embs)
# Take the dimensionality from the data instead of hard-coding 1536, so the
# cell keeps working if the embedding model (and its vector size) changes.
index = Index(ndim=embs.shape[1])
# Integer keys equal to row positions, built the same way as in
# top_duplicate_pairs_with_texts.
keys = np.arange(len(embs), dtype=np.int64)
vector = embs[0]
index.add(keys, embs)
matches = index.search(vector, 10, exact=True)

In [None]:
import numpy as np
from usearch.index import Index


def top_duplicate_pairs_with_texts(
    embs, nodes, K=30, top_n=30, metric="cos", dtype="f32", exact=False
):
    """Return the closest pairs of embeddings together with their nodes.

    Builds a usearch ``Index`` over ``embs``, queries every vector for its
    nearest neighbours, de-duplicates the resulting (i, j) pairs and returns
    the ``top_n`` pairs with the smallest distance.

    Args:
        embs: 2-D array-like of shape (n, d); converted to float32.
        nodes: sequence indexed by embedding row, so ``nodes[i]`` is the node
            for ``embs[i]``.
        K: neighbours to keep per vector. The search asks for ``K + 1``
            results to leave room for the self-match, which is skipped.
        top_n: maximum number of pairs to return.
        metric: metric name passed to ``Index``.
        dtype: storage dtype name passed to ``Index``.
        exact: passed to ``Index.search``.

    Returns:
        A list of dicts sorted by ascending distance, each with keys
        ``id_a``, ``id_b`` (row indices, ``id_a < id_b``), ``node_a``,
        ``node_b``, ``distance`` and ``similarity``. ``similarity`` is
        computed as ``1 - distance``; presumably that is only meaningful as a
        similarity for the default ``"cos"`` metric — verify before using it
        with another metric. Returns ``[]`` when there are fewer than two
        vectors, since no pair can exist.

    Raises:
        ValueError: if ``embs`` is non-empty but not two-dimensional.
    """
    embs = np.asarray(embs, dtype=np.float32)
    if embs.size == 0:
        # Nothing to compare; avoids unpacking the shape of an empty array.
        return []
    if embs.ndim != 2:
        raise ValueError(
            f"embs must be a 2-D array of shape (n, d), got shape {embs.shape}"
        )
    n, ndim = embs.shape
    if n < 2:
        # A single vector can only match itself.
        return []
    keys = np.arange(n, dtype=np.int64)

    index = Index(ndim=ndim, metric=metric, dtype=dtype)
    index.add(keys, embs)

    batch = index.search(embs, count=K + 1, exact=exact)

    # Map each unordered pair (low, high) to the smallest distance seen for
    # it; a pair can show up twice, once from each endpoint's result list.
    best_pair_dist = {}
    for i in range(len(batch)):
        k = int(keys[i])
        m = batch[i]
        for nk, dist in zip(m.keys, m.distances):
            nk = int(nk)
            if nk == k:
                continue  # skip the self-match
            pair = (k, nk) if k < nk else (nk, k)
            dist = float(dist)
            prev = best_pair_dist.get(pair)
            if prev is None or dist < prev:
                best_pair_dist[pair] = dist

    # Closest pairs first, truncated to the requested number.
    top = sorted(best_pair_dist.items(), key=lambda kv: kv[1])[:top_n]
    return [
        {
            "id_a": a,
            "id_b": b,
            "node_a": nodes[a],
            "node_b": nodes[b],
            "distance": dist,
            "similarity": 1.0 - dist,
        }
        for (a, b), dist in top
    ]


# Find the 30 closest embedding pairs, asking the index for 40 neighbours per
# vector.
pairs = top_duplicate_pairs_with_texts(embs, nodes, K=40, top_n=30)
for r in pairs:
    # Print both nodes and their scores so the merge can be reviewed in the output.
    print(
        f"[{r['id_a']}] {r['node_a']}\n[{r['id_b']}] {r['node_b']}\n"
        f"sim={r['similarity']:.4f} dist={r['distance']:.4f}\n"
    )
    # NOTE(review): every returned pair is merged, with no similarity threshold —
    # the 30 closest pairs are merged even if they are not true duplicates.
    # NOTE(review): a node can appear in several pairs; whether merge_nodes
    # tolerates an id that an earlier iteration already merged is not visible
    # here — confirm against src.db. The return value `res` is not used.
    res = g.merge_nodes(r["node_a"]["id"], r["node_b"]["id"])