In [11]:
# Build API repo JSONL (with & without QoS) from ToolBench (all categories) + WS-Dream

from pathlib import Path
import json, re, random
import pandas as pd
import numpy as np

# -------- Paths (adjust if needed) ----------
TOOLBENCH_ROOT = Path("/Users/ishwaryapns/Documents/Thesis/MAOF/data/raw/toolbench/data/toolenv/tools")

# WS-Dream matrices (txt/csv; delimiter auto-detected)
WSD_RT = Path("../raw/wsdream/dataset1/rtMatrix.txt")
WSD_TP = Path("../raw/wsdream/dataset1/tpMatrix.txt")

# Output folder and filenames
OUT_DIR = Path("../processed/api_catalog_sample10")
OUT_DIR.mkdir(parents=True, exist_ok=True)
NO_QOS_OUT = OUT_DIR / "api_repo.no_qos.jsonl"
WITH_QOS_OUT = OUT_DIR / "api_repo.with_qos.jsonl"

# Reproducible mapping of QoS columns to endpoints
random.seed(42)

print("ToolBench root:", TOOLBENCH_ROOT.resolve())
print("Output dir:", OUT_DIR.resolve())


ToolBench root: /Users/ishwaryapns/Documents/Thesis/MAOF/data/raw/toolbench/data/toolenv/tools
Output dir: /Users/ishwaryapns/Documents/Thesis/MAOF/data/processed/api_catalog_sample10


In [12]:
def read_matrix(path: Path) -> pd.DataFrame:
    """Auto-detect delimiter (comma/space/tab). No header."""
    return pd.read_csv(path, header=None, sep=None, engine="python")

def slug(text: str) -> str:
    s = re.sub(r"[^a-zA-Z0-9]+", "_", (text or "").strip().lower()).strip("_")
    return s[:64] if s else "x"

def safe_get(d, key, default=None):
    return d.get(key, default) if isinstance(d, dict) else default

def yield_endpoints_from_tool(tool_obj: dict, tool_base: str, category: str, file_name: str):
    api_list = safe_get(tool_obj, "api_list", [])
    if not isinstance(api_list, list):
        return
    for ep in api_list:
        if not isinstance(ep, dict):
            continue
        name = safe_get(ep, "name") or ""
        url = safe_get(ep, "url") or ""
        method = safe_get(ep, "method") or ""
        desc = safe_get(ep, "description") or ""

        ep_id = f"{tool_base}_{slug(name) or 'endpoint'}"

        yield {
            "api_id": ep_id,
            "category": category,
            "name": name,
            "description": desc,
            "method": method,
            "url": url,
            "_file": file_name,
            "_tool": tool_base,
        }

def iter_endpoints_any_json(json_path: Path, category: str):
    try:
        data = json.loads(json_path.read_text())
    except Exception:
        return

    file_stem = json_path.stem
    tool_base_default = slug(file_stem)

    if isinstance(data, dict) and isinstance(data.get("api_list"), list):
        tool_base = safe_get(data, "standardized_name") or tool_base_default
        yield from yield_endpoints_from_tool(data, tool_base, category, json_path.name)
        return

    if isinstance(data, dict) and isinstance(data.get("tools"), list):
        for t in data["tools"]:
            if not isinstance(t, dict): 
                continue
            tool_base = safe_get(t, "standardized_name") or safe_get(t, "name") or tool_base_default
            yield from yield_endpoints_from_tool(t, slug(tool_base), category, json_path.name)
        return

    if isinstance(data, dict) and isinstance(data.get("endpoints"), list):
        tool_like = {
            "api_list": data["endpoints"],
            "standardized_name": safe_get(data, "standardized_name") or tool_base_default
        }
        yield from yield_endpoints_from_tool(tool_like, slug(tool_like["standardized_name"]), category, json_path.name)
        return

    if isinstance(data, list):
        for idx, item in enumerate(data):
            if isinstance(item, dict) and isinstance(item.get("api_list"), list):
                tool_base = safe_get(item, "standardized_name") or safe_get(item, "name") or f"{tool_base_default}_{idx}"
                yield from yield_endpoints_from_tool(item, slug(tool_base), category, json_path.name)
        return

def compute_qos_per_column(rt_col: pd.Series, tp_col: pd.Series) -> dict:
    """
    Compute median RT/TP and availability. If no valid values, return nulls and 0 availability.
    """
    rt_valid = rt_col.replace(-1, np.nan).dropna()
    tp_valid = tp_col.replace(-1, np.nan).dropna()
    availability = len(rt_valid) / len(rt_col) if len(rt_col) > 0 else 0.0

    return {
        "rt_ms": float(np.median(rt_valid)) if not rt_valid.empty else None,
        "tp_rps": float(np.median(tp_valid)) if not tp_valid.empty else None,
        "availability": round(availability, 4),
        "valid_qos": not (rt_valid.empty or tp_valid.empty)
    }



In [13]:
# Walk all category folders and collect endpoints
all_endpoints = []
category_dirs = sorted([p for p in TOOLBENCH_ROOT.iterdir() if p.is_dir()])

for cat_dir in category_dirs:
    category = cat_dir.name
    for jf in sorted(cat_dir.glob("*.json")):
        for ep in iter_endpoints_any_json(jf, category):
            all_endpoints.append(ep)

print(f"Total endpoints collected across ALL categories: {len(all_endpoints)}")
if not all_endpoints:
    raise SystemExit("No endpoints found. Check TOOLBENCH_ROOT path or JSON shapes.")


Total endpoints collected across ALL categories: 5753


In [14]:
seen = {}
unique_endpoints = []
for ep in all_endpoints:
    api_id = ep["api_id"]
    if api_id not in seen:
        seen[api_id] = 1
        unique_endpoints.append(ep)
    else:
        seen[api_id] += 1
        new_id = f"{api_id}-{seen[api_id]}"
        ep = {**ep, "api_id": new_id}
        unique_endpoints.append(ep)

print(f"Unique endpoints after ID de-duplication: {len(unique_endpoints)}")


Unique endpoints after ID de-duplication: 5753


In [15]:
rt_df = read_matrix(WSD_RT)
tp_df = read_matrix(WSD_TP)

if rt_df.shape != tp_df.shape:
    raise ValueError(f"RT/TP shape mismatch: {rt_df.shape} vs {tp_df.shape}")

n_users, n_cols = rt_df.shape
print(f"WS-Dream matrix: {n_users} users × {n_cols} API columns")


WS-Dream matrix: 339 users × 5826 API columns


In [16]:
col_indices = list(range(n_cols))
random.shuffle(col_indices)

assigned = 0
skipped = 0

with NO_QOS_OUT.open("w", encoding="utf-8") as no_qos_f, \
     WITH_QOS_OUT.open("w", encoding="utf-8") as with_qos_f:

    for i, ep in enumerate(unique_endpoints):
        base = {k: v for k, v in ep.items() if not k.startswith("_")}
        no_qos_f.write(json.dumps(base, ensure_ascii=False) + "\n")

        if i < len(col_indices):
            c = col_indices[i]
            qos = compute_qos_per_column(rt_df[c], tp_df[c])
            with_qos_f.write(json.dumps({**base, "qos": qos}, ensure_ascii=False) + "\n")
            assigned += 1
        else:
            skipped += 1

print(f"Wrote: {NO_QOS_OUT.name}  (all {len(unique_endpoints)} endpoints)")
print(f"Wrote: {WITH_QOS_OUT.name} ({assigned} endpoints with QoS attached)")
if skipped:
    print(f"Endpoints without QoS (no column left or invalid column): {skipped}")


Wrote: api_repo.no_qos.jsonl  (all 5753 endpoints)
Wrote: api_repo.with_qos.jsonl (5753 endpoints with QoS attached)


In [17]:
no_qos_lines = sum(1 for _ in NO_QOS_OUT.open())
with_qos_lines = sum(1 for _ in WITH_QOS_OUT.open())
print("no_qos.jsonl lines:", no_qos_lines)
print("with_qos.jsonl lines:", with_qos_lines)

print("\nSample no_qos:")
for i, line in enumerate(NO_QOS_OUT.open()):
    if i == 3: break
    print(line.strip())

print("\nSample with_qos:")
for i, line in enumerate(WITH_QOS_OUT.open()):
    if i == 3: break
    print(line.strip())


no_qos.jsonl lines: 5753
with_qos.jsonl lines: 5753

Sample no_qos:
{"api_id": "1_cent_sms_sendsms", "category": "Communication", "name": "SendSMS", "description": "Send an SMS to the USA or Canada for 1 cent, the payload should be something like;\n\n{\n\t\"text\": \"Your Authentication code is 14456\",\n\t\"phone\": \"+17047037094\"\n}", "method": "POST", "url": "https://1-cent-sms.p.rapidapi.com/default/SMSLambda"}
{"api_id": "2factor_authentication_india_send_transactional_sms", "category": "Communication", "name": "Send Transactional SMS", "description": "Send Single / Bulk Transactional Messages / Schedule SMS", "method": "POST", "url": "https://2factor.p.rapidapi.com/API//V1/293832-67745-11e5-88de-5600000c6b13/ADDON_SERVICES/SEND/TSMS"}
{"api_id": "31events_send_native_calendar_invites_accountcreate", "category": "Communication", "name": "AccountCreate", "description": "", "method": "POST", "url": "https://pinke01-31events-auth.p.rapidapi.com/account"}

Sample with_qos:
{"api_id"