In [15]:
# Cell 1: Setup and Paths
# Defines paths to ToolBench data and WS-DREAM QoS matrices. Also sets the output directory.
from pathlib import Path
import json, re, random
import pandas as pd
import numpy as np

TOOLBENCH_ROOT = Path("/Users/ishwaryapns/Documents/Thesis/MAOF/data/raw/toolbench/data/toolenv/tools")
WSD_RT = Path("../raw/wsdream/dataset1/rtMatrix.txt")
WSD_TP = Path("../raw/wsdream/dataset1/tpMatrix.txt")

OUT_DIR = Path("../processed/api_catalog_sample_balanced")
OUT_DIR.mkdir(parents=True, exist_ok=True)

NO_QOS_OUT = OUT_DIR / "api_repo.no_qos.jsonl"
WITH_QOS_OUT = OUT_DIR / "api_repo.with_qos.jsonl"
BALANCED_COUNTS = OUT_DIR / "api_repo.balanced_counts.csv"

random.seed(42)
print("ToolBench root:", TOOLBENCH_ROOT.resolve())
print("Output dir:", OUT_DIR.resolve())


ToolBench root: /Users/ishwaryapns/Documents/Thesis/MAOF/data/raw/toolbench/data/toolenv/tools
Output dir: /Users/ishwaryapns/Documents/Thesis/MAOF/data/processed/api_catalog_sample_balanced


In [16]:
# Cell 2: Helper Functions and QoS Computation
# Contains reusable functions to read files, parse JSON, extract endpoints, and compute QoS metrics.

def read_matrix(path: Path) -> pd.DataFrame:
    return pd.read_csv(path, header=None, sep=None, engine="python")

def slug(text: str) -> str:
    s = re.sub(r"[^a-zA-Z0-9]+", "_", (text or "").strip().lower()).strip("_")
    return s[:64] if s else "x"

def safe_get(d, key, default=None):
    return d.get(key, default) if isinstance(d, dict) else default

def yield_endpoints_from_tool(tool_obj: dict, tool_base: str, category: str, file_name: str):
    api_list = safe_get(tool_obj, "api_list", [])
    if not isinstance(api_list, list): return
    for ep in api_list:
        if not isinstance(ep, dict): continue
        name = safe_get(ep, "name") or ""
        url = safe_get(ep, "url") or ""
        method = safe_get(ep, "method") or ""
        desc = safe_get(ep, "description") or ""
        ep_id = f"{tool_base}_{slug(name) or 'endpoint'}"
        yield {
            "api_id": ep_id,
            "category": category,
            "name": name,
            "description": desc,
            "method": method,
            "url": url,
            "_file": file_name,
            "_tool": tool_base,
        }

def iter_endpoints_any_json(json_path: Path, category: str):
    try:
        data = json.loads(json_path.read_text())
    except Exception:
        return
    file_stem = json_path.stem
    tool_base_default = slug(file_stem)
    if isinstance(data, dict) and isinstance(data.get("api_list"), list):
        yield from yield_endpoints_from_tool(data, tool_base_default, category, json_path.name)
    elif isinstance(data, dict) and isinstance(data.get("tools"), list):
        for t in data["tools"]:
            if isinstance(t, dict):
                tool_base = safe_get(t, "standardized_name") or safe_get(t, "name") or tool_base_default
                yield from yield_endpoints_from_tool(t, slug(tool_base), category, json_path.name)
    elif isinstance(data, dict) and isinstance(data.get("endpoints"), list):
        tool_like = {
            "api_list": data["endpoints"],
            "standardized_name": safe_get(data, "standardized_name") or tool_base_default
        }
        yield from yield_endpoints_from_tool(tool_like, slug(tool_like["standardized_name"]), category, json_path.name)
    elif isinstance(data, list):
        for idx, item in enumerate(data):
            if isinstance(item, dict) and isinstance(item.get("api_list"), list):
                tool_base = safe_get(item, "standardized_name") or safe_get(item, "name") or f"{tool_base_default}_{idx}"
                yield from yield_endpoints_from_tool(item, slug(tool_base), category, json_path.name)

def compute_qos_per_column(rt_col: pd.Series, tp_col: pd.Series) -> dict:
    rt_valid = rt_col.replace(-1, np.nan).dropna()
    tp_valid = tp_col.replace(-1, np.nan).dropna()
    availability = len(rt_valid) / len(rt_col) if len(rt_col) > 0 else 0.0
    return {
        "rt_ms": float(np.median(rt_valid)) if not rt_valid.empty else None,
        "tp_rps": float(np.median(tp_valid)) if not tp_valid.empty else None,
        "availability": round(availability, 4),
        "valid_qos": not (rt_valid.empty or tp_valid.empty)
    }


In [17]:
all_endpoints = []
category_dirs = sorted([p for p in TOOLBENCH_ROOT.iterdir() if p.is_dir()])

for cat_dir in category_dirs:
    category = cat_dir.name
    for jf in sorted(cat_dir.glob("*.json")):
        for ep in iter_endpoints_any_json(jf, category):
            all_endpoints.append(ep)

print(f"Total endpoints collected: {len(all_endpoints)}")

seen = {}
unique_endpoints = []
for ep in all_endpoints:
    api_id = ep["api_id"]
    if api_id not in seen:
        seen[api_id] = 1
        unique_endpoints.append(ep)
    else:
        seen[api_id] += 1
        new_id = f"{api_id}-{seen[api_id]}"
        ep = {**ep, "api_id": new_id}
        unique_endpoints.append(ep)

print(f"Unique endpoints after de-duplication: {len(unique_endpoints)}")


Total endpoints collected: 49936
Unique endpoints after de-duplication: 49936


In [18]:
from collections import defaultdict

# Group endpoints by category
cat_to_eps = defaultdict(list)
for ep in unique_endpoints:
    cat_to_eps[ep["category"]].append(ep)

# Count API quotas
category_quota = {}
fallback_categories = []
target_total = 5825
category_count = len(cat_to_eps)

# First, assign max 118 quota to every category
target_per_cat = target_total // category_count

# Track categories with fewer endpoints than quota
for cat, eps in cat_to_eps.items():
    if len(eps) < target_per_cat:
        category_quota[cat] = len(eps)
        fallback_categories.append(cat)
    else:
        category_quota[cat] = target_per_cat

# Compute remaining quota to redistribute
allocated = sum(category_quota.values())
remaining_quota = target_total - allocated
remaining_cats = [cat for cat in cat_to_eps if cat not in fallback_categories]

# Redistribute leftover quota evenly to remaining categories
for cat in remaining_cats:
    if remaining_quota <= 0:
        break
    category_quota[cat] += 1
    remaining_quota -= 1

# Final balanced endpoints list
balanced_endpoints = []
per_category_counts = []

for cat, eps in cat_to_eps.items():
    random.shuffle(eps)
    selected_eps = eps[:category_quota[cat]]
    balanced_endpoints.extend(selected_eps)
    per_category_counts.append({"category": cat, "selected": len(selected_eps)})

print(f"Total balanced endpoints: {len(balanced_endpoints)}")


Total balanced endpoints: 5604


In [19]:
df = pd.DataFrame(unique_endpoints)
quota = 5825
per_cat_quota = quota // df["category"].nunique()

sampled = []
counts = []
for cat, group in df.groupby("category"):
    if len(group) <= per_cat_quota:
        sampled_cat = group.copy()
    else:
        sampled_cat = group.sample(per_cat_quota, random_state=42)
    sampled.append(sampled_cat)
    counts.append({"category": cat, "selected": len(sampled_cat)})

balanced_df = pd.concat(sampled).reset_index(drop=True)
pd.DataFrame(counts).to_csv(BALANCED_COUNTS, index=False)

print(f"Saved balanced category counts -> {BALANCED_COUNTS.name}")
print(pd.DataFrame(counts).head())


Saved balanced category counts -> api_repo.balanced_counts.csv
                                   category  selected
0                               Advertising       118
1  Artificial_Intelligence_Machine_Learning       118
2                                  Business       118
3                         Business_Software       118
4                                  Commerce       118


In [20]:
rt_df = read_matrix(WSD_RT)
tp_df = read_matrix(WSD_TP)

if rt_df.shape != tp_df.shape:
    raise ValueError(f"RT/TP shape mismatch: {rt_df.shape} vs {tp_df.shape}")

n_users, n_cols = rt_df.shape
print(f"WS-DREAM matrix: {n_users} users × {n_cols} API columns")


WS-DREAM matrix: 339 users × 5826 API columns


In [21]:
col_indices = list(range(n_cols))
random.shuffle(col_indices)

with NO_QOS_OUT.open("w", encoding="utf-8") as no_qos_f, \
     WITH_QOS_OUT.open("w", encoding="utf-8") as with_qos_f:

    for i, ep in enumerate(balanced_df.to_dict(orient="records")):
        base = ep  # Keeping everything including _file for future use
        no_qos_f.write(json.dumps(base, ensure_ascii=False) + "\n")

        c = col_indices[i % n_cols]
        qos = compute_qos_per_column(rt_df[c], tp_df[c])
        with_qos_f.write(json.dumps({**base, "qos": qos}, ensure_ascii=False) + "\n")

print(f"Wrote: {NO_QOS_OUT.name}  ({len(balanced_df)} entries)")
print(f"Wrote: {WITH_QOS_OUT.name} ({len(balanced_df)} entries)")


Wrote: api_repo.no_qos.jsonl  (5559 entries)
Wrote: api_repo.with_qos.jsonl (5559 entries)
