In [None]:
# Cell 1: Setup & paths
import json
import os
from pathlib import Path

import pandas as pd

# >>> EDIT THIS to your absolute path <<<
TOOLBENCH_ROOT = Path("/Users/ishwaryapns/Documents/Thesis/LAMAS/Toolbench/Tootbench_dataset/data/toolenv/tools")

RESULTS_DIR = Path("../results/api_inventory")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

OUT_PER_CATEGORY = RESULTS_DIR / "toolbench_api_counts.by_category.csv"
OUT_PER_FILE     = RESULTS_DIR / "toolbench_api_counts.by_file.csv"  # optional/debug


In [None]:
# Cell 2: Helper to count endpoints in a ToolBench JSON
def count_endpoints_in_json(path: Path):
    """
    Return the number of endpoints in a ToolBench JSON, handling multiple shapes.
    Returns None if the file isn't readable JSON.
    """
    try:
        data = json.loads(path.read_text())
    except Exception:
        return None

    # Primary expected structure
    if isinstance(data, dict) and isinstance(data.get("api_list"), list):
        return len(data["api_list"])

    # Fallback 1: 'tools' wrapping multiple items (each with api_list)
    if isinstance(data, dict) and isinstance(data.get("tools"), list):
        return sum(
            len(t.get("api_list", [])) for t in data["tools"] if isinstance(t, dict)
        )

    # Fallback 2: 'endpoints' key
    if isinstance(data, dict) and isinstance(data.get("endpoints"), list):
        return len(data["endpoints"])

    # Fallback 3: file is a list of items (each may have api_list)
    if isinstance(data, list):
        n = 0
        for item in data:
            if isinstance(item, dict) and isinstance(item.get("api_list"), list):
                n += len(item["api_list"])
        return n

    return 0


In [None]:
# Cell 3: Scan all categories & files
if not TOOLBENCH_ROOT.exists():
    raise SystemExit(f"Root not found: {TOOLBENCH_ROOT}")

rows = []  # per-file rows for optional debugging/export

categories = sorted([p for p in TOOLBENCH_ROOT.iterdir() if p.is_dir()])
print(f"Found {len(categories)} categories under {TOOLBENCH_ROOT}")

for cat_dir in categories:
    json_files = sorted(cat_dir.glob("*.json"))
    for jf in json_files:
        n = count_endpoints_in_json(jf)
        rows.append({
            "category": cat_dir.name,
            "file": jf.name,
            "endpoints": n if n is not None else None
        })

df_files = pd.DataFrame(rows)


Found 50 categories under /Users/ishwaryapns/Documents/Thesis/LAMAS/Toolbench/Tootbench_dataset/data/toolenv/tools


In [None]:
# Cell 4: Report totals per category
if df_files.empty:
    raise SystemExit("No JSON files found under the given root.")

# Convert None to NaN so sums can skip them cleanly
df_files["endpoints"] = pd.to_numeric(df_files["endpoints"], errors="coerce")

summary = (
    df_files.groupby("category", dropna=False)
            .agg(total_endpoints=("endpoints", "sum"),
                 num_files=("file", "count"))
            .sort_values("total_endpoints", ascending=False)
            .reset_index()
)

overall_total = int(summary["total_endpoints"].sum()) if not summary.empty else 0

print("\n=== API endpoint totals by category ===")
for _, r in summary.iterrows():
    total_e = int(r["total_endpoints"]) if pd.notna(r["total_endpoints"]) else 0
    print(f"{r['category']}: {total_e} endpoints (from {r['num_files']} files)")

print(f"\nOverall endpoints across all categories: {overall_total}")



=== API endpoint totals by category ===
Sports: 4455 endpoints (from 361 files)
Finance: 3834 endpoints (from 559 files)
Data: 3583 endpoints (from 831 files)
Business_Software: 3550 endpoints (from 350 files)
Social: 2392 endpoints (from 348 files)
Business: 2148 endpoints (from 695 files)
eCommerce: 2037 endpoints (from 441 files)
Tools: 1916 endpoints (from 704 files)
Other: 1806 endpoints (from 363 files)
Entertainment: 1421 endpoints (from 411 files)
Gaming: 1253 endpoints (from 248 files)
Communication: 1173 endpoints (from 252 files)
Email: 1140 endpoints (from 161 files)
Financial: 1136 endpoints (from 215 files)
Text_Analysis: 1129 endpoints (from 466 files)
Music: 1030 endpoints (from 152 files)
Artificial_Intelligence_Machine_Learning: 990 endpoints (from 390 files)
Video_Images: 986 endpoints (from 280 files)
Education: 925 endpoints (from 294 files)
Media: 916 endpoints (from 194 files)
Database: 900 endpoints (from 277 files)
Commerce: 811 endpoints (from 214 files)
Stor

In [None]:
# Cell 5: Save CSV outputs
summary.to_csv(OUT_PER_CATEGORY, index=False)
df_files.sort_values(["category","endpoints"], ascending=[True, False]).to_csv(OUT_PER_FILE, index=False)

print(f"\nSaved per-category totals -> {OUT_PER_CATEGORY}")
print(f"Saved per-file counts (optional/debug) -> {OUT_PER_FILE}")

# Show the top rows inline
summary.head(20)



Saved per-category totals -> ../results/toolbench_api_counts.by_category.csv
Saved per-file counts (optional/debug) -> ../results/toolbench_api_counts.by_file.csv


Unnamed: 0,category,total_endpoints,num_files
0,Sports,4455,361
1,Finance,3834,559
2,Data,3583,831
3,Business_Software,3550,350
4,Social,2392,348
5,Business,2148,695
6,eCommerce,2037,441
7,Tools,1916,704
8,Other,1806,363
9,Entertainment,1421,411
