# jsonld-ex Reproducible Benchmark Suite

**Purpose:** Run the complete jsonld-ex benchmark suite on this system and save
timestamped, system-fingerprinted results for cross-platform comparison.

**For paper:** *jsonld-ex: Backward-Compatible JSON-LD Extensions for AI/ML Data Exchange*

**Methodology:**
- 30 trials per measurement, 3 warmup iterations
- 95% confidence intervals via t-distribution
- Fixed random seed (42) for deterministic data generation
- System fingerprint captured for hardware-aware comparison

**What should be invariant across systems:**
- Byte ratios (PROV-O verbosity, SHACL verbosity, IoT payload savings)
- Calibration metrics (ECE, Brier score) — deterministic with fixed seed
- Information richness results — algebraic identity
- Trust ≡ scalar equivalence — numerical identity within 1e-12
- Scaling *shape* (linear, sublinear) for fusion, trust chains, merge

**What will vary (expected):**
- Absolute throughput (ops/sec, μs/op) — hardware-dependent
- Speedup ratios vs baselines — may shift ±20% across CPUs

---

## 1. System Fingerprint

Captures comprehensive hardware/software info for reproducibility.

In [None]:
import json
import os
import platform
import subprocess
import sys
from datetime import datetime, timezone


def _run_command(cmd, timeout=5):
    """Return the stripped stdout of *cmd*, or "" if it cannot be run or fails.

    A missing executable, a timeout, undecodable output, and a non-zero exit
    status are all treated as "no information available", not as errors.
    """
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        return ""
    if result.returncode != 0:
        return ""
    return result.stdout.strip()


def get_system_fingerprint() -> dict:
    """Collect a best-effort description of the host running the benchmarks.

    Every probe is optional: a probe that fails simply leaves its key out,
    so the function itself does not raise on unsupported platforms.

    Returns:
        A dict with the keys ``timestamp`` (UTC, ISO 8601), ``platform``,
        ``cpu`` (``logical_cores``, ``model`` when detectable), ``memory``
        (``total_gb`` when detectable), ``gpu`` (always has ``available``;
        ``count``, ``name``, ``memory_mb`` and ``driver`` describe the first
        GPU reported by ``nvidia-smi``) and ``environment`` (``type`` is one
        of ``google_colab``, ``github_codespaces``, ``docker``, ``local``).
    """
    system = platform.system()
    info = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "platform": {
            "system": system,
            "release": platform.release(),
            "version": platform.version(),
            "machine": platform.machine(),
            "processor": platform.processor(),
            "python_version": sys.version,
            "python_implementation": platform.python_implementation(),
        },
        "cpu": {},
        "memory": {},
        "gpu": {},
        "environment": {},
    }

    # CPU details; os.cpu_count() returns None when the count is unknown.
    logical_cores = os.cpu_count()
    if logical_cores is not None:
        info["cpu"]["logical_cores"] = logical_cores

    # Linux-specific CPU and memory info
    if system == "Linux":
        try:
            with open("/proc/cpuinfo") as f:
                for line in f:
                    if "model name" in line and ":" in line:
                        info["cpu"]["model"] = line.split(":", 1)[1].strip()
                        break
        except OSError:
            pass
        try:
            with open("/proc/meminfo") as f:
                for line in f:
                    if "MemTotal" in line:
                        kb = int(line.split()[1])
                        info["memory"]["total_gb"] = round(kb / 1024 / 1024, 1)
                        break
        except (OSError, ValueError, IndexError):
            pass

    # Windows-specific CPU and memory info
    elif system == "Windows":
        cpu_out = _run_command(["wmic", "cpu", "get", "Name", "/value"])
        for line in cpu_out.splitlines():
            if "Name=" in line:
                info["cpu"]["model"] = line.split("=", 1)[1].strip()
                break
        # wmic may be absent or print nothing; fall back to what Python knows.
        if not info["cpu"].get("model"):
            info["cpu"]["model"] = platform.processor()
        mem_out = _run_command(
            ["wmic", "ComputerSystem", "get", "TotalPhysicalMemory", "/value"]
        )
        for line in mem_out.splitlines():
            if "TotalPhysicalMemory=" in line:
                mem_bytes = line.split("=", 1)[1].strip()
                if mem_bytes.isdigit():
                    info["memory"]["total_gb"] = round(
                        int(mem_bytes) / 1024**3, 1
                    )
                break

    # macOS-specific CPU and memory info
    elif system == "Darwin":
        model = _run_command(["sysctl", "-n", "machdep.cpu.brand_string"])
        if model:
            info["cpu"]["model"] = model
        memsize = _run_command(["sysctl", "-n", "hw.memsize"])
        if memsize.isdigit():
            info["memory"]["total_gb"] = round(int(memsize) / 1024**3, 1)

    # GPU detection: nvidia-smi prints one CSV line per GPU, so parse line by
    # line instead of splitting the whole output at once.
    gpu_out = _run_command(
        ["nvidia-smi", "--query-gpu=name,memory.total,driver_version",
         "--format=csv,noheader,nounits"],
        timeout=10,
    )
    gpu_rows = [
        [field.strip() for field in line.split(",")]
        for line in gpu_out.splitlines()
        if line.strip()
    ]
    info["gpu"]["available"] = bool(gpu_rows)
    if gpu_rows:
        first = gpu_rows[0]
        info["gpu"]["count"] = len(gpu_rows)
        info["gpu"]["name"] = first[0] or "unknown"
        # The memory field can be a non-numeric placeholder; record 0 then.
        has_memory = len(first) > 1 and first[1].isdigit()
        info["gpu"]["memory_mb"] = int(first[1]) if has_memory else 0
        info["gpu"]["driver"] = first[2] if len(first) > 2 else "unknown"

    # Detect Colab / cloud environment
    environment = info["environment"]
    if "COLAB_RELEASE_TAG" in os.environ:
        environment["type"] = "google_colab"
        environment["colab_tag"] = os.environ.get("COLAB_RELEASE_TAG", "")
    elif "CODESPACES" in os.environ:
        environment["type"] = "github_codespaces"
    else:
        # /.dockerenv is checked first because /proc/1/cgroup does not
        # mention "docker" on every host.
        in_docker = os.path.exists("/.dockerenv")
        if not in_docker:
            try:
                with open("/proc/1/cgroup") as f:
                    in_docker = "docker" in f.read()
            except OSError:
                pass
        if in_docker:
            environment["type"] = "docker"
    environment.setdefault("type", "local")

    return info


# Capture the fingerprint once; later cells read `fingerprint` when building
# the results metadata and the output filename.
fingerprint = get_system_fingerprint()
# Echo it so the notebook output records which system produced this run.
print(json.dumps(fingerprint, indent=2))

## 2. Install Dependencies

In [None]:
# Install from source (ensures benchmarks match the exact code version)
# PyPI may lag behind - source install guarantees consistency
!pip install -q "jsonld-ex[iot,bench] @ git+https://github.com/jemsbhai/jsonld-ex.git#subdirectory=packages/python" cbor2

In [None]:
# Verify installation: the import fails loudly if the pip install did not work.
import jsonld_ex
print(f"jsonld-ex version: {jsonld_ex.__version__}")

# Quick smoke test: build two opinions, fuse them, and print the fused belief.
# NOTE(review): the "✓" is printed unconditionally — this only shows that the
# import and the call complete without raising; no value is asserted.
from jsonld_ex.confidence_algebra import Opinion, cumulative_fuse
o1 = Opinion(belief=0.7, disbelief=0.1, uncertainty=0.2)
o2 = Opinion(belief=0.5, disbelief=0.3, uncertainty=0.2)
fused = cumulative_fuse(o1, o2)
print(f"Smoke test: fuse({o1.belief:.1f}, {o2.belief:.1f}) -> belief={fused.belief:.4f} ✓")

In [None]:
# Clone the repo for benchmark scripts
# (benchmark runner scripts are not part of the PyPI package)
!git clone --depth 1 https://github.com/jemsbhai/jsonld-ex.git /tmp/jsonld-ex-repo 2>/dev/null || \
    (cd /tmp/jsonld-ex-repo && git pull --ff-only)

import os
# Work from the benchmarks directory of the clone; raises if the clone is missing.
os.chdir("/tmp/jsonld-ex-repo/benchmarks")
print(f"Working directory: {os.getcwd()}")
# List the bench_* entries as a quick check that the clone is populated.
print(f"Benchmark files: {sorted(f for f in os.listdir('.') if f.startswith('bench_'))}")

## 3. Run Full Benchmark Suite

This takes **5-15 minutes** depending on hardware. It runs all 6 domains plus the
baselines, with 30 trials per measurement and 3 warmup iterations.

In [None]:
import sys
import os

# Put the benchmark scripts and the package source at the front of sys.path.
# Insertion order matters: src_dir is inserted last, so it ends up first.
bench_dir = "/tmp/jsonld-ex-repo/benchmarks"
src_dir = "/tmp/jsonld-ex-repo/packages/python/src"
for extra_path in (bench_dir, src_dir):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

os.chdir(bench_dir)

# Each domain runs in its own cell so progress is visible as it goes.
import time
import json
from datetime import datetime, timezone

# Shared state for the following cells: the wall-clock origin and the
# accumulator that collects every domain's results.
overall_start = time.perf_counter()
all_results = {}

print("Running Domain 1: OWL/RDF Ecosystem Interoperability...")
import bench_owl_rdf
d1 = bench_owl_rdf.run_all()
# Copy the reported metrics off the result object, keyed by attribute name.
all_results["domain_1_owl_rdf"] = {
    metric: getattr(d1, metric)
    for metric in (
        "prov_o_verbosity",
        "shacl_verbosity",
        "round_trip_fidelity",
        "conversion_throughput",
    )
}
print(f"  ✓ Domain 1 complete ({time.perf_counter() - overall_start:.1f}s elapsed)\n")

In [None]:
print("Running Domain 2: Multi-Agent KG Construction...")
import bench_multi_agent
d2 = bench_multi_agent.run_all()
# Copy the reported metrics off the result object, keyed by attribute name.
all_results["domain_2_multi_agent"] = {
    metric: getattr(d2, metric)
    for metric in (
        "merge_throughput",
        "merge_by_conflict_rate",
        "propagation_overhead",
        "combination_comparison",
        "diff_throughput",
    )
}
print(f"  ✓ Domain 2 complete ({time.perf_counter() - overall_start:.1f}s elapsed)\n")

In [None]:
print("Running Domain 3: Healthcare IoT Pipeline...")
import bench_iot
d3 = bench_iot.run_all()
# Copy the reported metrics off the result object, keyed by attribute name.
all_results["domain_3_iot"] = {
    metric: getattr(d3, metric)
    for metric in (
        "payload_sizes",
        "pipeline_throughput",
        "mqtt_overhead",
        "batch_scaling",
    )
}
print(f"  ✓ Domain 3 complete ({time.perf_counter() - overall_start:.1f}s elapsed)\n")

In [None]:
print("Running Domain 4: RAG Pipeline & Temporal Queries...")
import bench_rag
d4 = bench_rag.run_all()
# Keyword form because one stored key ("temporal_diff") differs from the
# attribute it is read from (temporal_diff_bench).
all_results["domain_4_rag"] = dict(
    confidence_filter=d4.confidence_filter,
    temporal_query=d4.temporal_query,
    temporal_diff=d4.temporal_diff_bench,
    rag_pipeline=d4.rag_pipeline,
)
print(f"  ✓ Domain 4 complete ({time.perf_counter() - overall_start:.1f}s elapsed)\n")

In [None]:
print("Running Baselines: rdflib, pyshacl comparisons...")
import bench_baselines
db = bench_baselines.run_all()
# Copy the reported metrics off the result object, keyed by attribute name.
all_results["baselines"] = {
    metric: getattr(db, metric)
    for metric in (
        "prov_o_construction",
        "shacl_validation",
        "graph_merge",
        "temporal_query",
    )
}
print(f"  ✓ Baselines complete ({time.perf_counter() - overall_start:.1f}s elapsed)\n")

In [None]:
print("Running Domain 5: Confidence Algebra (Subjective Logic)...")
import bench_algebra
d5 = bench_algebra.run_all()
# Copy the reported metrics off the result object, keyed by attribute name.
all_results["domain_5_confidence_algebra"] = {
    metric: getattr(d5, metric)
    for metric in (
        "cumulative_fusion",
        "averaging_fusion",
        "trust_discount_chain",
        "trust_vs_scalar",
        "deduction",
        "temporal_decay",
        "opinion_formation",
        "information_richness",
        "calibration",
    )
}
print(f"  ✓ Domain 5 complete ({time.perf_counter() - overall_start:.1f}s elapsed)\n")

In [None]:
print("Running Domain 6: Neuro-Symbolic Bridge Pipeline...")
import bench_bridge
d6 = bench_bridge.run_all()
# Copy the reported metrics off the result object, keyed by attribute name.
all_results["domain_6_neuro_symbolic_bridge"] = {
    metric: getattr(d6, metric)
    for metric in ("pipeline_comparison", "metadata_richness")
}

# Wall-clock time since the Domain 1 cell started; reused in the saved metadata.
total_sec = time.perf_counter() - overall_start
print("  ✓ Domain 6 complete")
print("\n" + "=" * 60)
print(f"ALL BENCHMARKS COMPLETE in {total_sec:.1f}s")
print("=" * 60)

## 4. Assemble & Save Results

Results are saved with:
- **Timestamp** — when this run occurred
- **System fingerprint** — exact hardware/software description
- **Package version** — jsonld-ex version under test
- **Git commit** — exact source code version

In [None]:
import subprocess

# Get git commit hash for exact reproducibility. A non-zero exit or empty
# output (e.g. the directory is not a git checkout) is recorded as "unknown"
# rather than as an empty string.
git_hash = "unknown"
try:
    git_proc = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        capture_output=True, text=True, timeout=5,
        cwd="/tmp/jsonld-ex-repo",
    )
except (OSError, subprocess.SubprocessError):
    pass
else:
    if git_proc.returncode == 0 and git_proc.stdout.strip():
        git_hash = git_proc.stdout.strip()

try:
    import jsonld_ex
    pkg_version = jsonld_ex.__version__
except (ImportError, AttributeError):
    pkg_version = "unknown"

# Build system label for filenames; fall back when the CPU model is missing
# or empty so the label never ends in a bare underscore.
cpu_short = fingerprint.get("cpu", {}).get("model") or "unknown-cpu"
# Sanitize for filename: keep only alphanumeric, dash, underscore
cpu_label = "".join(c if c.isalnum() or c in "-_" else "-" for c in cpu_short)[:40]
env_type = fingerprint.get("environment", {}).get("type", "local")

# Take the clock reading once so the filename and the metadata timestamp
# always describe the same instant.
run_time = datetime.now(timezone.utc)
ts = run_time.strftime("%Y-%m-%dT%H-%M-%SZ")
system_label = f"{env_type}_{cpu_label}"

# Assemble final results: run metadata, the system fingerprint, then one
# top-level key per benchmark domain.
final_results = {
    "metadata": {
        "timestamp": run_time.isoformat(),
        "total_seconds": round(total_sec, 2),
        "jsonld_ex_version": pkg_version,
        "git_commit": git_hash,
        "python_version": sys.version.split()[0],
        "system_label": system_label,
    },
    "system_fingerprint": fingerprint,
    **all_results,
}

# Save with timestamped + system-labeled filename
out_dir = "/tmp/jsonld-ex-repo/benchmarks/results"
os.makedirs(out_dir, exist_ok=True)

filename = f"benchmark_results_{ts}_{system_label}.json"
filepath = os.path.join(out_dir, filename)

with open(filepath, "w", encoding="utf-8") as f:
    json.dump(final_results, f, indent=2)

print(f"Results saved to: {filepath}")
print(f"File size: {os.path.getsize(filepath) / 1024:.1f} KB")
print(f"System label: {system_label}")
print(f"Git commit: {git_hash[:12]}")

## 5. Invariant Verification

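These values **must** agree across all systems (deterministic, fixed seed): boolean
and count checks must match exactly, and the calibration metrics (ECE, Brier) must
match to within 1e-4. Any deviation indicates a bug or version mismatch.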

In [None]:
print("=" * 60)
print("CROSS-SYSTEM INVARIANT CHECKS")
print("=" * 60)
print()

# Running tallies; check() updates them through its `global` declaration.
checks_passed = 0
checks_failed = 0

def check(name, actual, expected, tolerance=0.0):
    """Compare one value against its expected value and record the outcome.

    Prints a PASS/FAIL line and increments the module-level
    ``checks_passed`` or ``checks_failed`` counter.

    Args:
        name: Label printed for this check.
        actual: Value produced by the benchmark run.
        expected: Reference value.
        tolerance: When positive and ``expected`` is not a bool, the check
            passes if ``abs(actual - expected) <= tolerance``; otherwise the
            two values are compared with ``==``.
    """
    global checks_passed, checks_failed
    if tolerance > 0 and not isinstance(expected, bool):
        try:
            ok = abs(actual - expected) <= tolerance
        except TypeError:
            # A missing or non-numeric value (e.g. None) is a failed check,
            # not a reason to abort the remaining checks.
            ok = False
    else:
        # Plain equality on purpose: an identity test would reject
        # bool-like values that merely compare equal to True/False.
        ok = actual == expected
    status = "✓ PASS" if ok else "✗ FAIL"
    if ok:
        checks_passed += 1
    else:
        checks_failed += 1
    print(f"  {status}: {name} = {actual} (expected {expected})")

# 1. Trust ≡ scalar equivalence (must all be True)
print("\n--- Trust ≡ Scalar Equivalence ---")
for k, v in d5.trust_vs_scalar.items():
    check(f"trust_vs_scalar[{k}].equivalent", v["numerically_equivalent"], True)

# 2. Information richness (all pairs must have same scalar)
print("\n--- Information Richness (same scalar) ---")
for k, v in d5.information_richness.items():
    check(f"info_richness[{k}].same_scalar", v["same_scalar"], True)

# 3. Calibration ECE and Brier (deterministic with seed 42)
print("\n--- Calibration Metrics (deterministic) ---")
check("ECE", d5.calibration["expected_calibration_error"], 0.0340, tolerance=0.0001)
check("Brier", d5.calibration["brier_score"], 0.1880, tolerance=0.0001)

# 4. Byte ratios (deterministic — same data, same serialization)
# No reference values are recorded here, so comparing a ratio with itself
# would always pass. Instead verify that each ratio is a finite, positive
# number; equality across systems has to be judged from the saved files.
print("\n--- Byte Ratios (deterministic) ---")
for k, v in d1.prov_o_verbosity.items():
    ratio = v["byte_ratio"]
    try:
        # NaN fails both comparisons; infinity fails the upper bound.
        ratio_is_sane = 0 < float(ratio) < float("inf")
    except (TypeError, ValueError):
        ratio_is_sane = False
    check(f"prov_o_byte_ratio[{k}]={ratio} is finite and positive", ratio_is_sane, True)

# 5. Round-trip fidelity
print("\n--- Round-trip Fidelity ---")
check("round_trip_fidelity", d1.round_trip_fidelity["fidelity"], 1.0)

# 6. Validation pass rate in bridge pipeline
print("\n--- Bridge Pipeline Validation ---")
if "n=100" in d6.pipeline_comparison:
    bridge_100 = d6.pipeline_comparison["n=100"]
    check("bridge_invalid_nodes", bridge_100["jsonld_ex"]["metadata"]["invalid_nodes"], 0)
else:
    # Make the omission visible instead of silently checking nothing.
    print("  - SKIP: bridge_invalid_nodes (no 'n=100' entry in pipeline_comparison)")

# 7. Metadata dimensions
print("\n--- Metadata Richness ---")
check("jsonld_ex_dimensions", d6.metadata_richness["jsonld_ex_preserves"]["metadata_dimensions"], 10)
check("adhoc_dimensions", d6.metadata_richness["adhoc_preserves"]["metadata_dimensions"], 2)

print(f"\n{'='*60}")
print(f"INVARIANT CHECKS: {checks_passed} passed, {checks_failed} failed")
if checks_failed > 0:
    print("⚠ FAILURES detected — investigate version mismatch or bug")
else:
    print("✓ All invariants hold — results are cross-system consistent")
print(f"{'='*60}")

## 6. Key Results Summary

In [None]:
# Banner naming the system these numbers came from.
print("=" * 60, f"KEY RESULTS — {system_label}", "=" * 60, sep="\n")

# Algebra: headline fusion timings plus the two calibration metrics.
print("\n--- Confidence Algebra ---")
cf2 = d5.cumulative_fusion["n=2"]
print(
    f"  Binary cumulative fusion: {cf2['mean_us']:.2f} μs "
    f"({cf2['ops_per_sec']:,.0f} ops/sec)"
)
af100 = d5.averaging_fusion["n=100"]
print(
    f"  100-way averaging fusion: {af100['mean_us']:.2f} μs "
    f"({af100['ops_per_sec']:,.0f} ops/sec)"
)
print(f"  Calibration ECE: {d5.calibration['expected_calibration_error']:.4f}")
print(f"  Calibration Brier: {d5.calibration['brier_score']:.4f}")

# Baselines: one speedup line per measured configuration.
print("\n--- Baseline Speedups ---")
for label, stats in db.prov_o_construction.items():
    print(f"  PROV-O [{label}]: {stats['speedup']}x faster than rdflib")
for label, stats in db.graph_merge.items():
    print(f"  Merge [{label}]: {stats['speedup']}x faster than rdflib+SPARQL")

# Bridge: latency, throughput and overhead for each pipeline size.
print("\n--- Neuro-Symbolic Bridge ---")
for label, stats in d6.pipeline_comparison.items():
    jl = stats['jsonld_ex']
    latency_ms = jl['mean_sec'] * 1000
    print(
        f"  {label}: {latency_ms:.1f}ms "
        f"({jl['nodes_per_sec']:,.0f} nodes/sec, "
        f"{stats['overhead_factor']}x vs ad-hoc, "
        "10 vs 2 metadata dims)"
    )

# IoT: payload reduction for each entry in payload_sizes.
print("\n--- IoT Payload ---")
for label, stats in d3.payload_sizes.items():
    print(
        f"  {label}: {stats['savings_pct']}% reduction "
        f"(JSON {stats['json_bytes']:,}B → gzip+CBOR {stats['gzip_cbor_bytes']:,}B)"
    )

## 7. Download Results

Download the JSON results file for cross-system comparison.

**Collaborators:** Please share your results file so we can compile
the cross-system comparison table for the paper.

In [None]:
# For Google Colab: auto-download
# The import doubles as the environment probe: outside Colab it raises
# ImportError and the fallback below runs instead.
try:
    from google.colab import files
    # `filepath` and `filename` come from the cell that saved the results.
    files.download(filepath)
    print(f"Downloaded: {filename}")
except ImportError:
    # Not in Colab — just print the path
    print(f"Results saved at: {filepath}")
    print(f"Copy this file and share it for cross-system comparison.")

## 8. Cross-System Comparison (Optional)

If you have results from multiple systems, place them all in the
`benchmarks/results/` directory and run this cell to generate a
comparison table.

In [None]:
import glob


def _fmt_cell(value, width, spec=""):
    """Left-justify *value* in *width* columns.

    *spec* is applied only to real numbers, so placeholders such as "?" (or a
    missing value) are printed as text instead of raising a format error.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        text = "?" if value is None else str(value)
    else:
        text = format(value, spec)
    return f"{text:<{width}}"


# Only files written by this notebook (timestamped names) are considered.
result_files = sorted(glob.glob(os.path.join(out_dir, "benchmark_results_2*.json")))
print(f"Found {len(result_files)} result file(s):\n")
for rf in result_files:
    print(f"  {os.path.basename(rf)}")

if len(result_files) >= 2:
    comparison_rows = []
    for rf in result_files:
        # One unreadable or malformed file should not abort the comparison.
        try:
            with open(rf, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            print(f"  ! Skipping {os.path.basename(rf)}: {exc}")
            continue
        if not isinstance(data, dict):
            print(f"  ! Skipping {os.path.basename(rf)}: top level is not an object")
            continue

        fp = data.get("system_fingerprint", {})
        cpu = fp.get("cpu", {}).get("model", "unknown")
        env = fp.get("environment", {}).get("type", "unknown")
        cores = fp.get("cpu", {}).get("logical_cores", "?")
        ram = fp.get("memory", {}).get("total_gb", "?")

        # Extract key metrics
        d5_data = data.get("domain_5_confidence_algebra", {})
        cum_n2 = d5_data.get("cumulative_fusion", {}).get("n=2", {})
        cal = d5_data.get("calibration", {})

        # Average only the speedups that are actually present; a missing
        # value must not be counted as 0.
        baselines = data.get("baselines", {})
        prov_speedups = [
            entry["speedup"]
            for entry in baselines.get("prov_o_construction", {}).values()
            if isinstance(entry.get("speedup"), (int, float))
        ]

        bridge = data.get("domain_6_neuro_symbolic_bridge", {}).get("pipeline_comparison", {})
        bridge_1k = bridge.get("n=1000", {})
        jl_1k = bridge_1k.get("jsonld_ex", {})

        row = {
            "file": os.path.basename(rf),
            "env": env,
            "cpu": str(cpu)[:30],
            "cores": cores,
            "ram_gb": ram,
            "fusion_us": cum_n2.get("mean_us", "?"),
            "fusion_ops": cum_n2.get("ops_per_sec", "?"),
            "ECE": cal.get("expected_calibration_error", "?"),
            "brier": cal.get("brier_score", "?"),
            "prov_speedup_avg": round(sum(prov_speedups)/len(prov_speedups), 1) if prov_speedups else "?",
            # Report "?" rather than a misleading 0.0 when the timing is absent.
            "bridge_1k_ms": round(jl_1k["mean_sec"]*1000, 1) if "mean_sec" in jl_1k else "?",
            "bridge_1k_nps": jl_1k.get("nodes_per_sec", "?"),
        }
        comparison_rows.append(row)

    # Print comparison table: (header, row key, column width, number format).
    columns = [
        ("System", "cpu", 35, ""),
        ("Env", "env", 18, ""),
        ("Cores", "cores", 6, ""),
        ("RAM", "ram_gb", 6, ".1f"),
        ("Fusion(μs)", "fusion_us", 11, ".2f"),
        ("Ops/s", "fusion_ops", 10, ",.0f"),
        ("ECE", "ECE", 7, ".4f"),
        ("Brier", "brier", 7, ".4f"),
        ("PROV-O↑", "prov_speedup_avg", 8, ".1f"),
        ("Bridge(ms)", "bridge_1k_ms", 11, ".1f"),
        ("nodes/s", "bridge_1k_nps", 10, ",.0f"),
    ]
    header = " ".join(f"{title:<{width}}" for title, _, width, _ in columns)
    print()
    print(header)
    print("-" * len(header))
    for r in comparison_rows:
        print(" ".join(_fmt_cell(r[key], width, spec) for _, key, width, spec in columns))

    # Verify invariants match across systems (both calibration metrics).
    for label, key in (("ECE", "ECE"), ("Brier", "brier")):
        values = [
            r[key] for r in comparison_rows
            if isinstance(r[key], (int, float)) and not isinstance(r[key], bool)
        ]
        if len(values) >= 2:
            if all(abs(x - values[0]) < 0.001 for x in values):
                print(f"\n✓ {label} invariant holds across all {len(values)} systems")
            else:
                print(f"\n✗ {label} varies across systems — investigate version mismatch!")
else:
    print("\nNeed ≥2 result files for cross-system comparison.")
    print("Share this notebook with collaborators to collect more runs.")