Update benchmarks
- Update benchmarks generally
- Benchmark against Pydantic V2 instead of V1 (see the sketch below)
- Add a few additional JSON and MessagePack libraries
- Document the versions of libraries used for each run
- Bump the Python version used from 3.9 to 3.11. This made several of
  the pure-Python libraries compared here measurably faster. Yay for the
  Faster CPython initiative.
jcrist committed Dec 13, 2023
1 parent dceeec3 commit f71d96f
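For context, a minimal sketch of the kind of decode-plus-validate comparison the Pydantic V2 bullet refers to. The model, payload, and timing loop are illustrative only (the real Pydantic benchmark files are elsewhere in this commit); it assumes Pydantic V2's TypeAdapter API and msgspec's json.Decoder:

import timeit

import msgspec
from pydantic import BaseModel, TypeAdapter


class PointModel(BaseModel):
    # Hypothetical model, for illustration only
    x: int
    y: int


class PointStruct(msgspec.Struct):
    x: int
    y: int


payload = b'{"x": 1, "y": 2}'
adapter = TypeAdapter(PointModel)
decoder = msgspec.json.Decoder(PointStruct)

for label, func in [
    ("pydantic v2", lambda: adapter.validate_json(payload)),
    ("msgspec", lambda: decoder.decode(payload)),
]:
    # Same timeit.Timer().autorange() strategy as the benchmark code below
    n, t = timeit.Timer(func).autorange()
    print(f"{label}: decode + validate {t / n * 1e6:.2f} us")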
Showing 19 changed files with 872 additions and 510 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -41,15 +41,15 @@ support for [JSON](https://json.org), [MessagePack](https://msgpack.org),

- 🔍 **Zero-cost schema validation** using familiar Python type annotations. In
[benchmarks](https://jcristharif.com/msgspec/benchmarks.html) `msgspec`
decodes *and* validates JSON ~2x faster than
decodes *and* validates JSON faster than
[orjson](https://github.com/ijl/orjson) can decode it alone.

- **A speedy Struct type** for representing structured data. If you already
use [dataclasses](https://docs.python.org/3/library/dataclasses.html) or
[attrs](https://www.attrs.org),
[structs](https://jcristharif.com/msgspec/structs.html) should feel familiar.
However, they're
[10-100x faster](https://jcristharif.com/msgspec/benchmarks.html#benchmark-structs)
[5-60x faster](https://jcristharif.com/msgspec/benchmarks.html#benchmark-structs)
for common operations.

All of this is included in a
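For context on the README claims above, a minimal sketch of what "decodes *and* validates" means in practice. The User type and payloads are illustrative, assuming only msgspec's documented Struct and json.Decoder APIs:

import msgspec


class User(msgspec.Struct):
    # Illustrative type, not from the repo
    name: str
    email: str | None = None


decoder = msgspec.json.Decoder(User)

# Decoding validates against the annotations in the same pass
print(decoder.decode(b'{"name": "alice"}'))  # User(name='alice', email=None)

# A mistyped field fails with an error that points at the offending path
try:
    decoder.decode(b'{"name": 123}')
except msgspec.ValidationError as exc:
    print(exc)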
Empty file added benchmarks/__init__.py
Empty file.
232 changes: 140 additions & 92 deletions benchmarks/bench_encodings.py
@@ -1,149 +1,197 @@
from __future__ import annotations

import sys
import dataclasses
import json
import timeit
from typing import List, Union
import importlib.metadata
from typing import Any, Literal, Callable

import msgpack
import orjson
import ujson
from generate_data import make_filesystem_data
from .generate_data import make_filesystem_data

import msgspec


class File(msgspec.Struct, tag="file"):
class File(msgspec.Struct, kw_only=True, omit_defaults=True, tag="file"):
name: str
created_by: str
created_at: str
updated_at: str
updated_by: str | None = None
updated_at: str | None = None
nbytes: int
permissions: Literal["READ", "WRITE", "READ_WRITE"]


class Directory(msgspec.Struct, tag="directory"):
class Directory(msgspec.Struct, kw_only=True, omit_defaults=True, tag="directory"):
name: str
created_by: str
created_at: str
updated_at: str
contents: List[Union[File, Directory]]
updated_by: str | None = None
updated_at: str | None = None
contents: list[File | Directory]
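For context, the tag="file" / tag="directory" options make File and Directory a tagged union. A minimal sketch of decoding one, reusing the definitions just above; the field values are illustrative and msgspec's default "type" tag field is assumed:

msg = msgspec.json.decode(
    b'{"type": "directory", "name": "docs", "created_by": "alice",'
    b' "created_at": "2023-01-01T00:00:00Z", "contents": []}',
    type=File | Directory,
)
print(type(msg).__name__)  # Directory, selected by the "type" tag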


def bench(dumps, loads, ndata, schema=None):
data = make_filesystem_data(ndata)
if schema:
data = msgspec.convert(data, schema)
timer = timeit.Timer("func(data)", globals={"func": dumps, "data": data})
n, t = timer.autorange()
dumps_time = t / n
@dataclasses.dataclass
class Benchmark:
label: str
version: str
encode: Callable
decode: Callable
schema: Any = None

data = dumps(data)
def run(self, data: Any) -> dict:
if self.schema is not None:
data = msgspec.convert(data, self.schema)
timer = timeit.Timer("func(data)", globals={"func": self.encode, "data": data})
n, t = timer.autorange()
encode_time = t / n

timer = timeit.Timer("func(data)", globals={"func": loads, "data": data})
n, t = timer.autorange()
loads_time = t / n
return dumps_time, loads_time
data = self.encode(data)

timer = timeit.Timer("func(data)", globals={"func": self.decode, "data": data})
n, t = timer.autorange()
decode_time = t / n

def bench_msgspec_msgpack(n):
schema = File if n == 1 else Directory
enc = msgspec.msgpack.Encoder()
dec = msgspec.msgpack.Decoder(schema)
return bench(enc.encode, dec.decode, n, schema)


def bench_msgspec_json(n):
schema = File if n == 1 else Directory
enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(schema)
return bench(enc.encode, dec.decode, n, schema)


def bench_msgpack(n):
packer = msgpack.Packer()
loads = msgpack.loads
return bench(packer.pack, loads, n)
return {
"label": self.label,
"encode": encode_time,
"decode": decode_time,
}
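A usage sketch for the Benchmark helper above, assuming it runs inside the benchmarks package so make_filesystem_data and the Directory schema resolve; the timings in the trailing comment are placeholders:

data = make_filesystem_data(1000)
enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(Directory)
bench = Benchmark(
    "msgspec structs", msgspec.__version__, enc.encode, dec.decode, Directory
)
print(bench.run(data))  # {'label': 'msgspec structs', 'encode': ..., 'decode': ...}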


def bench_ujson(n):
return bench(ujson.dumps, ujson.loads, n)
def json_benchmarks():
import orjson
import ujson
import rapidjson
import simdjson

simdjson_ver = importlib.metadata.version("pysimdjson")

def bench_orjson(n):
return bench(orjson.dumps, orjson.loads, n)
rj_dumps = rapidjson.Encoder()
rj_loads = rapidjson.Decoder()

def uj_dumps(obj):
return ujson.dumps(obj)

BENCHMARKS = [
("ujson", bench_ujson),
("orjson", bench_orjson),
("msgpack", bench_msgpack),
("msgspec msgpack", bench_msgspec_msgpack),
("msgspec json", bench_msgspec_json),
]

enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(Directory)
dec2 = msgspec.json.Decoder()

def run(n, quiet=False):
if quiet:
return [
Benchmark("msgspec structs", None, enc.encode, dec.decode, Directory),
Benchmark("msgspec", msgspec.__version__, enc.encode, dec2.decode),
Benchmark("json", None, json.dumps, json.loads),
Benchmark("orjson", orjson.__version__, orjson.dumps, orjson.loads),
Benchmark("ujson", ujson.__version__, uj_dumps, ujson.loads),
Benchmark("rapidjson", rapidjson.__version__, rj_dumps, rj_loads),
Benchmark("simdjson", simdjson_ver, simdjson.dumps, simdjson.loads),
]

def log(x):
pass

else:
log = print
def msgpack_benchmarks():
import msgpack
import ormsgpack

title = f"Benchmark - {n} object{'s' if n > 1 else ''}"
log(title)
enc = msgspec.msgpack.Encoder()
dec = msgspec.msgpack.Decoder(Directory)
dec2 = msgspec.msgpack.Decoder()

results = []
for name, func in BENCHMARKS:
log(name)
dumps_time, loads_time = func(n)
log(f" dumps: {dumps_time * 1e6:.2f} us")
log(f" loads: {loads_time * 1e6:.2f} us")
log(f" total: {(dumps_time + loads_time) * 1e6:.2f} us")
results.append((name, dumps_time, loads_time))
return results
return [
Benchmark("msgspec structs", None, enc.encode, dec.decode, Directory),
Benchmark("msgspec", msgspec.__version__, enc.encode, dec2.decode),
Benchmark("msgpack", msgpack.__version__, msgpack.dumps, msgpack.loads),
Benchmark(
"ormsgpack", ormsgpack.__version__, ormsgpack.packb, ormsgpack.unpackb
),
]


def main():
import argparse

bench_names = ["1", "1k"]

parser = argparse.ArgumentParser(
description="Benchmark different python serializers"
description="Benchmark different python serialization libraries"
)
parser.add_argument(
"--benchmark",
"-b",
action="append",
choices=["all", *bench_names],
default=[],
help="which benchmark(s) to run, defaults to 'all'",
"--versions",
action="store_true",
help="Output library version info, and exit immediately",
)
parser.add_argument(
"--json",
action="store_true",
help="whether to output the results as json",
"-n",
type=int,
help="The number of objects in the generated data, defaults to 1000",
default=1000,
)
parser.add_argument(
"--no-gc",
"-p",
"--protocol",
choices=["json", "msgpack"],
default="json",
help="The protocol to benchmark, defaults to JSON",
)
parser.add_argument(
"--json",
action="store_true",
help="whether to disable the gc during benchmarking",
help="whether to output the results as json",
)
args = parser.parse_args()

if "all" in args.benchmark or not args.benchmark:
to_run = bench_names
else:
to_run = sorted(set(args.benchmark))
benchmarks = json_benchmarks() if args.protocol == "json" else msgpack_benchmarks()

if args.versions:
for bench in benchmarks:
if bench.version is not None:
print(f"- {bench.label}: {bench.version}")
sys.exit(0)

results = {}
for bench in to_run:
n = 1000 if bench.startswith("1k") else 1
results[bench] = run(n, quiet=args.json)
data = make_filesystem_data(args.n)

results = [benchmark.run(data) for benchmark in benchmarks]

if args.json:
print(json.dumps(results))
for line in results:
print(json.dumps(line))
else:
# Compose the results table
results.sort(key=lambda row: row["encode"] + row["decode"])
best_et = results[0]["encode"]
best_dt = results[0]["decode"]
best_tt = best_et + best_dt

columns = (
"",
"encode (μs)",
"vs.",
"decode (μs)",
"vs.",
"total (μs)",
"vs.",
)
rows = [
(
r["label"],
f"{1_000_000 * r['encode']:.1f}",
f"{r['encode'] / best_et:.1f}",
f"{1_000_000 * r['decode']:.1f}",
f"{r['decode'] / best_dt:.1f}",
f"{1_000_000 * (r['encode'] + r['decode']):.1f}",
f"{(r['encode'] + r['decode']) / best_tt:.1f}",
)
for r in results
]
widths = tuple(
max(max(map(len, x)), len(c)) for x, c in zip(zip(*rows), columns)
)
row_template = ("|" + (" %%-%ds |" * len(columns))) % widths
header = row_template % tuple(columns)
bar_underline = "+%s+" % "+".join("=" * (w + 2) for w in widths)
bar = "+%s+" % "+".join("-" * (w + 2) for w in widths)
parts = [bar, header, bar_underline]
for r in rows:
parts.append(row_template % r)
parts.append(bar)
print("\n".join(parts))


if __name__ == "__main__":
41 changes: 34 additions & 7 deletions benchmarks/bench_memory.py → benchmarks/bench_large_json.py
@@ -42,6 +42,11 @@
decode = orjson.loads
"""

RAPIDJSON = """
import rapidjson
decode = rapidjson.loads
"""

SIMDJSON = """
import simdjson
decode = simdjson.loads
@@ -81,15 +86,37 @@ class RepoData(msgspec.Struct, gc=False):


def main():
import argparse

parser = argparse.ArgumentParser(
description="Benchmark decoding a large JSON message using various JSON libraries"
)
parser.add_argument(
"--versions",
action="store_true",
help="Output library version info, and exit immediately",
)
args = parser.parse_args()

benchmarks = [
("json", JSON),
("ujson", UJSON),
("orjson", ORJSON),
("simdjson", SIMDJSON),
("msgspec", MSGSPEC),
("msgspec structs", MSGSPEC_STRUCTS),
("json", None, JSON),
("ujson", "ujson", UJSON),
("orjson", "orjson", ORJSON),
("rapidjson", "python-rapidjson", RAPIDJSON),
("simdjson", "pysimdjson", SIMDJSON),
("msgspec", "msgspec", MSGSPEC),
("msgspec structs", None, MSGSPEC_STRUCTS),
]

if args.versions:
import importlib.metadata

for _, lib, _ in benchmarks:
if lib is not None:
version = importlib.metadata.version(lib)
print(f"- {lib}: {version}")
sys.exit(0)

with tempfile.NamedTemporaryFile() as f:
# Download the repodata.json
resp = requests.get(
@@ -102,7 +129,7 @@ def main():
results = {}
import ast

for lib, setup in benchmarks:
for lib, _, setup in benchmarks:
script = TEMPLATE.format(path=f.name, setup=setup)
# We execute each script in a subprocess to isolate their memory usage
output = subprocess.check_output([sys.executable, "-c", script])
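For context, a minimal sketch of the subprocess-isolation pattern used in the loop above. The measurement inside the child script is an assumption (ru_maxrss via the Unix-only resource module), since the full TEMPLATE is not shown in this hunk:

import ast
import subprocess
import sys

CHILD = """
import json, resource
with open({path!r}, "rb") as f:
    data = f.read()
obj = json.loads(data)
print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
"""


def measure_peak_memory(path: str) -> int:
    # Run the decode in a fresh interpreter so each library's memory use
    # is measured in isolation, then parse the number the child prints
    output = subprocess.check_output([sys.executable, "-c", CHILD.format(path=path)])
    return ast.literal_eval(output.decode())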