diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 57fd195f48bb..94f5f1153219 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -385,6 +385,46 @@ def createTorchSlmSizeBench(variant_name: str, **kwargs): ), ] + # Add TorchLinearKernelSize benchmarks + for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES): + + def createTorchLinearKernelSizeBench(variant_name: str, **kwargs): + return TorchLinearKernelSize( + self, + runtime, + variant_name, + PROFILERS.TIMER, + **kwargs, + ) + + benches += [ + createTorchLinearKernelSizeBench( + "array32", + kernelBatchSize=512, + kernelSize=32, + ), + createTorchLinearKernelSizeBench( + "array128", + kernelBatchSize=512, + kernelSize=128, + ), + createTorchLinearKernelSizeBench( + "array512", + kernelBatchSize=512, + kernelSize=512, + ), + createTorchLinearKernelSizeBench( + "array1024", + kernelBatchSize=512, + kernelSize=1024, + ), + createTorchLinearKernelSizeBench( + "array5120", + kernelBatchSize=512, + kernelSize=5120, + ), + ] + # Add UR-specific benchmarks benches += [ # TODO: multithread_benchmark_ur fails with segfault @@ -916,6 +956,20 @@ def __init__( ) +class TorchLinearKernelSize(TorchBenchmark): + def __init__( + self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs + ): + super().__init__( + suite, + runtime, + "KernelSubmitLinearKernelSize", + variant_name, + profiler_type, + **kwargs, + ) + + class QueueInOrderMemcpy(ComputeBenchmark): def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type): self._is_copy_only = isCopyOnly diff --git a/devops/scripts/benchmarks/tests/test_integration.py b/devops/scripts/benchmarks/tests/test_integration.py index f35b06ed1fc1..ef98e02a09bd 100644 --- a/devops/scripts/benchmarks/tests/test_integration.py +++ b/devops/scripts/benchmarks/tests/test_integration.py @@ -93,8 +93,14 @@ def run_main(self, *args): ], capture_output=True, ) - print("MAIN_PY_STDOUT:\n" + proc.stdout.decode() if proc.stdout else "") - print("MAIN_PY_STDERR:\n" + proc.stderr.decode() if proc.stderr else "") + print( + "MAIN_PY_STDOUT:", + "\n" + proc.stdout.decode() if proc.stdout else " ", + ) + print( + "MAIN_PY_STDERR:", + "\n" + proc.stderr.decode() if proc.stderr else " ", + ) return proc.returncode def get_output(self): @@ -199,6 +205,11 @@ def test_torch_l0(self): "KernelSubmitSlmSize small", {"pytorch", "L0"}, ) + self._checkCase( + "torch_benchmark_l0 kernelBatchSize 512, kernelSize 32", + "KernelSubmitLinearKernelSize array32", + {"pytorch", "L0"}, + ) def test_torch_sycl(self): self._checkCase( @@ -211,6 +222,11 @@ def test_torch_sycl(self): "KernelSubmitSlmSize max", {"pytorch", "SYCL"}, ) + self._checkCase( + "torch_benchmark_sycl kernelBatchSize 512, kernelSize 5120", + "KernelSubmitLinearKernelSize array5120", + {"pytorch", "SYCL"}, + ) def test_torch_syclpreview(self): self._checkCase( @@ -223,6 +239,11 @@ def test_torch_syclpreview(self): "KernelSubmitSlmSize medium", {"pytorch", "SYCL"}, ) + self._checkCase( + "torch_benchmark_syclpreview kernelBatchSize 512, kernelSize 512", + "KernelSubmitLinearKernelSize array512", + {"pytorch", "SYCL"}, + ) if __name__ == "__main__":