diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index bdf263f21c754..abc15801d88fb 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -54,7 +54,7 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"

     def git_hash(self) -> str:
-        return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
+        return "4995560017559849a519e58978a0afdd55903e15"

     def setup(self) -> None:
         if options.sycl is None:
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
         # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]

+        # The Combo Profiler is available only for selected sycl benchmarks
+        profiler_types = ["timer", "cpuCounter"]
+
         for runtime in list(RUNTIMES):
             # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
                             else long_kernel_exec_time_ooo
                         )
                         for kernel_exec_time in [1, *long_kernel_exec_time]:
-                            benches.append(
-                                SubmitKernel(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    measure_completion,
-                                    use_events,
-                                    kernel_exec_time,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    SubmitKernel(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        measure_completion,
+                                        use_events,
+                                        kernel_exec_time,
+                                        profiler_type,
+                                    )
                                 )
-                            )

             # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
@@ -207,51 +212,69 @@ def benchmarks(self) -> list[Benchmark]:
                     )

             # Add ULLS benchmarks
-            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
+            for profiler_type in profiler_types:
+                benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))

             # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
-                benches.append(
-                    GraphApiSubmitGraph(
-                        self,
-                        runtime,
-                        in_order_queue,
-                        self.submit_graph_num_kernels[-1],
-                        0,
-                        useEvents=0,
-                        useHostTasks=1,
+                for profiler_type in profiler_types:
+                    benches.append(
+                        GraphApiSubmitGraph(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            self.submit_graph_num_kernels[-1],
+                            0,
+                            profiler_type,
+                            useEvents=0,
+                            useHostTasks=1,
+                        )
                     )
-                )
                 for num_kernels in self.submit_graph_num_kernels:
                     for measure_completion_time in [0, 1]:
                         for use_events in [0, 1]:
-                            benches.append(
-                                GraphApiSubmitGraph(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    num_kernels,
-                                    measure_completion_time,
-                                    use_events,
-                                    useHostTasks=0,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    GraphApiSubmitGraph(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        num_kernels,
+                                        measure_completion_time,
+                                        profiler_type,
+                                        use_events,
+                                        useHostTasks=0,
+                                    )
                                 )
-                            )

         # Add other benchmarks
         benches += [
-            QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
-            QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
-            QueueMemcpy(self, "Device", "Device", 1024),
             StreamMemory(self, "Triad", 10 * 1024, "Device"),
-            ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
-            ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
             VectorSum(self),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
         ]
+        for profiler_type in profiler_types:
+            benches.append(
+                QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
+            )
+            benches.append(
+                QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
+            )
+            benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
+            benches.append(
+                ExecImmediateCopyQueue(
+                    self, 0, 1, "Device", "Device", 1024, profiler_type
+                )
+            )
+            benches.append(
+                ExecImmediateCopyQueue(
+                    self, 1, 1, "Device", "Host", 1024, profiler_type
+                )
+            )

         # Add UR-specific benchmarks
         benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):


 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
+    def __init__(
+        self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
+    ):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
         self.runtime = runtime
+        self.profiler_type = profiler_type
         # Mandatory per-benchmark iteration counts.
         # Subclasses MUST set both `self.iterations_regular` and
         # `self.iterations_trace` (positive ints) in their __init__ before
@@ -465,6 +491,7 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
+        profiler_type="",
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
         self.iterations_regular = 100000
         self.iterations_trace = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "SubmitKernel",
+            runtime,
+            profiler_type,
         )

     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -486,9 +517,14 @@ def enabled(self) -> bool:
         # The benchmark instance gets created just to make metadata for these old results
         if not super().enabled():
             return False
-        if "bmg" in options.device_architecture and self.KernelExecTime == 20:
+
+        device_arch = getattr(options, "device_architecture", "")
+        if "bmg" in device_arch and self.KernelExecTime == 20:
             # Disable this benchmark for BMG server, just create metadata
             return False
+        if "bmg" not in device_arch and self.KernelExecTime == 200:
+            # Disable KernelExecTime=200 for non-BMG systems, just create metadata
+            return False
         return True

     def get_tags(self):
@@ -545,7 +581,7 @@ def range(self) -> tuple[float, float]:

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        return [
+        bin_args = [
             f"--iterations={iters}",
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
@@ -554,6 +590,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -573,7 +612,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:


 class ExecImmediateCopyQueue(ComputeBenchmark):
-    def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
+    def __init__(
+        self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
+    ):
         self.ioq = ioq
         self.isCopyOnly = isCopyOnly
         self.source = source
@@ -582,7 +623,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
         # iterations per bin_args: --iterations=100000
         self.iterations_regular = 100000
         self.iterations_trace = 10
-        super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
+        super().__init__(
+            bench,
+            "api_overhead_benchmark_sycl",
+            "ExecImmediateCopyQueue",
+            profiler_type=profiler_type,
+        )

     def name(self):
         order = "in order" if self.ioq else "out of order"
@@ -614,11 +660,12 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]


 class QueueInOrderMemcpy(ComputeBenchmark):
-    def __init__(self, bench, isCopyOnly, source, destination, size):
+    def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
         self.isCopyOnly = isCopyOnly
         self.source = source
         self.destination = destination
@@ -626,7 +673,12 @@ def __init__(self, bench, isCopyOnly, source, destination, size):
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
-        super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
+        super().__init__(
+            bench,
+            "memory_benchmark_sycl",
+            "QueueInOrderMemcpy",
+            profiler_type=profiler_type,
+        )

     def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -654,18 +706,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]


 class QueueMemcpy(ComputeBenchmark):
-    def __init__(self, bench, source, destination, size):
+    def __init__(self, bench, source, destination, size, profiler_type):
         self.source = source
         self.destination = destination
         self.size = size
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
-        super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
+        super().__init__(
+            bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
+        )

     def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -689,6 +744,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
+            f"--profilerType={self.profiler_type}",
         ]


@@ -927,6 +983,7 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
+        profiler_type,
         useEvents,
         useHostTasks,
     ):
@@ -945,7 +1002,11 @@ def __init__(
         self.iterations_regular = 10000
         self.iterations_trace = 10
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+            bench,
+            f"graph_api_benchmark_{runtime.value}",
+            "SubmitGraph",
+            runtime,
+            profiler_type,
         )

     def explicit_group(self):
@@ -974,7 +1035,7 @@ def get_tags(self):

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        return [
+        bin_args = [
             f"--iterations={iters}",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -985,17 +1046,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args


 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
+    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
         self.wgc = wgc
         self.wgs = wgs
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
         super().__init__(
-            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+            bench,
+            f"ulls_benchmark_{runtime.value}",
+            "EmptyKernel",
+            runtime,
+            profiler_type,
         )

     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -1020,11 +1088,14 @@ def get_tags(self):

     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        return [
+        bin_args = [
             f"--iterations={iters}",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args


 class UllsKernelSwitch(ComputeBenchmark):
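A minimal, self-contained sketch of the pattern this diff introduces: each benchmark is now registered once per entry in profiler_types, and --profilerType is appended to the binary's arguments only for SYCL runtimes. The trimmed RUNTIMES enum, the build_bin_args helper, and the iteration count below are hypothetical stand-ins for the real ComputeBenchmark machinery, not part of the patch.

# Illustrative sketch only (assumed names); mirrors the conditional
# --profilerType handling added to the bin_args() methods in the diff.
from enum import Enum


class RUNTIMES(Enum):
    SYCL = "sycl"
    LEVEL_ZERO = "l0"


def build_bin_args(runtime: RUNTIMES, iterations: int, profiler_type: str) -> list[str]:
    # Collect the base arguments first, as the reworked bin_args() methods do.
    bin_args = [f"--iterations={iterations}"]
    # The Combo Profiler is only wired up for selected SYCL benchmarks, so the
    # flag is appended conditionally instead of unconditionally.
    if runtime == RUNTIMES.SYCL:
        bin_args.append(f"--profilerType={profiler_type}")
    return bin_args


if __name__ == "__main__":
    profiler_types = ["timer", "cpuCounter"]  # same values the diff registers
    # One benchmark variant per profiler backend, as in the new loops above.
    for profiler_type in profiler_types:
        print(build_bin_args(RUNTIMES.SYCL, 10000, profiler_type))
    # Non-SYCL runtimes never receive the flag.
    print(build_bin_args(RUNTIMES.LEVEL_ZERO, 10000, "timer"))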