173 changes: 122 additions & 51 deletions devops/scripts/benchmarks/benches/compute.py
@@ -54,7 +54,7 @@ def git_url(self) -> str:
return "https://github.com/intel/compute-benchmarks.git"

def git_hash(self) -> str:
return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
return "4995560017559849a519e58978a0afdd55903e15"

def setup(self) -> None:
if options.sycl is None:
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
# See SubmitKernel.enabled()
long_kernel_exec_time_ooo = [20, 200]

# The Combo Profiler is available only for selected sycl benchmarks
profiler_types = ["timer", "cpuCounter"]

for runtime in list(RUNTIMES):
# Add SubmitKernel benchmarks using loops
for in_order_queue in [0, 1]:
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
else long_kernel_exec_time_ooo
)
for kernel_exec_time in [1, *long_kernel_exec_time]:
benches.append(
SubmitKernel(
self,
runtime,
in_order_queue,
measure_completion,
use_events,
kernel_exec_time,
for profiler_type in profiler_types:
benches.append(
SubmitKernel(
self,
runtime,
in_order_queue,
measure_completion,
use_events,
kernel_exec_time,
profiler_type,
)
)
)

# Add SinKernelGraph benchmarks
for with_graphs in [0, 1]:
@@ -207,51 +212,69 @@
)

# Add ULLS benchmarks
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
for profiler_type in profiler_types:
benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))

# Add GraphApiSubmitGraph benchmarks
for in_order_queue in [0, 1]:
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
self.submit_graph_num_kernels[-1],
0,
useEvents=0,
useHostTasks=1,
for profiler_type in profiler_types:
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
self.submit_graph_num_kernels[-1],
0,
profiler_type,
useEvents=0,
useHostTasks=1,
)
)
)
for num_kernels in self.submit_graph_num_kernels:
for measure_completion_time in [0, 1]:
for use_events in [0, 1]:
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
num_kernels,
measure_completion_time,
use_events,
useHostTasks=0,
for profiler_type in profiler_types:
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
num_kernels,
measure_completion_time,
profiler_type,
use_events,
useHostTasks=0,
)
)
)

# Add other benchmarks
benches += [
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
QueueMemcpy(self, "Device", "Device", 1024),
StreamMemory(self, "Triad", 10 * 1024, "Device"),
ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
VectorSum(self),
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
]
for profiler_type in profiler_types:
benches.append(
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
)
benches.append(
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
)
benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
benches.append(
ExecImmediateCopyQueue(
self, 0, 1, "Device", "Device", 1024, profiler_type
)
)
benches.append(
ExecImmediateCopyQueue(
self, 1, 1, "Device", "Host", 1024, profiler_type
)
)

# Add UR-specific benchmarks
benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):


class ComputeBenchmark(Benchmark):
def __init__(self, bench, name, test, runtime: RUNTIMES = None):
def __init__(
self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
):
super().__init__(bench.directory, bench)
self.bench = bench
self.bench_name = name
self.test = test
self.runtime = runtime
self.profiler_type = profiler_type
# Mandatory per-benchmark iteration counts.
# Subclasses MUST set both `self.iterations_regular` and
# `self.iterations_trace` (positive ints) in their __init__ before
Expand Down Expand Up @@ -465,6 +491,7 @@ def __init__(
MeasureCompletion=0,
UseEvents=0,
KernelExecTime=1,
profiler_type="",
):
self.ioq = ioq
self.MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
self.iterations_regular = 100000
self.iterations_trace = 10
super().__init__(
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
bench,
f"api_overhead_benchmark_{runtime.value}",
"SubmitKernel",
runtime,
profiler_type,
)

def supported_runtimes(self) -> list[RUNTIMES]:
@@ -486,9 +517,14 @@ def enabled(self) -> bool:
# The benchmark instance gets created just to make metadata for these old results
if not super().enabled():
return False
if "bmg" in options.device_architecture and self.KernelExecTime == 20:

device_arch = getattr(options, "device_architecture", "")
if "bmg" in device_arch and self.KernelExecTime == 20:
# Disable this benchmark for BMG server, just create metadata
return False
if "bmg" not in device_arch and self.KernelExecTime == 200:
# Disable KernelExecTime=200 for non-BMG systems, just create metadata
return False
return True

def get_tags(self):
@@ -545,7 +581,7 @@ def range(self) -> tuple[float, float]:

def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
iters = self.get_iters(run_trace)
return [
bin_args = [
f"--iterations={iters}",
f"--Ioq={self.ioq}",
f"--MeasureCompletion={self.MeasureCompletion}",
@@ -554,6 +590,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--KernelExecTime={self.KernelExecTime}",
f"--UseEvents={self.UseEvents}",
]
if self.runtime == RUNTIMES.SYCL:
bin_args.append(f"--profilerType={self.profiler_type}")
return bin_args

def get_metadata(self) -> dict[str, BenchmarkMetadata]:
metadata_dict = super().get_metadata()
@@ -573,7 +612,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:


class ExecImmediateCopyQueue(ComputeBenchmark):
def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
def __init__(
self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
):
self.ioq = ioq
self.isCopyOnly = isCopyOnly
self.source = source
@@ -582,7 +623,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
# iterations per bin_args: --iterations=100000
self.iterations_regular = 100000
self.iterations_trace = 10
super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
super().__init__(
bench,
"api_overhead_benchmark_sycl",
"ExecImmediateCopyQueue",
profiler_type=profiler_type,
)

def name(self):
order = "in order" if self.ioq else "out of order"
@@ -614,19 +660,25 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--dst={self.destination}",
f"--size={self.size}",
"--withCopyOffload=0",
f"--profilerType={self.profiler_type}",
]


class QueueInOrderMemcpy(ComputeBenchmark):
def __init__(self, bench, isCopyOnly, source, destination, size):
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
self.isCopyOnly = isCopyOnly
self.source = source
self.destination = destination
self.size = size
# iterations per bin_args: --iterations=10000
self.iterations_regular = 10000
self.iterations_trace = 10
super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
super().__init__(
bench,
"memory_benchmark_sycl",
"QueueInOrderMemcpy",
profiler_type=profiler_type,
)

def name(self):
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -654,18 +706,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--size={self.size}",
"--count=100",
"--withCopyOffload=0",
f"--profilerType={self.profiler_type}",
]


class QueueMemcpy(ComputeBenchmark):
def __init__(self, bench, source, destination, size):
def __init__(self, bench, source, destination, size, profiler_type):
self.source = source
self.destination = destination
self.size = size
# iterations per bin_args: --iterations=10000
self.iterations_regular = 10000
self.iterations_trace = 10
super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
super().__init__(
bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
)

def name(self):
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -689,6 +744,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--sourcePlacement={self.source}",
f"--destinationPlacement={self.destination}",
f"--size={self.size}",
f"--profilerType={self.profiler_type}",
]


@@ -927,6 +983,7 @@ def __init__(
inOrderQueue,
numKernels,
measureCompletionTime,
profiler_type,
useEvents,
useHostTasks,
):
@@ -945,7 +1002,11 @@ def __init__(
self.iterations_regular = 10000
self.iterations_trace = 10
super().__init__(
bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
bench,
f"graph_api_benchmark_{runtime.value}",
"SubmitGraph",
runtime,
profiler_type,
)

def explicit_group(self):
@@ -974,7 +1035,7 @@ def get_tags(self):

def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
iters = self.get_iters(run_trace)
return [
bin_args = [
f"--iterations={iters}",
f"--NumKernels={self.numKernels}",
f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -985,17 +1046,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
"--UseExplicit=0",
f"--UseHostTasks={self.useHostTasks}",
]
if self.runtime == RUNTIMES.SYCL:
bin_args.append(f"--profilerType={self.profiler_type}")
return bin_args


class UllsEmptyKernel(ComputeBenchmark):
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
self.wgc = wgc
self.wgs = wgs
# iterations per bin_args: --iterations=10000
self.iterations_regular = 10000
self.iterations_trace = 10
super().__init__(
bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
bench,
f"ulls_benchmark_{runtime.value}",
"EmptyKernel",
runtime,
profiler_type,
)

def supported_runtimes(self) -> list[RUNTIMES]:
@@ -1020,11 +1088,14 @@ def get_tags(self):

def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
iters = self.get_iters(run_trace)
return [
bin_args = [
f"--iterations={iters}",
f"--wgs={self.wgs}",
f"--wgc={self.wgc}",
]
if self.runtime == RUNTIMES.SYCL:
bin_args.append(f"--profilerType={self.profiler_type}")
return bin_args


class UllsKernelSwitch(ComputeBenchmark):
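
Note (illustrative, not part of the diff above): the core pattern this change introduces is a profiler_type constructor argument that is threaded through each benchmark and consumed only by the SYCL binaries, via a conditionally appended --profilerType flag in bin_args(). Below is a minimal, self-contained sketch of that plumbing under simplified assumptions; MiniBenchmark and the trimmed-down RUNTIMES enum are hypothetical stand-ins, not code from the repository.

# Minimal sketch of the profiler_type plumbing (hypothetical stand-in classes).
from enum import Enum


class RUNTIMES(Enum):
    SYCL = "sycl"
    UR = "ur"
    LEVEL_ZERO = "l0"


class MiniBenchmark:
    """Toy benchmark mirroring how profiler_type flows from __init__ to bin_args."""

    def __init__(self, runtime: RUNTIMES, profiler_type: str = ""):
        self.runtime = runtime
        self.profiler_type = profiler_type

    def bin_args(self) -> list[str]:
        args = ["--iterations=10"]
        # The profiler flag is only understood by the SYCL benchmark binaries,
        # so it is appended conditionally, as in the patched bin_args() methods.
        if self.runtime == RUNTIMES.SYCL:
            args.append(f"--profilerType={self.profiler_type}")
        return args


if __name__ == "__main__":
    # Enumerate both profiler types for SYCL, as the benchmarks() loop does.
    for profiler_type in ["timer", "cpuCounter"]:
        print(MiniBenchmark(RUNTIMES.SYCL, profiler_type).bin_args())
    # Non-SYCL runtimes never receive --profilerType.
    print(MiniBenchmark(RUNTIMES.UR).bin_args())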