From bfc68039d5230fb4cc8897443db3486f63470d00 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 5 Jan 2023 15:16:14 +0100 Subject: [PATCH 01/20] #5: Create blas2 gemv benchmark test --- perf_test/CMakeLists.txt | 1 + .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 122 ++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index d46b85b4d7..fc2ddc5d62 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -61,5 +61,6 @@ if(KokkosKernels_ENABLE_BENCHMARK) blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp + blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp ) endif() diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp new file mode 100644 index 0000000000..a0fbff639f --- /dev/null +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -0,0 +1,122 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosBlas2_gemv.hpp" +#include + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + const auto repeat = state.range(2); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + std::cout << "Running GEMV experiment (" << ExecSpace::name() << ")\n"; + + // Create a View containing a 2D matrix; allocate KokkosView with template + // args of Scalar**, a layout, and + Kokkos::View A( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + // Create Views containing 1D matrix; allocate (without) matrix "x" of size n + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), n); + // Create Views containing 1D matrix; allocate (without) matrix "y" of size m + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m); + + // Declaring variable pool w/ a number seed; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + // Fill 2D Matrix "A" and 1D matrix (i.e., a vector) "x" with random values; + // Here, 10 is the max value of the random generator between 1 and 10 + // (uniform ) + Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(x, pool, 10.0); + + for (auto _ : state) { + // Do a warm-up run + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + + // Start timing + Kokkos::fence(); + Kokkos::Timer timer; + for (int i = 0; i < repeat; i++) { + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + ExecSpace().fence(); + } + + // Kokkos Timer set up + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation + size_t flopsPerRun = (size_t)2 * m * n; + printf("Avg GEMV time: %f s.\n", avg); + printf("Avg GEMV FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg GEMV time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg GEMV FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->Name("KokkosBlas2_gemv") + ->ArgNames({"m", "n", "repeat"}) + ->Args({5000, 5000, 1}) + ->UseManualTime(); From 3b8c2da3d0e174aadc38a63d6e1335d9d9841abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:14:35 +0100 Subject: [PATCH 02/20] Remove redundant output - remove redundant print statements - use meaningful benchmark name and pass configuration via arguments --- .../KokkosBlas_dot_mv_perf_test_benchmark.cpp | 2 -- .../KokkosBlas_dot_perf_test_benchmark.cpp | 2 -- ...okkosBlas_team_dot_perf_test_benchmark.cpp | 2 -- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 21 ++++++------------- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index 1e537ceadc..c0a01eaff5 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -124,8 +124,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m * n; - printf("Avg DOT time: %f s.\n", avg); - printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index 14957994d1..fd4513d7d2 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -122,8 +122,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; - printf("Avg DOT time: %f s.\n", avg); - printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 165f7fe6db..2764da9556 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -128,8 +128,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; - printf("Avg DOT time: %f s.\n", avg); - printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index a0fbff639f..69796d7132 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -48,23 +48,17 @@ #include "KokkosBlas2_gemv.hpp" #include -template -static void run(benchmark::State& state) { +template +static void KokkosBlas2_gemv(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); const auto repeat = state.range(2); // Declare type aliases + using ExecSpace = Kokkos::DefaultExecutionSpace; using Scalar = double; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" - << ExecSpace::name() << ")\n"; - - std::cout << "Each test input vector has a length of " << m << std::endl; - - std::cout << "Running GEMV experiment (" << ExecSpace::name() << ")\n"; - // Create a View containing a 2D matrix; allocate KokkosView with template // args of Scalar**, a layout, and Kokkos::View A( @@ -104,8 +98,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation size_t flopsPerRun = (size_t)2 * m * n; - printf("Avg GEMV time: %f s.\n", avg); - printf("Avg GEMV FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg GEMV time (s):"] = @@ -115,8 +107,7 @@ static void run(benchmark::State& state) { } } -BENCHMARK(run) - ->Name("KokkosBlas2_gemv") - ->ArgNames({"m", "n", "repeat"}) - ->Args({5000, 5000, 1}) +BENCHMARK(KokkosBlas2_gemv) + ->ArgNames({"m", "n", "repeat", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1, 1}) ->UseManualTime(); From e87d532c2a48dd0f8315469567342316df076739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:29:24 +0100 Subject: [PATCH 03/20] Let benchmark decide the number of repetitions --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 69796d7132..596a774073 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -50,14 +50,14 @@ template static void KokkosBlas2_gemv(benchmark::State& state) { - const auto m = state.range(0); - const auto n = state.range(1); - const auto repeat = state.range(2); + const auto m = state.range(0); + const auto n = state.range(1); + // Declare type aliases using ExecSpace = Kokkos::DefaultExecutionSpace; - using Scalar = double; - using MemSpace = typename ExecSpace::memory_space; - using Device = Kokkos::Device; + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; // Create a View containing a 2D matrix; allocate KokkosView with template // args of Scalar**, a layout, and @@ -88,26 +88,23 @@ static void KokkosBlas2_gemv(benchmark::State& state) { // Start timing Kokkos::fence(); Kokkos::Timer timer; - for (int i = 0; i < repeat; i++) { - KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); - ExecSpace().fence(); - } + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + ExecSpace().fence(); // Kokkos Timer set up - double total = timer.seconds(); - double avg = total / repeat; + double time = timer.seconds(); // Flops calculation size_t flopsPerRun = (size_t)2 * m * n; state.SetIterationTime(timer.seconds()); state.counters["Avg GEMV time (s):"] = - benchmark::Counter(avg, benchmark::Counter::kDefaults); + benchmark::Counter(time, benchmark::Counter::kDefaults); state.counters["Avg GEMV FLOP/s:"] = - benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + benchmark::Counter(flopsPerRun / time, benchmark::Counter::kDefaults); } } BENCHMARK(KokkosBlas2_gemv) - ->ArgNames({"m", "n", "repeat", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1, 1}) + ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1}) ->UseManualTime(); From 0678b55b19ca3a5d022c712e962edb7368132b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:46:05 +0100 Subject: [PATCH 04/20] Include scalar type in the output --- .../blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 596a774073..adf1e51c59 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -48,14 +48,13 @@ #include "KokkosBlas2_gemv.hpp" #include -template +template static void KokkosBlas2_gemv(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); // Declare type aliases using ExecSpace = Kokkos::DefaultExecutionSpace; - using Scalar = double; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; @@ -104,7 +103,7 @@ static void KokkosBlas2_gemv(benchmark::State& state) { } } -BENCHMARK(KokkosBlas2_gemv) +BENCHMARK(KokkosBlas2_gemv) ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) ->Args({5000, 5000, 1}) ->UseManualTime(); From 6d027010ab47f2dc41d79122d56700658b99543b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:50:51 +0100 Subject: [PATCH 05/20] Let benchmark calculate FLOP/s --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index adf1e51c59..3cb85dcadf 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -98,8 +98,8 @@ static void KokkosBlas2_gemv(benchmark::State& state) { state.counters["Avg GEMV time (s):"] = benchmark::Counter(time, benchmark::Counter::kDefaults); - state.counters["Avg GEMV FLOP/s:"] = - benchmark::Counter(flopsPerRun / time, benchmark::Counter::kDefaults); + state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } } From 7336d9c2fd4647c63daf90cc4c4cb4bab5e3c012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 14:07:11 +0100 Subject: [PATCH 06/20] Add a benchmark for LayoutRight --- .../blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 3cb85dcadf..550115ce94 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -107,3 +107,8 @@ BENCHMARK(KokkosBlas2_gemv) ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) ->Args({5000, 5000, 1}) ->UseManualTime(); + +BENCHMARK(KokkosBlas2_gemv) + ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1}) + ->UseManualTime(); From b3da125585d21b8b9bfeb053814b699a1f7b2343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 19:31:11 +0100 Subject: [PATCH 07/20] Use correct header --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 38 +++---------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 550115ce94..fa583e624d 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -1,46 +1,18 @@ -/* //@HEADER // ************************************************************************ // -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ //@HEADER -*/ #include #include From 278d18fac954fefc1dce51300c04034cfa896087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 20:15:18 +0100 Subject: [PATCH 08/20] Use stored time value --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index fa583e624d..6e62fe09fc 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -66,7 +66,7 @@ static void KokkosBlas2_gemv(benchmark::State& state) { double time = timer.seconds(); // Flops calculation size_t flopsPerRun = (size_t)2 * m * n; - state.SetIterationTime(timer.seconds()); + state.SetIterationTime(time); state.counters["Avg GEMV time (s):"] = benchmark::Counter(time, benchmark::Counter::kDefaults); From 6c21c4df2585b0f6f3ac29b5acd466ee27ec0e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 28 Mar 2023 17:26:36 +0200 Subject: [PATCH 09/20] Revert changes to blas1 benchmark --- perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp | 2 ++ perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp | 2 ++ .../blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index c0a01eaff5..1e537ceadc 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -124,6 +124,8 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m * n; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index fd4513d7d2..14957994d1 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -122,6 +122,8 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 2764da9556..165f7fe6db 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -128,6 +128,8 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = From 24923b79e40d2069f08890c1e645eb9ad03630d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 17:54:44 +0200 Subject: [PATCH 10/20] Use separate executable --- cmake/kokkoskernels_benchmarks.cmake | 4 +--- perf_test/CMakeLists.txt | 2 +- perf_test/blas/blas2/CMakeLists.txt | 7 +++++++ .../blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 14 ++++++++++++++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake index 7bb262247d..3a38feee88 100644 --- a/cmake/kokkoskernels_benchmarks.cmake +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -30,8 +30,6 @@ ELSE() TARGET_COMPILE_OPTIONS(benchmark_main PRIVATE -w) ENDIF() -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) CMAKE_PARSE_ARGUMENTS( BENCHMARK @@ -53,7 +51,7 @@ FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) ADD_EXECUTABLE( ${BENCHMARK_NAME} - ${CMAKE_SOURCE_DIR}/perf_test/BenchmarkMain.cpp ${BENCHMARK_SOURCES} + ${BENCHMARK_SOURCES} ) TARGET_LINK_LIBRARIES( ${BENCHMARK_NAME} diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index fc2ddc5d62..cf1905d6d4 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -61,6 +61,6 @@ if(KokkosKernels_ENABLE_BENCHMARK) blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp - blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp + BenchmarkMain.cpp ) endif() diff --git a/perf_test/blas/blas2/CMakeLists.txt b/perf_test/blas/blas2/CMakeLists.txt index f69c576cd3..9c2aa424d1 100644 --- a/perf_test/blas/blas2/CMakeLists.txt +++ b/perf_test/blas/blas2/CMakeLists.txt @@ -5,3 +5,10 @@ KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas2_gemv_perf_test SOURCES KokkosBlas2_gemv_perf_test.cpp ) + +IF(KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + Blas2_Benchmark + SOURCES KokkosBlas2_gemv_perf_test_benchmark.cpp + ) +ENDIF() diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 6e62fe09fc..d116e3fdd2 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -18,6 +18,7 @@ #include #include "KokkosBlas2_gemv.hpp" +#include #include template @@ -84,3 +85,16 @@ BENCHMARK(KokkosBlas2_gemv) ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) ->Args({5000, 5000, 1}) ->UseManualTime(); + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + return 0; +} From 10dc298b515404dda135d05bdf044e434b66a1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:00:44 +0200 Subject: [PATCH 11/20] Move warm-up out of benchmarking loop --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index d116e3fdd2..652d7ae806 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -53,24 +53,24 @@ static void KokkosBlas2_gemv(benchmark::State& state) { Kokkos::fill_random(A, pool, 10.0); Kokkos::fill_random(x, pool, 10.0); - for (auto _ : state) { - // Do a warm-up run - KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + // Do a warm-up run + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + Kokkos::fence(); + double total_time = 0.0; + for (auto _ : state) { // Start timing - Kokkos::fence(); Kokkos::Timer timer; KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); ExecSpace().fence(); - // Kokkos Timer set up double time = timer.seconds(); - // Flops calculation + total_time += time; size_t flopsPerRun = (size_t)2 * m * n; - state.SetIterationTime(time); + state.SetIterationTime(time); state.counters["Avg GEMV time (s):"] = - benchmark::Counter(time, benchmark::Counter::kDefaults); + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } From 1d70e7aebe5e1c13aa80a514c35ab91eaff63b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:20:52 +0200 Subject: [PATCH 12/20] Parse common parameters --- .../blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 652d7ae806..0a5c18a7f1 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -18,6 +18,10 @@ #include #include "KokkosBlas2_gemv.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + #include #include @@ -92,6 +96,15 @@ int main(int argc, char** argv) { benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + benchmark::RegisterBenchmark("KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1}) + ->UseManualTime(); + benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); From 03728a8b8ddd2f301c0a4b1a839dbc6d4d973903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:31:17 +0200 Subject: [PATCH 13/20] Use CMake helper for ODE_RK benchmark --- perf_test/ode/CMakeLists.txt | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/perf_test/ode/CMakeLists.txt b/perf_test/ode/CMakeLists.txt index 67d0c421fb..b4aa86889f 100644 --- a/perf_test/ode/CMakeLists.txt +++ b/perf_test/ode/CMakeLists.txt @@ -2,22 +2,7 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) if(KOKKOSKERNELS_ENABLE_BENCHMARK) - SET(BENCHMARK_NAME ${PACKAGE_NAME}_ode_runge_kutta) - - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - KokkosODE_RK.cpp - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkoskernels - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} + KOKKOSKERNELS_ADD_BENCHMARK( + ode_runge_kutta SOURCES KokkosODE_RK.cpp ) endif() From f38b56ab13db8c6da8d757c11640b9ca5c87efde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:35:17 +0200 Subject: [PATCH 14/20] Let benchmark decide number of iterations Let benchmark decide how many iterations will be run when --repeat is not provided. --- perf_test/ode/KokkosODE_RK.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index 4f6e53e143..e9dc3f2f8e 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -353,8 +353,7 @@ int main(int argc, char** argv) { run_benchmark_wrapper, argc, argv) ->UseRealTime() ->ArgNames({"n", "model"}) - ->Args({1000, 1}) - ->Iterations(common_params.repeat); + ->Args({1000, 1}); } benchmark::RunSpecifiedBenchmarks(); From 34a228689178971f210e0a585981b82cdfb30df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 19:35:26 +0200 Subject: [PATCH 15/20] Parse blas2 custom command line parameters --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 112 ++++++++++++++---- 1 file changed, 87 insertions(+), 25 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 0a5c18a7f1..14e67a803d 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -25,13 +25,47 @@ #include #include -template +struct blas2_gemv_params : public perf_test::CommonInputParams { + int m = 5000; + int n = 5000; + // bool layoutLeft = true; +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; +} + +blas2_gemv_params parse_blas2_gemv_options(int& argc, char** argv) { + blas2_gemv_params params; + perf_test::parse_common_options(argc, argv, params); + + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return params; + } + } + return params; +} + +template static void KokkosBlas2_gemv(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); // Declare type aliases - using ExecSpace = Kokkos::DefaultExecutionSpace; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; @@ -70,25 +104,16 @@ static void KokkosBlas2_gemv(benchmark::State& state) { double time = timer.seconds(); total_time += time; - size_t flopsPerRun = (size_t)2 * m * n; - state.SetIterationTime(time); - state.counters["Avg GEMV time (s):"] = - benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); - state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( - flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } -} -BENCHMARK(KokkosBlas2_gemv) - ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1}) - ->UseManualTime(); - -BENCHMARK(KokkosBlas2_gemv) - ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1}) - ->UseManualTime(); + state.counters[ExecSpace::name()] = 1; + state.counters["Avg GEMV time (s):"] = + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); + size_t flopsPerRun = (size_t)2 * m * n; + state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); +} int main(int argc, char** argv) { Kokkos::initialize(argc, argv); @@ -96,14 +121,51 @@ int main(int argc, char** argv) { benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); - perf_test::CommonInputParams common_params; - perf_test::parse_common_options(argc, argv, common_params); + const auto params = parse_blas2_gemv_options(argc, argv); + const auto arg_names = std::vector{"m", "n"}; + const auto args = std::vector{params.m, params.n}; + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + benchmark::RegisterBenchmark( + "KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + benchmark::RegisterBenchmark( + "KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } - benchmark::RegisterBenchmark("KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1}) - ->UseManualTime(); + if (true) { // serial +#if defined(KOKKOS_ENABLE_SERIAL) + benchmark::RegisterBenchmark( + "KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames({"m", "n"}) + ->Args({params.m, params.n}) + ->UseManualTime(); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } benchmark::RunSpecifiedBenchmarks(); From 3324854864331e09d554026f43f3ec39e33a5a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 20:45:48 +0200 Subject: [PATCH 16/20] Add registration wrapper --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 14e67a803d..93349fa061 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -115,24 +115,39 @@ static void KokkosBlas2_gemv(benchmark::State& state) { flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } +void register_benchmark(const char* name, void (*func)(benchmark::State&), + std::vector arg_names, + std::vector args, int repeat) { + if (repeat > 0) { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime() + ->Iterations(repeat); + } else { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); + } +} + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); + const auto name = "KokkosBlas2_gemv"; const auto params = parse_blas2_gemv_options(argc, argv); const auto arg_names = std::vector{"m", "n"}; const auto args = std::vector{params.m, params.n}; if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) - benchmark::RegisterBenchmark( - "KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime(); + register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -141,12 +156,9 @@ int main(int argc, char** argv) { if (params.use_cuda) { #if defined(KOKKOS_ENABLE_CUDA) - benchmark::RegisterBenchmark( - "KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime(); + register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; @@ -155,12 +167,9 @@ int main(int argc, char** argv) { if (true) { // serial #if defined(KOKKOS_ENABLE_SERIAL) - benchmark::RegisterBenchmark( - "KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames({"m", "n"}) - ->Args({params.m, params.n}) - ->UseManualTime(); + register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; From 35ee9ee7ed8fda903db10c5479483cff93362b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 11 Apr 2023 12:30:35 +0200 Subject: [PATCH 17/20] Fix formatting --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 93349fa061..44fcd20e16 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -66,8 +66,8 @@ static void KokkosBlas2_gemv(benchmark::State& state) { const auto n = state.range(1); // Declare type aliases - using MemSpace = typename ExecSpace::memory_space; - using Device = Kokkos::Device; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; // Create a View containing a 2D matrix; allocate KokkosView with template // args of Scalar**, a layout, and From 5d237f8b6e0c8e9c0f5fc62fefafceb36f091587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 11 Apr 2023 16:30:42 +0200 Subject: [PATCH 18/20] Support all command line parameters --- perf_test/Benchmark_Context.hpp | 18 ++ .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 191 ++++++++++++------ 2 files changed, 150 insertions(+), 59 deletions(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 16a7d4c4e8..e81b158d93 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -95,6 +95,24 @@ inline void add_benchmark_context(bool verbose = false) { add_version_info(); } +inline void register_benchmark(const char* name, + void (*func)(benchmark::State&), + std::vector arg_names, + std::vector args, int repeat) { + if (repeat > 0) { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime() + ->Iterations(repeat); + } else { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); + } +} + } // namespace KokkosKernelsBenchmark #endif diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 44fcd20e16..f37ddf3dc1 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -26,39 +26,55 @@ #include struct blas2_gemv_params : public perf_test::CommonInputParams { - int m = 5000; - int n = 5000; - // bool layoutLeft = true; -}; - -void print_options() { - std::cerr << "Options\n" << std::endl; - std::cerr << perf_test::list_common_options(); + int m = 5000; + int n = 5000; + bool layoutLeft = true; - std::cerr << "\t[Optional] --m :: number of rows to generate" - << std::endl; - std::cerr << "\t[Optional] --n :: number of cols to generate" - << std::endl; -} + static blas2_gemv_params get_params(int& argc, char** argv) { + blas2_gemv_params params; + perf_test::parse_common_options(argc, argv, params); -blas2_gemv_params parse_blas2_gemv_options(int& argc, char** argv) { - blas2_gemv_params params; - perf_test::parse_common_options(argc, argv, params); - - for (int i = 1; i < argc; ++i) { - if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { - ++i; - } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { - ++i; - } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; - print_options(); - return params; + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (std::string layout; + perf_test::check_arg_str(i, argc, argv, "--layout", layout)) { + if (0 == Test::string_compare_no_case(layout, "left")) + params.layoutLeft = true; + else if (0 == Test::string_compare_no_case(layout, "right")) + params.layoutLeft = false; + else { + std::cerr << "Invalid layout: must be 'left' or 'right'.\n"; + exit(1); + } + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + exit(1); + } } + return params; } - return params; -} + + static void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --m :: number of rows to generate (default 5000)" + << std::endl; + std::cerr + << "\t[Optional] --n :: number of cols to generate (default 5000)" + << std::endl; + std::cerr << "\t[Optional] --layout :: matrix layout ('left' or 'right', " + "default 'left')" + << std::endl; + } +}; template static void KokkosBlas2_gemv(benchmark::State& state) { @@ -115,23 +131,6 @@ static void KokkosBlas2_gemv(benchmark::State& state) { flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } -void register_benchmark(const char* name, void (*func)(benchmark::State&), - std::vector arg_names, - std::vector args, int repeat) { - if (repeat > 0) { - benchmark::RegisterBenchmark(name, func) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime() - ->Iterations(repeat); - } else { - benchmark::RegisterBenchmark(name, func) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime(); - } -} - int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); @@ -139,15 +138,37 @@ int main(int argc, char** argv) { KokkosKernelsBenchmark::add_benchmark_context(true); const auto name = "KokkosBlas2_gemv"; - const auto params = parse_blas2_gemv_options(argc, argv); - const auto arg_names = std::vector{"m", "n"}; - const auto args = std::vector{params.m, params.n}; + const auto params = blas2_gemv_params::get_params(argc, argv); + const auto arg_names = std::vector{ + "m", "n", params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; + const auto args = std::vector{params.m, params.n, 1}; + + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); +#else + std::cout << "ERROR: PThreads requested, but not available.\n"; + return 1; +#endif + } if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) - register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -156,20 +177,72 @@ int main(int argc, char** argv) { if (params.use_cuda) { #if defined(KOKKOS_ENABLE_CUDA) - register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; #endif } - if (true) { // serial + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + + // use serial if no backend is specified + if (!params.use_cuda and !params.use_hip and !params.use_openmp and + !params.use_sycl and !params.use_threads) { #if defined(KOKKOS_ENABLE_SERIAL) - register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; From 15d61698300b4fcd400846b7ff04a4e12f018d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 11 Apr 2023 18:32:33 +0200 Subject: [PATCH 19/20] Reduce duplication --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 90 ++++++------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index f37ddf3dc1..962328eb95 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -77,7 +77,7 @@ struct blas2_gemv_params : public perf_test::CommonInputParams { }; template -static void KokkosBlas2_gemv(benchmark::State& state) { +static void KokkosBlas2_GEMV(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); @@ -131,28 +131,37 @@ static void KokkosBlas2_gemv(benchmark::State& state) { flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } +template +void run(const blas2_gemv_params& params) { + using Scalar = double; + + const auto name = "KokkosBlas2_GEMV"; + const auto arg_names = std::vector{ + "m", "n", params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; + const auto args = std::vector{params.m, params.n, 1}; + + if (params.layoutLeft) { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GEMV, + arg_names, args, params.repeat); + } else { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GEMV, + arg_names, args, params.repeat); + } +} + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); - const auto name = "KokkosBlas2_gemv"; - const auto params = blas2_gemv_params::get_params(argc, argv); - const auto arg_names = std::vector{ - "m", "n", params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; - const auto args = std::vector{params.m, params.n, 1}; + const auto params = blas2_gemv_params::get_params(argc, argv); if (params.use_threads) { #if defined(KOKKOS_ENABLE_THREADS) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: PThreads requested, but not available.\n"; return 1; @@ -161,14 +170,7 @@ int main(int argc, char** argv) { if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -177,14 +179,7 @@ int main(int argc, char** argv) { if (params.use_cuda) { #if defined(KOKKOS_ENABLE_CUDA) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; @@ -193,18 +188,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; @@ -213,18 +197,7 @@ int main(int argc, char** argv) { if (params.use_sycl) { #if defined(KOKKOS_ENABLE_SYCL) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: SYCL requested, but not available.\n"; return 1; @@ -235,14 +208,7 @@ int main(int argc, char** argv) { if (!params.use_cuda and !params.use_hip and !params.use_openmp and !params.use_sycl and !params.use_threads) { #if defined(KOKKOS_ENABLE_SERIAL) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; From 97187c3af919b321f7bfffd190f8e54773618460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 12 Apr 2023 22:50:56 +0200 Subject: [PATCH 20/20] Allow passing additional arguments --- perf_test/Benchmark_Context.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index e81b158d93..3cfefbc057 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -95,18 +95,21 @@ inline void add_benchmark_context(bool verbose = false) { add_version_info(); } -inline void register_benchmark(const char* name, - void (*func)(benchmark::State&), +template +inline void register_benchmark(const char* name, FuncType func, std::vector arg_names, - std::vector args, int repeat) { + std::vector args, int repeat, + ArgsToCallOp&&... func_args) { if (repeat > 0) { - benchmark::RegisterBenchmark(name, func) + benchmark::RegisterBenchmark(name, func, + std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime() ->Iterations(repeat); } else { - benchmark::RegisterBenchmark(name, func) + benchmark::RegisterBenchmark(name, func, + std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime();