Merging occupancy tuning changes from David Poliakoff.

Note: This is a re-commit of a branch that somehow became polluted when I rebased on develop. I started over with the 5 changed files. The old Kokkos fork/branch from the remote davidp (git@github.com:DavidPoliakoff/kokkos.git) was merged with current Kokkos develop, and tested with ArborX to confirm that autotuning occupancy for the DBSCAN benchmark worked. In tests on a system with a V100, the original benchmark, when iterated 600 times, took 119.064 seconds to run. During the tuning process (using simulated annealing), the runtime was 108.014 seconds. When using cached results, the runtime was 109.058 seconds. The converged occupancy value was 70. Here are the cached results from APEX autotuning: Input_1: name: kokkos.kernel_name id: 1 info.type: string info.category: categorical info.valueQuantity: unbounded info.candidates: unbounded num_bins: 0 Input_2: name: kokkos.kernel_type id: 2 info.type: string info.category: categorical info.valueQuantity: set info.candidates: [parallel_for,parallel_reduce,parallel_scan,parallel_copy] Output_3: name: ArborX::Experimental::HalfTraversal id: 3 info.type: int64 info.category: ratio info.valueQuantity: range info.candidates: lower: 5 upper: 100 step: 5 open upper: 0 open lower: 0 Context_0: Name: "[2:parallel_for,1:ArborX::Experimental::HalfTraversal,tree_node:default]" Converged: true Results: NumVars: 1 id: 3 value: 70 In manual experiments, the ArborX team determined that the optimal occupancy for this example was between 40 and 90, which gave a 10% improvement over the baseline default of 100. See arborx/ArborX#815 for details. One deviation from the branch that David had written: the occupancy range is [5-100], with a step size of 5. The original implementation in Kokkos used [1-100] with a step size of 1.
kokkos · Mar 11, 2024 · 8dba118 · 8dba118
1 parent 35ad698
commit 8dba118
Show file tree

Hide file tree

Showing 5 changed files with 705 additions and 11 deletions.
diff --git a/core/src/Kokkos_Parallel.hpp b/core/src/Kokkos_Parallel.hpp
@@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy,
                          const FunctorType& functor) {
   uint64_t kpID = 0;
 
-  ExecPolicy inner_policy = policy;
-  Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID);
+  /** Request a tuned policy from the tools subsystem */
+  const auto& response =
+      Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID);
+  const auto& inner_policy = response.policy;
 
   Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelFor<FunctorType, ExecPolicy> closure(functor, inner_policy);
@@ -349,8 +351,10 @@ template <class ExecutionPolicy, class FunctorType,
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor) {
   uint64_t kpID                = 0;
-  ExecutionPolicy inner_policy = policy;
-  Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
+  /** Request a tuned policy from the tools subsystem */
+  const auto& response =
+      Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID);
+  const auto& inner_policy = response.policy;
 
   Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelScan<FunctorType, ExecutionPolicy> closure(functor,

diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp
@@ -1492,9 +1492,11 @@ struct ParallelReduceAdaptor {
     using PassedReducerType = typename return_value_adapter::reducer_type;
     uint64_t kpID           = 0;
 
-    PolicyType inner_policy = policy;
-    Kokkos::Tools::Impl::begin_parallel_reduce<PassedReducerType>(
-        inner_policy, functor, label, kpID);
+    /** Request a tuned policy from the tools subsystem */
+    auto response = Kokkos::Tools::Impl::begin_parallel_reduce<
+        typename return_value_adapter::reducer_type>(policy, functor, label,
+                                                     kpID);
+    auto& inner_policy = response.policy;
 
     using ReducerSelector =
         Kokkos::Impl::if_c<std::is_same<InvalidType, PassedReducerType>::value,

diff --git a/core/src/Kokkos_Tuners.hpp b/core/src/Kokkos_Tuners.hpp
@@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t);
 VariableValue make_variable_value(size_t, double);
 SetOrRange make_candidate_range(double lower, double upper, double step,
                                 bool openLower, bool openUpper);
+SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step,
+                                bool openLower, bool openUpper);
 size_t get_new_context_id();
 void begin_context(size_t context_id);
 void end_context(size_t context_id);
@@ -419,10 +421,11 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
   template <typename ViableConfigurationCalculator, typename Functor,
             typename TagType, typename... Properties>
   TeamSizeTuner(const std::string& name,
-                Kokkos::TeamPolicy<Properties...>& policy,
+                const Kokkos::TeamPolicy<Properties...>& policy_in,
                 const Functor& functor, const TagType& tag,
                 ViableConfigurationCalculator calc) {
     using PolicyType           = Kokkos::TeamPolicy<Properties...>;
+    PolicyType policy(policy_in);
     auto initial_vector_length = policy.impl_vector_length();
     if (initial_vector_length < 1) {
       policy.impl_set_vector_length(1);
@@ -504,7 +507,8 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
   }
 
   template <typename... Properties>
-  void tune(Kokkos::TeamPolicy<Properties...>& policy) {
+  auto tune(const Kokkos::TeamPolicy<Properties...>& policy_in) {
+    Kokkos::TeamPolicy<Properties...> policy(policy_in);
     if (Kokkos::Tools::Experimental::have_tuning_tool()) {
       auto configuration = tuner.begin();
       auto team_size     = std::get<1>(configuration);
@@ -514,6 +518,117 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
         policy.impl_set_vector_length(vector_length);
       }
     }
+    return policy;
+  }
+  void end() {
+    if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+      tuner.end();
+    }
+  }
+
+  TunerType get_tuner() const { return tuner; }
+};
+namespace Impl {
+template <class T>
+struct tuning_type_for;
+
+template <>
+struct tuning_type_for<double> {
+  static constexpr Kokkos::Tools::Experimental::ValueType value =
+      Kokkos::Tools::Experimental::ValueType::kokkos_value_double;
+  static double get(
+      const Kokkos::Tools::Experimental::VariableValue& value_struct) {
+    return value_struct.value.double_value;
+  }
+};
+template <>
+struct tuning_type_for<int64_t> {
+  static constexpr Kokkos::Tools::Experimental::ValueType value =
+      Kokkos::Tools::Experimental::ValueType::kokkos_value_int64;
+  static int64_t get(
+      const Kokkos::Tools::Experimental::VariableValue& value_struct) {
+    return value_struct.value.int_value;
+  }
+};
+}  // namespace Impl
+template <class Bound>
+class SingleDimensionalRangeTuner {
+  size_t id;
+  size_t context;
+  using tuning_util = Impl::tuning_type_for<Bound>;
+
+  Bound default_value;
+
+ public:
+  SingleDimensionalRangeTuner() = default;
+  SingleDimensionalRangeTuner(
+      const std::string& name,
+      Kokkos::Tools::Experimental::StatisticalCategory category,
+      Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) {
+    default_value = default_val;
+    Kokkos::Tools::Experimental::VariableInfo info;
+    info.category   = category;
+    info.candidates = make_candidate_range(
+        static_cast<Bound>(lower), static_cast<Bound>(upper),
+        static_cast<Bound>(step), false, false);
+    info.valueQuantity =
+        Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range;
+    info.type = tuning_util::value;
+    id        = Kokkos::Tools::Experimental::declare_output_type(name, info);
+  }
+
+  Bound begin() {
+    context = Kokkos::Tools::Experimental::get_new_context_id();
+    Kokkos::Tools::Experimental::begin_context(context);
+    auto tuned_value =
+        Kokkos::Tools::Experimental::make_variable_value(id, default_value);
+    Kokkos::Tools::Experimental::request_output_values(context, 1,
+                                                       &tuned_value);
+    return tuning_util::get(tuned_value);
+  }
+
+  void end() { Kokkos::Tools::Experimental::end_context(context); }
+
+  template <typename Functor>
+  void with_tuned_value(Functor& func) {
+    func(begin());
+    end();
+  }
+};
+
+class RangePolicyOccupancyTuner {
+ private:
+  using TunerType = SingleDimensionalRangeTuner<int64_t>;
+  TunerType tuner;
+
+ public:
+  RangePolicyOccupancyTuner()        = default;
+  RangePolicyOccupancyTuner& operator=(const RangePolicyOccupancyTuner& other) =
+      default;
+  RangePolicyOccupancyTuner(const RangePolicyOccupancyTuner& other) = default;
+  RangePolicyOccupancyTuner& operator=(RangePolicyOccupancyTuner&& other) =
+      default;
+  RangePolicyOccupancyTuner(RangePolicyOccupancyTuner&& other) = default;
+  template <typename ViableConfigurationCalculator, typename Functor,
+            typename TagType, typename... Properties>
+  RangePolicyOccupancyTuner(const std::string& name,
+                            const Kokkos::RangePolicy<Properties...>&,
+                            const Functor&, const TagType&,
+                            ViableConfigurationCalculator)
+      : tuner(TunerType(name,
+                        Kokkos::Tools::Experimental::StatisticalCategory::
+                            kokkos_value_ratio,
+                        100, 5, 100, 5)) {}
+
+  template <typename... Properties>
+  auto tune(const Kokkos::RangePolicy<Properties...>& policy_in) {
+    Kokkos::RangePolicy<Properties...> policy(policy_in);
+    if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+      auto occupancy = tuner.begin();
+      policy.impl_set_desired_occupancy(
+          Kokkos::Experimental::DesiredOccupancy{static_cast<int>(occupancy)});
+    }
+    return policy;
   }
   void end() {
     if (Kokkos::Tools::Experimental::have_tuning_tool()) {
@@ -577,11 +692,13 @@ struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> {
     policy.impl_change_tile_size({std::get<Indices>(tuple)...});
   }
   template <typename... Properties>
-  void tune(Kokkos::MDRangePolicy<Properties...>& policy) {
+  auto tune(const Kokkos::MDRangePolicy<Properties...>& policy_in) {
+    Kokkos::MDRangePolicy<Properties...> policy(policy_in);
     if (Kokkos::Tools::Experimental::have_tuning_tool()) {
       auto configuration = tuner.begin();
       set_policy_tile(policy, configuration, std::make_index_sequence<rank>{});
     }
+    return policy;
   }
   void end() {
     if (Kokkos::Tools::Experimental::have_tuning_tool()) {