-
Notifications
You must be signed in to change notification settings - Fork 0
/
mdrange_gemm_occupancy.cpp
74 lines (72 loc) · 2.55 KB
/
mdrange_gemm_occupancy.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
* idk_just_multiply_matrices
*
* Complexity: high
*
* Tuning problem:
*
* This is a *nested* tuning problem, with some complexity in the
* inner tuning problems.
*
* This simulates a user who doesn't know Kokkos very well and who
* asks Kokkos to decide whether to do a matmul using a TeamPolicy
* or an MDRangePolicy; the user expresses no preference.
*
* If you pick an MDRangePolicy, that involves tuning a tile size,
* as referenced in the "deep_copy" benchmark.
*
* If you pick a TeamPolicy, that involves tuning a "team_size"
* and "vector_length," constructs that shape the amount of
* parallelism in different levels of Kokkos.
*
* The "fastest_of" construct exposes a categorical choice among
* implementations. Note that the tuning interface doesn't really
* tell you that you're in a nested context; you'll just see:
*
* begin_context(fastest_of_context_id)
* request_values(which_implementation_should_i_use)
* [suppose you say "TeamPolicy"]
* begin_context(team_policy_tuner_id)
* request_values(team_size, vector_length)
* end_context(team_policy_tuner_id)
* end_context(fastest_of_context_id)
*
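* This is an extremely difficult problem.
*
* Note that this currently involves no features.
*
* NOTE(review): the description above talks about choosing between a
* TeamPolicy and an MDRangePolicy, but the code in this file only
* launches an MDRangePolicy kernel with a
* DesiredOccupancy{Kokkos::AUTO} preference. Confirm whether this
* header was carried over from another benchmark and needs updating.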
*
*/
#include <tuning_playground.hpp>
#include <chrono>
#include <cmath> // cbrt
#include <cstdlib>
#include <iostream>
#include <random>
#include <tuple>
int main(int argc, char *argv[]) {
constexpr const int data_size = 1000;
using view_type =
Kokkos::View<float **, Kokkos::DefaultExecutionSpace::memory_space>;
tuned_kernel(
argc, argv,
[&](const int total_iters) {
view_type left("left_inp", data_size, data_size);
view_type right("right_inp", data_size, data_size);
view_type output("output", data_size, data_size);
return std::make_tuple(left, right, output);
},
[&](const int data_size, const int total_iters,
view_type left, view_type right, view_type output) {
Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace,
Kokkos::Rank<2>> p(
{0, 0}, {data_size, data_size});
auto const p_occ = Kokkos::Experimental::prefer(
p, Kokkos::Experimental::DesiredOccupancy{Kokkos::AUTO});
Kokkos::parallel_for("mdrange_gemm", p_occ,
KOKKOS_LAMBDA(const int x, const int y) {
for (int z = 0; z < data_size; ++z) {
output(x, y) += left(x, z) * right(z, y);
}
});
});
}