-
Notifications
You must be signed in to change notification settings - Fork 0
/
idk_jmm.cpp
101 lines (98 loc) · 3.5 KB
/
idk_jmm.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/**
* idk_just_multiply_matrices
*
* Complexity: high
*
* Tuning problem:
*
* This is a *nested* tuning problem, with some complexity in the
* inner tuning problems.
*
* This simulates a user who doesn't know Kokkos very well and
* asks Kokkos to decide whether to do a matmul using a TeamPolicy
* or an MDRangePolicy; the user expresses no preference.
*
* If you pick an MDRangePolicy, that involves tuning a tile size,
* as referenced in the "deep_copy" benchmark.
*
* If you pick a TeamPolicy, that involves tuning a "team_size"
* and "vector_length," constructs that shape the amount of
* parallelism in different levels of Kokkos.
*
* The "fastest_of" construct exposes a categorical choice among
* implementations. Note that the tuning interface doesn't explicitly
* tell you that you're in a nested context; you'll just see a
* sequence of calls like:
*
* begin_context(fastest_of_context_id)
* request_values(which_implementation_should_i_use)
* [suppose you say "TeamPolicy"]
* begin_context(team_policy_tuner_id)
* request_values(team_size, vector_length)
* end_context(team_policy_tuner_id)
* end_context(fastest_of_context_id)
*
* This is an extremely difficult problem.
*
* Note that this currently involves no features.
*
*/
#include <tuning_playground.hpp>
#include <chrono>
#include <cmath> // cbrt
#include <cstdlib>
#include <iostream>
#include <random>
#include <tuple>
/**
 * Benchmark driver: repeatedly multiplies two data_size x data_size
 * matrices, letting fastest_of() choose between two deliberately naive
 * implementations on every iteration:
 *
 *   1. a TeamPolicy kernel (one league entry per output element, with
 *      team size and vector length left as Kokkos::AUTO), and
 *   2. an MDRangePolicy kernel (one work item per output element, with
 *      no tile size specified).
 *
 * Both candidates overwrite `output` with left * right, so the result
 * does not depend on which candidate ran on earlier iterations.
 *
 * Note: this function never fills `left` or `right`, so the numeric
 * contents of `output` are not meaningful; only the timing observed by
 * the tuning tool is.
 *
 * @param argc argument count, forwarded to Kokkos::initialize
 * @param argv argument vector, forwarded to Kokkos::initialize
 * @return 0 (implicit) after Kokkos has been finalized
 */
int main(int argc, char *argv[]) {
  constexpr const int data_size = 1024;
  using view_type =
      Kokkos::View<float **, Kokkos::DefaultExecutionSpace::memory_space>;
  Kokkos::initialize(argc, argv);
  {
    // Views live in this inner scope so they are destroyed before
    // Kokkos::finalize() is called below.
    Kokkos::print_configuration(std::cout, false);
    view_type left("left_inp", data_size, data_size);
    view_type right("right_inp", data_size, data_size);
    view_type output("output", data_size, data_size);
    for (int iteration = 0; iteration < Impl::max_iterations; ++iteration) {
      fastest_of(
          "bad_gemms",
          // Candidate 1: TeamPolicy with one league entry per output element.
          [&]() {
            // std::cout << iteration << " Doing team gemm..." << std::endl;
            using team_policy =
                Kokkos::TeamPolicy<Kokkos::DefaultExecutionSpace>;
            using team_member = team_policy::member_type;
            Kokkos::parallel_for(
                "bad_team_gemm",
                team_policy(data_size * data_size, Kokkos::AUTO,
                            Kokkos::AUTO),
                KOKKOS_LAMBDA(const team_member &member) {
                  // Decode the flat league rank into an output coordinate.
                  const int index = member.league_rank();
                  const int x = index % data_size;
                  const int y = index / data_size;
                  float sum = 0;
                  // Dot product of row x of `left` with column y of `right`.
                  // The index is taken by value and named `k` so it neither
                  // exposes the loop counter for mutation nor shadows the
                  // outer iteration variable.
                  Kokkos::parallel_reduce(
                      Kokkos::ThreadVectorRange(member, data_size),
                      [&](const int k, float &lsum) {
                        lsum += left(x, k) * right(k, y);
                      },
                      sum);
                  // NOTE(review): there is no Kokkos::single guard here, so
                  // presumably every thread running this body stores the
                  // same value -- redundant, but left as-is for this
                  // intentionally naive kernel.
                  output(x, y) = sum;
                });
          },
          // Candidate 2: MDRangePolicy with one work item per output element.
          [&]() {
            // std::cout << iteration << " Doing mdrange gemm..." << std::endl;
            Kokkos::parallel_for(
                "bad_mdrange_gemm",
                Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace,
                                      Kokkos::Rank<2>>(
                    {0, 0}, {data_size, data_size}),
                KOKKOS_LAMBDA(const int x, const int y) {
                  // Accumulate locally and assign once. Using `+=` directly
                  // on output(x, y) would add onto whatever a previous
                  // iteration (or the team candidate) left behind.
                  float sum = 0;
                  for (int z = 0; z < data_size; ++z) {
                    sum += left(x, z) * right(z, y);
                  }
                  output(x, y) = sum;
                });
          });
    }
  }
  Kokkos::finalize();
}