-
Notifications
You must be signed in to change notification settings - Fork 2
/
bench_merklize.hpp
72 lines (59 loc) · 2.18 KB
/
bench_merklize.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#pragma once
#include "merklize.hpp"
#include <cassert>
#include <cstring>
#include <memory>
#include <new>
#include <random>
// Benchmarks one merklization run on the device associated with `q` and
// reports how long each of the three phases took.
//
// Parameters:
//   q        - SYCL queue used for every allocation, copy and kernel dispatch
//   leaf_cnt - number of leaf nodes to merklize; asserted to be >= 2^20
//   wg_size  - work-group size, forwarded unchanged to `merklize`
//   ts       - output array with room for at least 3 elements, receiving
//                ts[0] : host -> device input transfer time
//                ts[1] : total kernel execution time returned by `merklize`
//                ts[2] : device -> host output transfer time
//              (all in nanoseconds, as reported by `time_event` / `merklize`)
//
// Throws std::bad_alloc when any of the USM allocations fails. All buffers are
// owned by RAII handles, so they are released on every exit path, including
// when a SYCL call throws.
//
// NOTE(review): this function is defined (not just declared) in a header
// without `inline`; including the header from more than one translation unit
// would produce duplicate definitions - confirm it is only included once.
void
benchmark_merklize(sycl::queue& q,
                   size_t leaf_cnt,
                   size_t wg_size,
                   sycl::cl_ulong* const ts)
{
  // this implementation is only helpful when a relatively large number of
  // leaf nodes are required to be merklized
  assert(leaf_cnt >= (1 << 20));

  // 32 bytes per node, for both input (leaves) and output (intermediates)
  const size_t i_size = leaf_cnt << 5;
  const size_t o_size = leaf_cnt << 5;

  // USM memory must be returned through sycl::free with the owning queue, so
  // wrap that call in a deleter; unique_ptr never invokes it on a null pointer
  const auto usm_free = [&q](sycl::uint* const ptr) { sycl::free(ptr, q); };
  using usm_ptr = std::unique_ptr<sycl::uint, decltype(usm_free)>;

  // allocate resources
  const usm_ptr i_h{ static_cast<sycl::uint*>(sycl::malloc_host(i_size, q)),
                     usm_free };
  const usm_ptr o_h{ static_cast<sycl::uint*>(sycl::malloc_host(o_size, q)),
                     usm_free };
  const usm_ptr i_d{ static_cast<sycl::uint*>(sycl::malloc_device(i_size, q)),
                     usm_free };
  const usm_ptr o_d{ static_cast<sycl::uint*>(sycl::malloc_device(o_size, q)),
                     usm_free };

  // refuse to continue with a failed allocation instead of dereferencing null
  if (!i_h || !o_h || !i_d || !o_d) {
    throw std::bad_alloc();
  }

  // Set all intermediate nodes to zero bytes,
  //
  // this fact is used later to assert that the first 32 bytes are never
  // touched by any work-item
  q.memset(o_d.get(), 0, o_size).wait();

  {
    // Pick a single random byte value and replicate it over the whole input.
    //
    // uniform_int_distribution is not specified for 8-bit integer types, so
    // draw from a wider type constrained to the range of a byte.
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<unsigned int> dis(0u, 255u);

    std::memset(i_h.get(), static_cast<int>(dis(gen)), i_size);
  }

  // copy input from host to device and time that transfer command
  sycl::event evt_0 = q.memcpy(i_d.get(), i_h.get(), i_size);
  evt_0.wait();
  const sycl::cl_ulong ts_0 = time_event(evt_0);

  // merklization, get sum of all dispatched kernel execution time
  const sycl::cl_ulong ts_1 = merklize(
    q, i_d.get(), i_size, leaf_cnt, o_d.get(), o_size, leaf_cnt - 1, wg_size);

  // copy output from device to host and time that transfer command
  sycl::event evt_1 = q.memcpy(o_h.get(), o_d.get(), o_size);
  evt_1.wait();
  const sycl::cl_ulong ts_2 = time_event(evt_1);

  // ensuring that first 32 bytes are never touched by any work-item
  for (size_t i = 0; i < (blake3::OUT_LEN >> 2); i++) {
    assert(o_h.get()[i] == 0);
  }

  // all time in nanosecond level granularity
  ts[0] = ts_0; // host to device data transfer time
  ts[1] = ts_1; // total kernel execution cost
  ts[2] = ts_2; // device to host data transfer time

  // all four USM buffers are released here by their unique_ptr owners
}