//
// stencil_chain_generator.cpp
//
#include "Halide.h"
namespace {
// Generates a pipeline that applies a configurable-length chain of 5x5
// weighted stencils to a 2D uint16 image, with hand-written GPU and CPU
// schedules plus estimates for the auto-scheduler.
class StencilChain : public Halide::Generator<StencilChain> {
public:
    // Number of chained stencil stages (default 32, legal range 1..100).
    GeneratorParam<int> stencils{"stencils", 32, 1, 100};

    Input<Buffer<uint16_t>> input{"input", 2};
    Output<Buffer<uint16_t>> output{"output", 2};

    void generate() {
        // stages[0] is the boundary-conditioned input; stages[k] is the
        // result of applying k stencils.
        std::vector<Func> stages;

        Var x("x"), y("y");

        // Clamp-to-edge so the 5x5 taps below can safely read out of bounds.
        Func f = Halide::BoundaryConditions::repeat_edge(input);

        stages.push_back(f);

        for (int s = 0; s < (int)stencils; s++) {
            // NOTE(review): this Func deliberately shadows the outer `f`;
            // each iteration builds a fresh stage over the previous one.
            Func f("stage_" + std::to_string(s));
            Expr e = cast<uint16_t>(0);
            // 5x5 stencil with separable-looking integer weights
            // (i+3)*(j+3) for i,j in [-2, 2]. The weights are C++ ints
            // computed at generator time, small enough to coerce to uint16.
            for (int i = -2; i <= 2; i++) {
                for (int j = -2; j <= 2; j++) {
                    e += ((i + 3) * (j + 3)) * stages.back()(x + i, y + j);
                }
            }
            f(x, y) = e;
            stages.push_back(f);
        }

        // The last stage defines the pipeline output.
        output(x, y) = stages.back()(x, y);

        /* ESTIMATES */
        // (This can be useful in conjunction with RunGen and benchmarks as well
        // as auto-schedule, so we do it in all cases.)
        {
            const int width = 1536;
            const int height = 2560;
            // Provide estimates on the input image
            input.set_estimates({{0, width}, {0, height}});
            // Provide estimates on the pipeline output
            output.set_estimates({{0, width}, {0, height}});
        }

        if (auto_schedule) {
            // nothing
        } else if (get_target().has_gpu_feature()) {
            // GPU schedule

            // 2.9 ms on a 2060 RTX

            // It seems that just compute-rooting all the stencils is
            // fastest on this GPU, plus some unrolling and aggressive
            // staging to share loads between adjacent pixels.
            Var xi, yi, xii, yii;

            // Replace the final Func in `stages` with `output`, so the
            // second-to-last stage is inlined into the output definition
            // and the loop below schedules the output like any other stage.
            stages.pop_back();  // Inline the second-last stage into the output
            stages.push_back(output);
            for (size_t i = 1; i < stages.size(); i++) {
                Func &s = stages[i];
                Func prev = stages[i - 1];
                // Rebind x/y to this stage's own pure Vars — `output` may
                // use different Var objects than the intermediate stages.
                x = s.args()[0];
                y = s.args()[1];
                // One GPU block per 60x12 tile of threads, each thread
                // computing an unrolled 2x2 sub-tile of pixels.
                s.compute_root()
                    .gpu_tile(x, y, xi, yi, 30 * 2, 12)
                    .tile(xi, yi, xii, yii, 2, 2)
                    .unroll(xii)
                    .unroll(yii);
                // Pre-load the entire region required of the previous
                // stage into shared memory by adding a wrapper Func
                // and scheduling it at blocks. This way instead of
                // every pixel doing 25 loads from global memory, many of
                // which overlap, we load each unique value from
                // global into shared once, and then we use faster
                // loads from shared in the actual stencil.
                prev.in()
                    .compute_at(s, x)
                    .tile(prev.args()[0], prev.args()[1], xi, yi, 2, 2)
                    .vectorize(xi)
                    .unroll(yi)
                    .gpu_threads(prev.args()[0], prev.args()[1]);
                // A similar benefit applies for the
                // vectorized/unrolled 2x2 tiles. Instead of having
                // each unrolled iteration do its own mix of scalar
                // and vector loads from shared memory in a 5x5
                // window, many of which get deduped across the block,
                // we load a 6x6 window of shared into registers using
                // only aligned vector loads, and then the actual
                // stencil pulls from those registers. We're adding
                // another wrapper Func around the wrapper Func we
                // created above, so we say .in().in()
                prev.in()
                    .in()
                    .compute_at(s, xi)
                    .vectorize(prev.args()[0], 2)
                    .unroll(prev.args()[0])
                    .unroll(prev.args()[1]);
            }
        } else {
            // CPU schedule

            // 4.23ms on an Intel i9-9960X using 16 threads at 3.5
            // GHz.

            // Runtime is pretty noisy, so benchmarked over 1000
            // trials instead of the default of 10 in the
            // Makefile. This uses AVX-512 instructions, but not
            // floating-point ones. My CPU seems to hover at 3.5GHz on
            // this workload.

            const int vec = natural_vector_size<uint16_t>();

            // How many stencils in between each compute-root
            const int group_size = 11;
            Var yi, yo, xo, xi, t;
            const int last_stage_idx = (int)stages.size() - 1;
            // Walk backwards from the output, compute-rooting every
            // group_size-th stage and folding the rest into it.
            for (int j = last_stage_idx; j > 0; j -= group_size) {
                Func out = (j == last_stage_idx) ? output : stages[j];

                const int stages_to_output = last_stage_idx - j;
                // Each stencil between here and the output grows the
                // required region by 2 pixels per side, hence 4 per axis.
                const int expansion = 4 * stages_to_output;
                // NOTE(review): tile sizes assume the 1536x2560 estimate
                // above; other output sizes still work but tile less evenly.
                const int w = 1536 + expansion;
                const int h = 2560 + expansion;

                out.compute_root()
                    // Break into 16 tiles for our 16 threads
                    .tile(x, y, xo, yo, xi, yi, w / 4, h / 4)
                    .fuse(xo, yo, t)
                    .parallel(t)
                    .vectorize(xi, vec);

                // Intermediate stages in this group: storage folded at the
                // tile level, computed per row of the consumer's tile.
                for (int i = std::max(0, j - group_size + 1); i < j; i++) {
                    Func s = stages[i];
                    s.store_at(out, t)
                        .compute_at(out, yi)
                        .vectorize(s.args()[0], vec);
                }
            }
        }
    }
};
} // namespace
// Register the generator with the Halide build tooling under the
// command-line name "stencil_chain".
HALIDE_REGISTER_GENERATOR(StencilChain, stencil_chain)