/
uint16_neon.h
173 lines (143 loc) · 4.88 KB
/
uint16_neon.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include <arm_neon.h>
#include <cstdint>
namespace NEO {
// Sixteen unsigned 16-bit lanes stored as a pair of 128-bit NEON vectors.
// Lanes 0-7 are kept in value.val[0] and lanes 8-15 in value.val[1].
struct uint16x16_t {
    enum { numChannels = 16 };

    uint16x8x2_t value;

    // Default construction sets every lane to zero.
    uint16x16_t() {
        value.val[0] = vdupq_n_u16(0);
        value.val[1] = vdupq_n_u16(0);
    }

    // Assembles the vector from its halves: lo -> lanes 0-7, hi -> lanes 8-15.
    uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
        value.val[0] = lo;
        value.val[1] = hi;
    }

    // Broadcasts a into all sixteen lanes.
    // Kept implicit so callers that rely on scalar -> vector conversion still compile.
    uint16x16_t(uint16_t a) {
        value.val[0] = vdupq_n_u16(a);
        value.val[1] = vdupq_n_u16(a);
    }

    // Constructs the vector by loading sixteen lanes from alignedPtr (see load()).
    explicit uint16x16_t(const void *alignedPtr) {
        load(alignedPtr);
    }

    // Returns the lane at index element (0..numChannels-1).
    // An out-of-range index triggers DEBUG_BREAK_IF and, if execution continues,
    // yields 0 instead of an indeterminate value.
    inline uint16_t get(unsigned int element) const {
        DEBUG_BREAK_IF(element >= numChannels);
        if (element >= static_cast<unsigned int>(numChannels)) {
            return 0u;
        }

        // Pick the half that holds the lane, then extract within that half.
        const uint16x8_t half = value.val[element / 8u];

        // vgetq_lane requires constant immediate
        switch (element % 8u) {
        case 0:
            return vgetq_lane_u16(half, 0);
        case 1:
            return vgetq_lane_u16(half, 1);
        case 2:
            return vgetq_lane_u16(half, 2);
        case 3:
            return vgetq_lane_u16(half, 3);
        case 4:
            return vgetq_lane_u16(half, 4);
        case 5:
            return vgetq_lane_u16(half, 5);
        case 6:
            return vgetq_lane_u16(half, 6);
        default: // only 7 remains after the modulo
            return vgetq_lane_u16(half, 7);
        }
    }

    // Vector with every lane equal to 0.
    static inline uint16x16_t zero() {
        return uint16x16_t(static_cast<uint16_t>(0u));
    }

    // Vector with every lane equal to 1.
    static inline uint16x16_t one() {
        return uint16x16_t(static_cast<uint16_t>(1u));
    }

    // Vector with every lane equal to 0xffff (all bits set).
    static inline uint16x16_t mask() {
        return uint16x16_t(static_cast<uint16_t>(0xffffu));
    }

    // Loads sixteen consecutive uint16_t values from alignedPtr into the lanes.
    // DEBUG_BREAK_IF fires when isAligned<32>(alignedPtr) is false.
    inline void load(const void *alignedPtr) {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
    }

    // Stores the sixteen lanes as consecutive uint16_t values at alignedPtr.
    // DEBUG_BREAK_IF fires when isAligned<32>(alignedPtr) is false.
    inline void store(void *alignedPtr) const {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
    }

    // True when at least one bit in any lane is set.
    inline operator bool() const {
        const uint64x2_t lo = vreinterpretq_u64_u16(value.val[0]);
        const uint64x2_t hi = vreinterpretq_u64_u16(value.val[1]);
        // OR both halves together, then fold the remaining 128 bits down to 64.
        const uint64x2_t merged = vorrq_u64(lo, hi);
        const uint64_t folded = vget_lane_u64(vorr_u64(vget_high_u64(merged), vget_low_u64(merged)), 0);
        return folded != 0u;
    }

    // Lane-wise subtraction of a from this vector.
    inline uint16x16_t &operator-=(const uint16x16_t &a) {
        value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);
        return *this;
    }

    // Lane-wise addition of a to this vector.
    inline uint16x16_t &operator+=(const uint16x16_t &a) {
        value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);
        return *this;
    }

    // Lane-wise unsigned comparison: a lane is 0xffff where a >= b and 0 otherwise.
    inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
        // vcgeq_u16(a, b) yields the same lanes as inverting vcgtq_u16(b, a).
        return uint16x16_t(vcgeq_u16(a.value.val[0], b.value.val[0]),
                           vcgeq_u16(a.value.val[1], b.value.val[1]));
    }

    // Bitwise AND of the two operands, lane by lane.
    // NOTE: being an overloaded operator, both operands are always evaluated.
    inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
        return uint16x16_t(vandq_u16(a.value.val[0], b.value.val[0]),
                           vandq_u16(a.value.val[1], b.value.val[1]));
    }

    // NOTE: uint16x16_t::blend behaves like mask ? a : b
    // The selection is bitwise: set bits in mask take a, clear bits take b.
    inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
        return uint16x16_t(vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]),
                           vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]));
    }
};
} // namespace NEO