-
Notifications
You must be signed in to change notification settings - Fork 0
/
task_fir.h
111 lines (94 loc) · 3.07 KB
/
task_fir.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
//
// Created by user on 12/12/22.
//
#ifndef NEON_INTRINSICS_EXERCISE_TASK_FIR_H
#define NEON_INTRINSICS_EXERCISE_TASK_FIR_H
#include <array>
#include <cstddef>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
// Bundle of buffers and lengths consumed by the applyFirFilter* routines.
//
// All fields are assumed to be correctly initialized by the caller; none of the
// filter routines validates them. The routines read x[i + j] for every
// i < outputLength and j < filterLength, so x must provide at least
// outputLength + filterLength - 1 readable samples (hence the appended zeros).
struct FilterInput {
  const float* x;            // input signal with (N_h - 1) zeros appended
  std::size_t inputLength;   // N_x
  const float* c;            // reversed filter coefficients
  std::size_t filterLength;  // N_h
  float* y;                  // output (filtered) signal;
                             // pointer to preallocated, uninitialized memory
  std::size_t outputLength;  // should be N_x in our context
};
// Scalar reference FIR filter.
//
// Computes y[i] = sum over j in [0, filterLength) of x[i + j] * c[j] for every
// i in [0, outputLength). With filterLength == 0 every output is 0.
//
// Preconditions (not checked):
//   - input.x holds at least outputLength + filterLength - 1 samples,
//   - input.c holds filterLength coefficients,
//   - input.y has room for outputLength samples.
//
// Returns input.y.
//
// Declared inline because this definition lives in a header; without it,
// including the header from more than one translation unit fails at link time.
inline float* applyFirFilterSingle(FilterInput& input) {
  const float* const x = input.x;
  const float* const c = input.c;
  float* const y = input.y;
  const std::size_t outputLength = input.outputLength;
  const std::size_t filterLength = input.filterLength;

  // size_t counters match the type of the lengths, so an index can never wrap
  // around before reaching a length above UINT_MAX.
  for (std::size_t i = 0; i < outputLength; ++i) {
    // Accumulate locally and store once instead of read-modify-writing y[i].
    float acc = 0.f;
    for (std::size_t j = 0; j < filterLength; ++j) {
      acc += x[i + j] * c[j];
    }
    y[i] = acc;
  }
  return y;
}
#ifdef __aarch64__
// FIR filter with the inner (filter-tap) loop vectorized using NEON.
//
// For each output sample, four taps at a time are multiplied and accumulated
// lane-wise, then the four lanes are summed horizontally. Taps left over when
// filterLength is not a multiple of 4 are processed by a scalar loop, so no
// load ever goes past c[filterLength - 1] or x[i + filterLength - 1].
//
// Preconditions (not checked):
//   - input.x holds at least outputLength + filterLength - 1 samples,
//   - input.c holds filterLength coefficients,
//   - input.y has room for outputLength samples.
//
// Returns input.y.
//
// Declared inline because this definition lives in a header.
inline float* applyFirFilterInnerLoopVectorizationARM(FilterInput& input) {
  const float* const x = input.x;
  const float* const c = input.c;
  float* const y = input.y;
  const std::size_t outputLength = input.outputLength;
  const std::size_t filterLength = input.filterLength;
  // Largest multiple of 4 that does not exceed filterLength.
  const std::size_t vectorizedTaps = filterLength - (filterLength % 4);

  for (std::size_t i = 0; i < outputLength; ++i) {
    float32x4_t accChunk = vdupq_n_f32(0.0f);
    for (std::size_t j = 0; j < vectorizedTaps; j += 4) {
      const float32x4_t xChunk = vld1q_f32(x + i + j);
      const float32x4_t cChunk = vld1q_f32(c + j);
      accChunk = vaddq_f32(accChunk, vmulq_f32(xChunk, cChunk));
    }

    // Horizontal sum of the four partial sums.
    float acc = vaddvq_f32(accChunk);

    // Scalar tail: the 0..3 taps that do not fill a whole vector.
    for (std::size_t j = vectorizedTaps; j < filterLength; ++j) {
      acc += x[i + j] * c[j];
    }
    y[i] = acc;
  }
  return y;
}
// FIR filter with the outer (output-sample) loop vectorized using NEON.
//
// Four consecutive output samples are computed at once: for each tap j the
// vector x[i + j .. i + j + 3] is scaled by c[j] and added to the running
// vector of four outputs. Outputs left over when outputLength is not a multiple
// of 4 are computed by a scalar loop, so no store goes past
// y[outputLength - 1] and no load goes past x[outputLength + filterLength - 2].
//
// Preconditions (not checked):
//   - input.x holds at least outputLength + filterLength - 1 samples,
//   - input.c holds filterLength coefficients,
//   - input.y has room for outputLength samples.
//
// Returns input.y.
//
// Declared inline because this definition lives in a header.
inline float* applyFirFilterOuterLoopVectorizationARM(FilterInput& input) {
  const float* const x = input.x;
  const float* const c = input.c;
  float* const y = input.y;
  const std::size_t outputLength = input.outputLength;
  const std::size_t filterLength = input.filterLength;
  // Largest multiple of 4 that does not exceed outputLength.
  const std::size_t vectorizedOutputs = outputLength - (outputLength % 4);

  // Note the increment by 4: each iteration produces four outputs.
  for (std::size_t i = 0; i < vectorizedOutputs; i += 4) {
    float32x4_t yChunk = vdupq_n_f32(0.0f);
    for (std::size_t j = 0; j < filterLength; ++j) {
      const float32x4_t xChunk = vld1q_f32(x + i + j);
      yChunk = vaddq_f32(yChunk, vmulq_n_f32(xChunk, c[j]));
    }
    // Store the four finished outputs to memory.
    vst1q_f32(y + i, yChunk);
  }

  // Scalar tail: the 0..3 outputs that do not fill a whole vector.
  for (std::size_t i = vectorizedOutputs; i < outputLength; ++i) {
    float acc = 0.f;
    for (std::size_t j = 0; j < filterLength; ++j) {
      acc += x[i + j] * c[j];
    }
    y[i] = acc;
  }
  return y;
}
// FIR filter with both the outer (output) and inner (tap) loops vectorized.
//
// Outputs are processed in blocks of 4. Within a block, each group of four
// coefficients is loaded once and reused for all four outputs of the block;
// every output keeps its own vector of four partial sums, which is reduced
// horizontally at the end.
//
// Leftovers are handled explicitly so no access leaves the buffers:
//   - taps beyond the last multiple of 4 are added by a scalar loop,
//   - outputs beyond the last full block are computed one at a time.
//
// Preconditions (not checked):
//   - input.x holds at least outputLength + filterLength - 1 samples,
//   - input.c holds filterLength coefficients,
//   - input.y has room for outputLength samples.
//
// Returns input.y.
//
// Declared inline because this definition lives in a header.
inline float* applyFirFilterOuterInnerLoopVectorizationARM(FilterInput& input)
{
  const float* const x = input.x;
  const float* const c = input.c;
  float* const y = input.y;
  const std::size_t outputLength = input.outputLength;
  const std::size_t filterLength = input.filterLength;

  constexpr std::size_t K = 4;      // outputs computed per block
  constexpr std::size_t kLanes = 4; // floats per float32x4_t

  // Largest multiples of the block/vector width that fit in the lengths.
  const std::size_t blockedOutputs = outputLength - (outputLength % K);
  const std::size_t vectorizedTaps = filterLength - (filterLength % kLanes);

  std::array<float32x4_t, K> outChunk{};

  for (std::size_t i = 0; i < blockedOutputs; i += K) {
    for (std::size_t k = 0; k < K; ++k) {
      outChunk[k] = vdupq_n_f32(0.0f);
    }

    for (std::size_t j = 0; j < vectorizedTaps; j += kLanes) {
      // One coefficient load serves all K outputs of the block.
      const float32x4_t cChunk = vld1q_f32(c + j);
      for (std::size_t k = 0; k < K; ++k) {
        const float32x4_t xChunk = vld1q_f32(x + i + j + k);
        outChunk[k] = vaddq_f32(vmulq_f32(cChunk, xChunk), outChunk[k]);
      }
    }

    for (std::size_t k = 0; k < K; ++k) {
      float acc = vaddvq_f32(outChunk[k]);
      // Scalar tail over the 0..3 taps that do not fill a whole vector.
      for (std::size_t j = vectorizedTaps; j < filterLength; ++j) {
        acc += x[i + k + j] * c[j];
      }
      y[i + k] = acc;
    }
  }

  // Remaining 0..3 outputs that do not fill a whole block: same arithmetic,
  // one output at a time.
  for (std::size_t i = blockedOutputs; i < outputLength; ++i) {
    float32x4_t accChunk = vdupq_n_f32(0.0f);
    for (std::size_t j = 0; j < vectorizedTaps; j += kLanes) {
      const float32x4_t cChunk = vld1q_f32(c + j);
      const float32x4_t xChunk = vld1q_f32(x + i + j);
      accChunk = vaddq_f32(vmulq_f32(cChunk, xChunk), accChunk);
    }
    float acc = vaddvq_f32(accChunk);
    for (std::size_t j = vectorizedTaps; j < filterLength; ++j) {
      acc += x[i + j] * c[j];
    }
    y[i] = acc;
  }
  return y;
}
#endif
#endif//NEON_INTRINSICS_EXERCISE_TASK_FIR_H