//
// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks
//
// Copyright (C) 2017 EJ Kreinar
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
#ifndef NNET_LAYER_H_
#define NNET_LAYER_H_
#include "nnet_common.h"
#include "hls_stream.h"
#include <math.h>
namespace nnet {
// Compile-time configuration contract for compute_layer().
// Users pass a specialization of this struct as the CONFIG_T template
// parameter; every member below must be present in that specialization.
// (typedef is kept over C++11 `using` for compatibility with older
// Vivado HLS toolchain defaults.)
struct layer_config
{
    // Layer dimensions: number of inputs and number of outputs.
    static const unsigned n_in = 10;
    static const unsigned n_out = 10;

    // Arithmetic types used for the biases, weights, and the
    // internal multiply/accumulate results.
    typedef float bias_t;
    typedef float weight_t;
    typedef float accum_t;

    // Synthesis / resource-reuse knobs.
    static const unsigned io_type = io_parallel;     // io_parallel or io_serial (from nnet_common.h)
    static const unsigned reuse_factor = 1;          // >1 time-multiplexes multipliers across cycles
    static const bool store_weights_in_bram = false; // presumably consumed by surrounding codegen — TODO confirm
    static const unsigned n_zeros = 0;               // number of zero-valued weights (weight compression)

    // partitioning arrays cyclically to go with roll factors?
};
// Fully-connected (dense) layer: res[j] = biases[j] + sum_i data[i] * weights[i*n_out + j].
//
// Template parameters:
//   data_T   - element type of the input array
//   res_T    - element type of the output array
//   CONFIG_T - a layer_config-style struct supplying sizes, internal types,
//              and synthesis knobs (io_type, reuse_factor, n_zeros)
//
// Arguments:
//   data    - n_in input values
//   res     - n_out output values (written)
//   weights - flattened [n_in x n_out] weight matrix, input-index major
//   biases  - n_out bias values
//
// NOTE(review): the `if (CONFIG_T::io_type == ...)` guards have compile-time
// constant conditions; the HLS pragmas inside them are lexically scoped, and
// this pattern relies on the tool honoring only the live branch — behavior
// may vary across Vivado HLS versions; confirm against the target toolchain.
template<class data_T, class res_T, typename CONFIG_T>
void compute_layer(
data_T data[CONFIG_T::n_in],
res_T res[CONFIG_T::n_out],
typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
typename CONFIG_T::bias_t biases[CONFIG_T::n_out])
{
data_T cache;
// Per-product scratch (one slot per weight) and per-output accumulators.
typename CONFIG_T::accum_t mult[CONFIG_T::n_in*CONFIG_T::n_out];
typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
#pragma HLS function_instantiate variable=weights,biases
if (CONFIG_T::io_type == io_parallel){
// For parallel inputs:
//   - completely partition arrays -- target fabric
//   - if we have an unroll factor, limit number of multipliers
#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
// #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
#pragma HLS ARRAY_PARTITION variable=biases complete
#pragma HLS ARRAY_PARTITION variable=mult complete
#pragma HLS ARRAY_PARTITION variable=acc complete
// Multiplier budget: total products per reuse interval, minus the products
// known to be zero (n_zeros) that the tool can optimize away.
int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor));
#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
} else if (CONFIG_T::io_type == io_serial){
// For serial inputs: stream intermediates through a dataflow pipeline.
#pragma HLS ARRAY_RESHAPE variable=weights complete dim=1
#pragma HLS ARRAY_PARTITION variable=mult complete dim=1
#pragma HLS ARRAY_PARTITION variable=acc complete dim=1
#pragma HLS DATAFLOW
#pragma HLS STREAM variable=mult depth=1
#pragma HLS STREAM variable=acc depth=1
}
// Do the matrix-multiply: stage every product data[ii]*weights[ii*n_out+jj]
// into mult[] before any accumulation happens.
Product1: for(int ii = 0; ii < CONFIG_T::n_in; ii++) {
if (CONFIG_T::io_type == io_serial){
#pragma HLS PIPELINE
}
cache = data[ii];
Product2: for(int jj = 0; jj < CONFIG_T::n_out; jj++) {
if (CONFIG_T::io_type == io_serial) {
// Serial mode: only n_out products are in flight per input, so the
// multiplier budget is per-row rather than for the whole matrix.
int multiplier_limit = ceil(float(CONFIG_T::n_out) / float(CONFIG_T::reuse_factor));
#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
}
int index = ii*CONFIG_T::n_out+jj;
mult[index] = cache * weights[index];
}
}
// Initialize accumulator with input biases
ResetAccum: for(int iacc = 0; iacc < CONFIG_T::n_out; iacc++) {
if (CONFIG_T::io_type == io_serial){
#pragma HLS UNROLL
}
acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc];
}
// Accumulate multiplication result: acc[jj] gathers the jj-th product of
// every input row.
Accum1: for(int ii = 0; ii < CONFIG_T::n_in; ii++) {
if (CONFIG_T::io_type == io_serial){
#pragma HLS PIPELINE
}
Accum2: for(int jj = 0; jj < CONFIG_T::n_out; jj++) {
int index = ii*CONFIG_T::n_out+jj;
acc[jj] += mult[index];
}
}
// Cast to "res_t" type and write the outputs.
Result: for(int ires = 0; ires < CONFIG_T::n_out; ires++){
if (CONFIG_T::io_type == io_serial){
#pragma HLS UNROLL
}
res[ires] = (res_T) (acc[ires]);
}
}
}
#endif