/
Generator.py
398 lines (358 loc) · 16.8 KB
/
Generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
from texttable import Texttable
from math import ceil
import sys
import numpy as np
from utils import export_tensor
import copy
from collections import OrderedDict
class MVUConfig():
def __init__(self, prec, meminfo, quantIdx, ilength, ijump, wlength, wjump, countdown, olength):
super(MVUConfig, self).__init__()
self.prec = prec
self.meminfo = meminfo
self.quantIdx = quantIdx
self.ilength = ilength
self.ijump = ijump
self.wlength = wlength
self.wjump = wjump
self.countdown = countdown
self.olength = olength
class Generator():
"""docstring for Generator"""
def __init__(self, model, prec, input_shape, meminfo, quantIdx=2, temp_riscv_code_file="template.S"):
super(Generator, self).__init__()
# expecting to receive a OnnxModel parsed object
self.model = model
self.prec = prec
self.input_shape = copy.copy(input_shape)
self.temp_riscv_code_file = temp_riscv_code_file
self.meminfo = meminfo
self.quantIdx = quantIdx
self.loop_cnt = 0
self.mvu_res_output_base_addr = 1024
self.mvu_res_input_base_addr = 0
self.mvu_res_weight_base_addr = 0
def check_model_is_valid(self):
# check if there are residual connections
for layer in self.model.parsed_graph:
# import ipdb as pdb; pdb.set_trace()
if len(layer['input_node']) > 1 and (layer['node_type']!="Reshape"):
# import ipdb as pdb; pdb.set_trace()
print(" ==> Models with residual connections are not supported <==")
print(" ==> :( <==")
sys.exit()
# iprec: Input data precision
# wprec: Weight precision
# oprec: Output precision
# iW : Input width
# iH : Input height
# iC : Input channel blocks
# fW : Filter kernel width
# fH : Filter kernel height
# fC : Number of filter set blocks (i.e. number of output channel blocks)
# sW : Filter horizontal (width) stride
# Pl : Zero-padding in on the left in the width dimension
# Pr : Zero-padding in on the right in the width dimension
# Pt : Zero-padding in on the top in the height dimension
# Pb : Zero-padding in on the bottom in the height dimension
def get_mvu_param(self, prec, iShape, fShape, stride, layerType):
iprec,wprec,oprec = prec
iC, iH, iW = iShape
fC, fH, fW = fShape
sW = stride
ilength = [0,0,0,0,0]
ijump = [0,0,0,0,0]
wlength = [0,0,0,0,0]
wjump = [0,0,0,0,0]
# import ipdb as pdb; pdb.set_trace()
if layerType == "conv":
ilength[4] = 0
ilength[3] = iC*fW-1
ilength[2] = fH-1
ilength[1] = iprec*wprec*fC-1
ijump[4] = 0
ijump[3] = iprec
ijump[2] = iprec*(iC*(iW-fW) + 1)
ijump[1] = -iprec*(iC*(fH-1)*iW + fW*iC - 1)
ijump[0] = -iprec*(iC*(fH-1)*iW + (fW-sW-1)*iC + 1)
wlength[4] = 0
wlength[3] = iC*fW*fH-1
wlength[2] = iprec*wprec-1
wlength[1] = fC-1
wjump[4] = 0
wjump[3] = wprec
wjump[2] = -wprec*(iC*fW*fH-1)
wjump[1] = wprec
wjump[0] = -wprec*(iC*fW*fH*fC-1)
countdown = (iC * fW) * (fH) * (iprec * wprec) * (fC) * ((iW-fW+1)/sW)
elif layerType == "matmul":
m_h = ceil(fH/64)
m_w = ceil(fW/64)
ilength[4] = 0
ilength[3] = 0
ilength[2] = 0
ilength[1] = m_h-1
ijump[4] = 0
ijump[3] = 0
ijump[2] = 0
ijump[1] = iprec
ijump[0] = -iprec*(m_w-1)
wlength[4] = 0
wlength[3] = 0
wlength[2] = m_w-1
wlength[1] = wprec*iprec-1
wjump[4] = 0
wjump[3] = 0
wjump[2] = wprec
wjump[1] = -wprec*(m_w-1)
wjump[0] = wprec
countdown = m_w * m_h * iprec * wprec;
return [ilength, ijump, wlength, wjump, countdown]
def infer_activation_shape(self, input, kernel, padding, stride, layerTpye):
oC, oH, oW = [0, 0, 0]
if layerTpye == "conv":
iC, iH, iW = input
fC, fH, fW = kernel
oH=int((iH-fH+2*padding)/stride)+1
oW=int((iW-fW+2*padding)/stride)+1
oC=fC
elif layerTpye == "matmul":
pass
return [oC, oH, oW]
def _get_riscv_csr_code(self, mvuConfig, layer_type, layer_name, out_channels):
iprec,wprec,oprec = mvuConfig.prec
imem, wmem, omem = mvuConfig.meminfo
def gen_csr_instr(csr, val):
# risc-v immediate CSRs are small and can take values from 0-31.
# here we check if the absolute csr value is larger than 31. If so,
# use regular immediate instructions to load the value into general
# registers and then load them into csr with csrw instruction. If not,
# then that can be handeled with a simple csrwi instruciton.
val = int(val)
if abs(val)>((2**5)-1):
_temp_str = ""
_temp_str += "\taddi t1, {}\n".format(val)
_temp_str += "\tcsrw {} , t1\n".format(csr)
return _temp_str
else:
if val >= 0:
return "\tcsrwi {0}, {1}\n".format(csr, val)
else:
return "\tcsrwi {0}, {1}\n".format(csr, slice_val(val, 5))
# 32 bit negative numbers are represented as 5 bit integer values,
# it is like slicing the value in binary with 5 bits.
def slice_val(val, bits):
def int2bin(integer, digits):
if integer >= 0:
return bin(integer)[2:].zfill(digits)
else:
return bin(2**digits + integer)[2:]
str_bin = int2bin(val, bits)
return int(str_bin[0:bits], 2)
code_str = ""
if layer_type == "matmul":
# Matmul compute structure code
code_str += "{}:\n".format(layer_name)
code_str += "\taddi sp, sp, -4\n"
code_str += "\tsw ra, 4(sp)\n"
code_str += "\tjal {}_init\n".format(layer_name)
code_str += "\tjal {}_loop\n".format(layer_name)
code_str += "\tlw ra, 4(sp)\n"
code_str += "\taddi sp, sp, 4\n"
code_str += "\tret\n"
# Matmul initialization code
code_str += "{}_init:\n".format(layer_name)
code_str += "\taddi sp, sp, -4\n"
code_str += "\tsw ra, 4(sp)\n"
code_str += "\taddi t1, x0, 0\n"
code_str += "\taddi t2, x0, {}\n".format(wprec)
code_str += "\tadd t1, t1, t2\n"
code_str += "\taddi t2, x0, {}\n".format(iprec)
code_str += "\tslli t3, t2, 6\n"
code_str += "\tadd t1, t1, t3\n"
code_str += "\taddi t2, x0, {}\n".format(oprec)
code_str += "\tslli t3, t2, 12\n"
code_str += "\tadd t1, t1, t3\n"
code_str += "\tcsrw mvuprecision, t1\n"
code_str += gen_csr_instr("mvuwjump_0", mvuConfig.wjump[0])
code_str += gen_csr_instr("mvuwjump_1", mvuConfig.wjump[1])
code_str += gen_csr_instr("mvuwjump_2", mvuConfig.wjump[2])
code_str += gen_csr_instr("mvuwjump_3", mvuConfig.wjump[3])
code_str += gen_csr_instr("mvuijump_0", mvuConfig.ijump[0])
code_str += gen_csr_instr("mvuijump_1", mvuConfig.ijump[1])
code_str += gen_csr_instr("mvuijump_2", mvuConfig.ijump[2])
code_str += gen_csr_instr("mvuijump_3", mvuConfig.ijump[3])
code_str += "\tcsrwi {0}, {1}\n".format("mvusjump_0", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvusjump_1", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvubjump_0", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvubjump_1", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvuojump_0", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvuojump_1", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvuojump_2", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvuojump_3", 0)
code_str += "\tcsrwi {0}, {1}\n".format("mvuojump_4", 0)
code_str += gen_csr_instr("mvuwlength_1", mvuConfig.wlength[1])
code_str += gen_csr_instr("mvuwlength_2", mvuConfig.wlength[2])
code_str += gen_csr_instr("mvuwlength_3", mvuConfig.wlength[3])
code_str += gen_csr_instr("mvuwlength_4", mvuConfig.wlength[4])
code_str += gen_csr_instr("mvuilength_1", mvuConfig.ilength[1])
code_str += gen_csr_instr("mvuilength_2", mvuConfig.ilength[2])
code_str += gen_csr_instr("mvuilength_3", mvuConfig.ilength[3])
code_str += gen_csr_instr("mvuilength_4", mvuConfig.ilength[4])
code_str += gen_csr_instr("mvuolength_1", mvuConfig.ilength[1])
code_str += gen_csr_instr("mvuolength_2", mvuConfig.ilength[2])
code_str += gen_csr_instr("mvuolength_3", mvuConfig.ilength[3])
code_str += gen_csr_instr("mvuolength_4", mvuConfig.ilength[4])
code_str += "\tlw ra, 4(sp)\n"
code_str += "\taddi sp, sp, 4\n"
code_str += "\tret\n"
# Matmul loop code
# loop init:
code_str += "{}_loop:\n".format(layer_name)
code_str += "\taddi sp, sp, -4\n"
code_str += "\tsw ra, 4(sp)\n"
code_str += "\taddi s0, x0, {}\n".format(out_channels)
code_str += "\tli s1, {}\n".format(self.mvu_res_output_base_addr)
code_str += "\tli s2, {}\n".format(self.mvu_res_weight_base_addr)
code_str += "\tli s3, {}\n".format(self.mvu_res_input_base_addr)
code_str += "\taddi s4, x0, 1\n"
code_str += "\tslli s4, s4, 30\n"
code_str += "\taddi s4, s4, {}\n".format(int(mvuConfig.countdown))
# loop anchor:
code_str += "loop_{}:\n".format(self.loop_cnt)
code_str += gen_csr_instr("mvuquant", oprec-1)
code_str += "\tcsrw mvuwbaseptr, s2\n"
code_str += "\tcsrw mvuibaseptr, s3\n"
code_str += "\tcsrw mvuobaseptr, s1\n"
code_str += "\tcsrw mvucommand, s4\n"
code_str += "\tjal wait_for_mvu_irq\n"
code_str += "\taddi s0,s0, -1\n"
code_str += "\taddi s1, s1, {}\n".format(oprec)
code_str += "\taddi s3, s3, {}\n".format(iprec)
code_str += "\tbne s0, x0, loop_{}\n".format(self.loop_cnt)
code_str += "\tlw ra, 4(sp)\n"
code_str += "\taddi sp, sp, 4\n"
code_str += "\tret\n"
self.loop_cnt += 1
return code_str
def __gen__riscv_code(self, model, _func_dict):
temp_str = ""
with open(self.temp_riscv_code_file, 'r') as f:
lines = f.readlines()
for line in lines:
if "--> FUNCCALL <--" in line:
for key, value in _func_dict.items():
temp_str += "\tjal {}\n".format(key)
elif "--> HERE <--" in line:
for key, value in _func_dict.items():
temp_str += "{}\n".format(value)
else:
temp_str += line
f.close()
output_file = "{}.S".format(model.model_name)
with open(output_file, 'w') as f:
f.write(temp_str)
f.close()
print("Generated kernel code is written to {}".format(output_file))
def generate_mvu_configs(self):
self.check_model_is_valid()
t = Texttable(max_width=160)
t.add_row(['iShape', 'fShape', 'ilength', 'ijump', 'wlength', 'wjump', 'countdown', 'total layer countdown'])
# import ipdb as pdb; pdb.set_trace()
input_shape = self.input_shape
input_shape[0] = ceil(input_shape[0]/64)
total_cycles = 0
_func_dict = OrderedDict()
for layer in self.model.layers:
_code_str = ""
layer_type = layer['layer_type']
layer_name = layer['layer_name']
iShape = input_shape
fShape = [ceil(layer['out_channels']/64), layer['kernel_size'][0], layer['kernel_size'][1]]
stride = layer['stride'][0]
padding = layer['padding'][0]
prec = self.prec
# print("{} * {}".format(iShape, fShape))
# import ipdb as pdb; pdb.set_trace()
ilength, ijump, wlength, wjump, countdown = self.get_mvu_param(prec, iShape, fShape, stride, layer_type)
olength = [0,1,0,0,0]
mvuConfig = MVUConfig(self.prec, self.meminfo, self.quantIdx, ilength, ijump, wlength, wjump, countdown, olength)
# import ipdb as pdb; pdb.set_trace()
_code_str += self._get_riscv_csr_code(mvuConfig, layer_type, layer_name, layer['out_channels'])
_func_dict[layer_name] = _code_str
if layer_type == "conv":
total_layer_countdown = countdown * ceil((input_shape[2]+layer['padding'][0]+layer['padding'][1]) / layer['stride'][1])
elif layer_type == "matmul":
total_layer_countdown = countdown
# import ipdb as pdb; pdb.set_trace()
t.add_row([iShape, fShape, ilength, ijump, wlength, wjump, countdown, total_layer_countdown])
input_shape = self.infer_activation_shape(input_shape, fShape, padding, stride, layer_type)
total_cycles += total_layer_countdown
self.__gen__riscv_code(self.model, _func_dict)
print("\nGenerated MVU configuration:")
print(t.draw())
print("Total countdown: {}".format(int(total_cycles)))
def print_mvu_param(self):
pass
def int2bit(self, val, precision):
val_str = '{0:032b}'.format(val)
return list(val_str[::-1][0:precision][::-1])
def export_weigths(self):
weight_dict = self.__process_weigths()
iprec,wprec,oprec = self.prec
for tensor_name, tensor in weight_dict.items():
export_tensor(tensor, format="linear", prec=wprec, tensor_name=tensor_name, numerical_system=None)
def __export_weigths(self, dict):
for key, vals in dict.items():
file_name = "{}.hex".format(key)
with open(file_name, "w") as f:
for val in vals:
f.write("{}\n".format(val))
f.close()
print("Exporting {} to {}".format(key, file_name))
def __process_weigths(self):
iprec,wprec,oprec = self.prec
# expecting input shapes is:
# [input_channels, output_channels, width, height]
weight_ram = {}
for layer in self.model.layers:
layer_weights = []
layer_type = layer['layer_type']
# first we need to transpose the weight tensor into channel first format
# import ipdb as pdb; pdb.set_trace()
if layer_type == "conv":
weights = layer['weight'].transpose(3,2,1,0)
elif layer_type == "matmul":
# import ipdb as pdb; pdb.set_trace()
weights = layer['weight'].transpose()
# weights = layer['weight']
# The accelerator only works with integer values
flatten_weights = [int(val) for val in weights.flatten()]
# print(flatten_weights)
weight_block = np.zeros([4096, wprec],dtype=str)
cnt = 0
processed = False
# Now we need to transpose the weight tensor into MVU format.
for idx, val in enumerate(flatten_weights):
if cnt >= 4096 or idx==(len(flatten_weights)-1):
if idx==(len(flatten_weights)-1):
# import ipdb as pdb; pdb.set_trace()
weight_block[cnt] = self.int2bit(val, wprec)
cnt = 0
# import ipdb as pdb; pdb.set_trace()
for weight in weight_block.transpose(1,0):
val_str = "".join(weight)
val_str = val_str.zfill(4096)[::-1]
layer_weights.append(val_str)
processed = True
weight_block[cnt] = self.int2bit(val, wprec)
cnt += 1
# import ipdb as pdb; pdb.set_trace()
if not processed:
# import ipdb as pdb; pdb.set_trace()
for weight in weight_block.transpose(1,0):
val_str = "".join(weight)
#val_str = val_str[::-1].zfill(4096)
layer_weights.append(val_str)
weight_ram[layer['name']] = layer_weights
return weight_ram