/
RegDeps.cpp
1791 lines (1590 loc) · 64.2 KB
/
RegDeps.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "RegDeps.hpp"
#include "../asserts.hpp"
#include "../bits.hpp"
#include <algorithm>
#include <cstring>
#include <limits>
#include <sstream>
using namespace iga;
// Maps a dependency pipe onto its dependency class.
//   type   - the pipe the instruction was assigned to
//   opspec - the instruction's op specification
// SYNC and ILLEGAL ops are always DEP_CLASS::OTHER, whatever the pipe.
// A pipe value not handled by the switch yields DEP_CLASS::NONE.
static DEP_CLASS getClassFromPipeType(DEP_PIPE type, const OpSpec &opspec) {
  const bool isSyncOrIllegal = opspec.is(Op::SYNC) || opspec.op == Op::ILLEGAL;
  if (isSyncOrIllegal)
    return DEP_CLASS::OTHER;

  switch (type) {
  // pipes classified as in-order
  case DEP_PIPE::NONE:
  case DEP_PIPE::SHORT:
  case DEP_PIPE::LONG:
  case DEP_PIPE::CONTROL_FLOW:
  case DEP_PIPE::FLOAT:
  case DEP_PIPE::INTEGER:
  case DEP_PIPE::LONG64:
  case DEP_PIPE::MATH_INORDER:
    return DEP_CLASS::IN_ORDER;
  // pipes classified as out-of-order
  case DEP_PIPE::SEND:
  case DEP_PIPE::MATH:
  case DEP_PIPE::DPAS:
  case DEP_PIPE::SEND_SLM:
  case DEP_PIPE::SEND_UNKNOWN:
    return DEP_CLASS::OUT_OF_ORDER;
  }
  return DEP_CLASS::NONE;
}
// Assigns pipe and class for the SingleDistPipe encoding mode.
// Math, send-format and branching ops get MATH, SEND and CONTROL_FLOW
// respectively. Any other op is SHORT unless one of its sources or its
// destination has type DF, Q or UQ, in which case it is LONG. The class is
// then derived from the chosen pipe.
static void setDEPPipeClass_SingleDistPipe(DepSet &dep,
                                           const Instruction &inst) {
  const auto &spec = inst.getOpSpec();
  const auto isLongPipeType = [](Type t) {
    return t == Type::DF || t == Type::Q || t == Type::UQ;
  };

  DEP_PIPE pipe = DEP_PIPE::SHORT;
  if (spec.is(Op::MATH)) {
    pipe = DEP_PIPE::MATH;
  } else if (spec.isAnySendFormat()) {
    pipe = DEP_PIPE::SEND;
  } else if (spec.isBranching()) {
    pipe = DEP_PIPE::CONTROL_FLOW;
  } else {
    // a single 64-bit-typed operand is enough to move to the LONG pipe
    for (uint32_t srcIx = 0; srcIx < inst.getSourceCount(); ++srcIx) {
      if (isLongPipeType(inst.getSource(srcIx).getType())) {
        pipe = DEP_PIPE::LONG;
        break;
      }
    }
    if (spec.supportsDestination() &&
        isLongPipeType(inst.getDestination().getType())) {
      pipe = DEP_PIPE::LONG;
    }
  }

  dep.setDepPipe(pipe);
  dep.setDepClass(getClassFromPipeType(dep.getDepPipe(), spec));
}
// XeHP+
// Chooses the pipe for a send-format instruction and stores it in pipe_type.
// Before XE_HPG every send is DEP_PIPE::SEND. From XE_HPG on:
//   - a register message descriptor gives SEND_UNKNOWN;
//   - SFID::SLM, or a DC0/DC1/DC2 message whose immediate descriptor bits
//     match the checks below, gives SEND_SLM;
//   - anything else stays SEND.
static void setSendPipeType(DEP_PIPE &pipe_type, const Instruction &inst,
                            const Model &model) {
  assert(inst.getOpSpec().isAnySendFormat());
  pipe_type = DEP_PIPE::SEND;

  // XeHPG+: slm send should be considered in different pipes
  if (model.platform < Platform::XE_HPG)
    return;

  if (inst.getMsgDescriptor().isReg()) {
    pipe_type = DEP_PIPE::SEND_UNKNOWN;
    return;
  }

  const SFID sfid = inst.getSendFc();
  if (sfid == SFID::SLM) {
    pipe_type = DEP_PIPE::SEND_SLM;
    return;
  }

  // Decode the immediate descriptor fields that the SLM checks depend on
  const uint32_t desc = inst.getMsgDescriptor().imm;
  const bool btiSlm = getBits<uint32_t>(desc, 0, 8) == 0xFE;
  const bool scratchBlockMsg = getBits<uint32_t>(desc, 18, 1) == 1;
  const bool headerPresent = getBits<uint32_t>(desc, 19, 1) == 1;
  const bool sidebandOffsetEn = getBits<uint32_t>(desc, 7, 1) == 1;

  bool targetsSlm = false;
  switch (sfid) {
  case SFID::DC0:
    targetsSlm = btiSlm && !scratchBlockMsg;
    break;
  case SFID::DC1:
    targetsSlm = btiSlm;
    break;
  case SFID::DC2:
    targetsSlm = !headerPresent && sidebandOffsetEn;
    break;
  default:
    break;
  }
  if (targetsSlm)
    pipe_type = DEP_PIPE::SEND_SLM;
}
// MTL
// Same classification as ThreeDistPipe, except that instructions touching a
// 64-bit floating-point type are placed in the Math pipe instead of the long
// pipe.
static void setDEPPipeClass_ThreeDistPipeDPMath(DepSet &dep,
                                                const Instruction &inst,
                                                const Model &model) {
  const auto &spec = inst.getOpSpec();
  // pipe taken by a 64-bit type: floating point -> MATH, integer -> LONG64
  const auto pipeFor64bType = [](Type t) {
    return TypeIsFloating(t) ? DEP_PIPE::MATH : DEP_PIPE::LONG64;
  };

  DEP_PIPE pipe = DEP_PIPE::NONE;
  if (spec.is(Op::MATH)) {
    pipe = DEP_PIPE::MATH;
  } else if (spec.isAnySendFormat()) {
    setSendPipeType(pipe, inst, model);
  } else if (spec.isDpasFormat()) {
    pipe = DEP_PIPE::DPAS;
  } else if (spec.isBranching()) {
    pipe = DEP_PIPE::INTEGER;
  } else {
    // In-order instruction. The deciding type is the destination type, or
    // source 0's type when the op has no destination:
    //   FP32/FP16/BF16   -> float pipe
    //   int32/16/8       -> integer pipe
    //   int64            -> long pipe
    //   FP64             -> out-of-order Math pipe
    // Conversions follow the destination (float2int -> integer,
    // int2float -> float).
    Type decidingType = Type::INVALID;
    if (spec.supportsDestination())
      decidingType = inst.getDestination().getType();
    else if (inst.getSourceCount())
      decidingType = inst.getSource(0).getType();

    if (decidingType != Type::INVALID) {
      if (TypeIs64b(decidingType))
        pipe = pipeFor64bType(decidingType);
      else
        pipe = TypeIsFloating(decidingType) ? DEP_PIPE::FLOAT
                                            : DEP_PIPE::INTEGER;
    }

    // the first source with a 64-bit type overrides the choice above
    for (uint32_t srcIx = 0; srcIx < inst.getSourceCount(); ++srcIx) {
      const Type srcType = inst.getSource(srcIx).getType();
      if (!TypeIs64b(srcType))
        continue;
      pipe = pipeFor64bType(srcType);
      break;
    }
  }

  // default set to Integer pipe (e.g. NOP)
  if (pipe == DEP_PIPE::NONE)
    pipe = DEP_PIPE::INTEGER;

  dep.setDepClass(getClassFromPipeType(pipe, spec));
  dep.setDepPipe(pipe);
}
// XeHP
// Assigns pipe and class for the ThreeDistPipe encoding mode.
// Math, send-format, dpas and branching ops are mapped directly; all other
// ops are classified by operand type as described in the body.
static void setDEPPipeClass_ThreeDistPipe(DepSet &dep, const Instruction &inst,
                                          const Model &model) {
  const auto &spec = inst.getOpSpec();
  DEP_PIPE pipe = DEP_PIPE::NONE;

  if (spec.is(Op::MATH)) {
    pipe = DEP_PIPE::MATH;
  } else if (spec.isAnySendFormat()) {
    setSendPipeType(pipe, inst, model);
  } else if (spec.isDpasFormat()) {
    pipe = DEP_PIPE::DPAS;
  } else if (spec.isBranching()) {
    pipe = DEP_PIPE::INTEGER;
  } else {
    // In-order instruction. The deciding type is the destination type, or
    // source 0's type when the op has no destination:
    //   FP32/FP16/BF16   -> float pipe
    //   int32/16/8       -> integer pipe
    //   int64/FP64       -> long pipe
    // Conversions follow the destination (float2int -> integer,
    // int2float -> float); anything to or from a 64-bit type goes to the
    // long pipe.
    Type decidingType = Type::INVALID;
    if (spec.supportsDestination())
      decidingType = inst.getDestination().getType();
    else if (inst.getSourceCount())
      decidingType = inst.getSource(0).getType();

    if (decidingType != Type::INVALID) {
      if (TypeIs64b(decidingType))
        pipe = DEP_PIPE::LONG64;
      else
        pipe = TypeIsFloating(decidingType) ? DEP_PIPE::FLOAT
                                            : DEP_PIPE::INTEGER;
    }

    // any 64-bit source forces the long pipe
    for (uint32_t srcIx = 0; srcIx < inst.getSourceCount(); ++srcIx) {
      if (TypeIs64b(inst.getSource(srcIx).getType())) {
        pipe = DEP_PIPE::LONG64;
        break;
      }
    }
  }

  // default set to Integer pipe (e.g. NOP)
  if (pipe == DEP_PIPE::NONE)
    pipe = DEP_PIPE::INTEGER;

  dep.setDepClass(getClassFromPipeType(pipe, spec));
  dep.setDepPipe(pipe);
}
// XeHPC+
// Classifies exactly like ThreeDistPipe, then re-labels a MATH result as the
// in-order math pipe (MATH_INORDER, class IN_ORDER).
static void setDEPPipeClass_FourDistPipe(DepSet &dep, const Instruction &inst,
                                         const Model &model) {
  setDEPPipeClass_ThreeDistPipe(dep, inst, model);
  if (dep.getDepPipe() != DEP_PIPE::MATH)
    return;

  dep.setDepClass(DEP_CLASS::IN_ORDER);
  dep.setDepPipe(DEP_PIPE::MATH_INORDER);
}
// Assigns pipe and class for the FourDistPipeReduction encoding mode.
// Difference from FourDistPipe: here only instructions with an fp64
// destination go to the Long pipe (LONG64), whereas FourDistPipe sends any
// instruction with a 64b type on a source or the destination there.
static void setDEPPipeClass_FourDistPipeReduction(DepSet &dep,
                                                  const Instruction &inst,
                                                  const Model &model) {
  const auto &spec = inst.getOpSpec();

  // math is in-order in this mode and needs no further classification
  if (spec.is(Op::MATH)) {
    dep.setDepClass(DEP_CLASS::IN_ORDER);
    dep.setDepPipe(DEP_PIPE::MATH_INORDER);
    return;
  }

  DEP_PIPE pipe = DEP_PIPE::NONE;
  if (spec.isAnySendFormat()) {
    setSendPipeType(pipe, inst, model);
  } else if (spec.isDpasFormat()) {
    pipe = DEP_PIPE::DPAS;
  } else if (spec.isBranching()) {
    pipe = DEP_PIPE::INTEGER;
  } else if (spec.supportsDestination()) {
    // In-order instruction, classified by destination type only:
    //   RegDistFloat: distance dependency in the float32 pipe
    //                 (all instructions with float32/16/8 dst)
    //   RegDistLong:  distance dependency in the float64 pipe
    //                 (all instructions with float64 dst)
    //   RegDistInt:   distance dependency in the Int32/16/8 and Int64 pipe
    //                 (all instructions with integer dst)
    //   RegDistMath:  distance dependency in the Math pipe
    //                 (all math instructions, handled above)
    const Type dstType = inst.getDestination().getType();
    if (!TypeIsFloating(dstType))
      pipe = DEP_PIPE::INTEGER;
    else
      pipe = TypeIs64b(dstType) ? DEP_PIPE::LONG64 : DEP_PIPE::FLOAT;
  }

  // default set to Integer pipe (e.g. NOP)
  if (pipe == DEP_PIPE::NONE)
    pipe = DEP_PIPE::INTEGER;

  dep.setDepClass(getClassFromPipeType(pipe, spec));
  dep.setDepPipe(pipe);
}
// Dispatches to the pipe/class classifier matching the SWSB encoding mode.
// An encoding mode without a classifier leaves `dep` untouched.
static void setDEPPipeClass(SWSB_ENCODE_MODE enc_mode, DepSet &dep,
                            const Instruction &inst, const Model &model) {
  switch (enc_mode) {
  case SWSB_ENCODE_MODE::SingleDistPipe:
    setDEPPipeClass_SingleDistPipe(dep, inst);
    return;
  case SWSB_ENCODE_MODE::ThreeDistPipeDPMath:
    setDEPPipeClass_ThreeDistPipeDPMath(dep, inst, model);
    return;
  case SWSB_ENCODE_MODE::ThreeDistPipe:
    setDEPPipeClass_ThreeDistPipe(dep, inst, model);
    return;
  case SWSB_ENCODE_MODE::FourDistPipe:
    setDEPPipeClass_FourDistPipe(dep, inst, model);
    return;
  case SWSB_ENCODE_MODE::FourDistPipeReduction:
    setDEPPipeClass_FourDistPipeReduction(dep, inst, model);
    return;
  default:
    return;
  }
}
// Constructs an empty DepSet: no instruction attached, dependency type, pipe
// and class all NONE, indirect/SR flags cleared.
//   instIdCntr - instruction-id counters copied into m_InstIDs
//   dsb        - the builder this set belongs to (kept in m_DB)
DepSet::DepSet(const InstIDs &instIdCntr, const DepSetBuilder &dsb)
: m_instruction(nullptr), m_dType(DEP_TYPE::NONE), m_hasIndirect(false),
m_hasSR(false), m_dPipe(DEP_PIPE::NONE), m_dClass(DEP_CLASS::NONE),
m_InstIDs(instIdCntr.global, instIdCntr.inOrder, instIdCntr.floatPipe,
instIdCntr.intPipe, instIdCntr.longPipe, instIdCntr.mathPipe
),
m_DB(dsb) {
// pre-size the bucket list for a handful of entries
m_bucketList.reserve(4);
// footprint bit set sized to the builder's total bit count; allocated with
// raw new here — NOTE(review): release is presumably handled elsewhere in
// DepSet, confirm
bits = new BitSet<>(dsb.getTOTAL_BITS());
}
// Returns OPS_PER_CHAN, the number of dot product operations per dword
// channel, which depends on the element types of src1/src2.
//   isDF              -> 1
//   HF / BF           -> 2
//   BF8 / HF8         -> 4
//   TF32              -> 1
//   other types       -> 8 when both sources are 2 or 4 bits wide, else 4
// Invalid type combinations are reported through IGA_ASSERT.
uint32_t DepSet::getDPASOpsPerChan(Type src1_ty, Type src2_ty, bool isDF) {
  if (isDF)
    return 1;

  switch (src1_ty) {
  case Type::HF:
  case Type::BF:
    IGA_ASSERT(src1_ty == src2_ty, "dpas: invalid src1/src2 types combination");
    return 2;
  case Type::BF8:
  case Type::HF8:
    IGA_ASSERT(src2_ty == Type::BF8 || src2_ty == Type::HF8,
               "dpas: invalid src1/src2 types combination");
    return 4;
  case Type::TF32:
    IGA_ASSERT(src1_ty == src2_ty, "dpas: invalid src1/src2 types combination");
    return 1;
  default: {
    // Type: ub, b, u4, s4, u2, s2
    const int src1Bits = TypeSizeInBits(src1_ty);
    const int src2Bits = TypeSizeInBits(src2_ty);
    IGA_ASSERT((src1Bits <= 8), "OPS_PER_CHAN: unsupported src1 type");
    IGA_ASSERT((src2Bits <= 8), "OPS_PER_CHAN: unsupported src2 type");
    // if both src1 and src2 are int2 or int4, then ops_per_chan will be 8
    const bool src1IsSubByte = src1Bits == 2 || src1Bits == 4;
    const bool src2IsSubByte = src2Bits == 2 || src2Bits == 4;
    return (src1IsSubByte && src2IsSubByte) ? 8 : 4;
  }
  }
}
// Computes the upper bound of the byte range read by one dpas source.
//   idx      - source index; 0, 1 and "anything else" use different formulas
//   lowBound - start register address offset in byte
// Returns the upper register address offset in byte.
uint32_t DepSet::getDPASSrcDepUpBound(unsigned idx, Type srcType,
                                      uint32_t execSize, uint32_t lowBound,
                                      uint32_t systolicDepth,
                                      uint32_t repeatCount,
                                      uint32_t opsPerChan) {
  auto typeSizeInBits = TypeSizeInBitsWithDefault(srcType, 32);
  // elements_size is the size of total elements to be calculated in one
  // operation
  uint32_t elements_size = execSize * typeSizeInBits / 8;

  switch (idx) {
  case 0:
    return lowBound + elements_size * repeatCount;
  case 1:
    return lowBound + elements_size * opsPerChan * systolicDepth;
  default:
    break;
  }

  return lowBound +
         ((repeatCount - 1) * opsPerChan * 8 * typeSizeInBits /
              8 + /* start offset of the last repeated block */
          opsPerChan * systolicDepth * typeSizeInBits /
              8); /* size of used register in last repeated block */
}
void DepSet::getDpasSrcDependency(const Instruction &inst,
RegRangeListType ®_range,
RegRangeListType &extra_regs,
const Model &model) {
uint32_t execSize = static_cast<uint32_t>(inst.getExecSize());
IGA_ASSERT((!inst.isDF() && execSize == (m_DB.getGRF_BYTES_PER_REG() / 4)) ||
(inst.isDF() && execSize == 8),
"Invalid ExecSize for this op");
// check src operand and add the dependency
uint32_t repeatCount = GetDpasRepeatCount(inst.getDpasFc());
uint32_t systolicDepth = GetDpasSystolicDepth(inst.getDpasFc());
uint32_t ops_per_chan = getDPASOpsPerChan(
inst.getSource(1).getType(), inst.getSource(2).getType(), inst.isDF());
for (unsigned srcIx = 0; srcIx < inst.getSourceCount(); ++srcIx) {
const Operand &op = inst.getSource(srcIx);
// the src0 could be null, in that case no need to set the dependency
if (srcIx == 0 && op.getDirRegName() == RegName::ARF_NULL) {
// if src0 is null, set the reg range to max() to specify its actually
// empty
reg_range.push_back(std::make_pair(std::numeric_limits<uint32_t>::max(),
std::numeric_limits<uint32_t>::max()));
continue;
}
IGA_ASSERT(op.getDirRegName() == RegName::GRF_R,
"GRF or null required on this op");
// calculate register region
auto tType = op.getType();
auto typeSizeInBits = TypeSizeInBitsWithDefault(tType, 32);
uint32_t lowBound =
addressOf(op.getDirRegName(), op.getDirRegRef(), typeSizeInBits);
uint32_t upBound =
getDPASSrcDepUpBound(srcIx, tType, execSize, lowBound, systolicDepth,
repeatCount, ops_per_chan);
IGA_ASSERT(upBound >= lowBound, "source region footprint computation got "
"it wrong: upBound is less than lowBound");
uint32_t startRegNum = lowBound / m_DB.getGRF_BYTES_PER_REG();
uint32_t upperRegNum = (upBound - 1) / m_DB.getGRF_BYTES_PER_REG();
reg_range.push_back(std::make_pair(startRegNum, upperRegNum));
// calculate extra_regs for HW workaround: treat Src2 as dpas.8x8 when
// calculating register footpring (4 registers)
if (m_DB.needDPASSrc2WA()) {
if (srcIx == 2 && (repeatCount != 8 || systolicDepth != 8)) {
uint32_t extraUpBound = getDPASSrcDepUpBound(
srcIx, tType, execSize, lowBound, 8, 8, ops_per_chan);
uint32_t extraUpRegNum =
(extraUpBound - 1) / m_DB.getGRF_BYTES_PER_REG();
if (extraUpRegNum >= m_DB.getGRF_REGS())
IGA_FATAL("IGA RegDeps: DPAS src2 out of bounds due to HW WA");
extra_regs.push_back(std::make_pair(startRegNum, extraUpRegNum));
}
}
// calculate extra_regs for HW workaround: src1 always have 8 register
// footpring
if (m_DB.needDPASSrc1WA()) {
if (srcIx == 1) {
uint32_t extraUpBound = lowBound + m_DB.getGRF_BYTES_PER_REG() * 8;
uint32_t extraUpRegNum =
(extraUpBound - 1) / m_DB.getGRF_BYTES_PER_REG();
if (extraUpRegNum >= m_DB.getGRF_REGS())
IGA_FATAL("IGA RegDeps: DPAS src1 out of bounds due to HW WA");
extra_regs.push_back(std::make_pair(startRegNum, extraUpRegNum));
}
}
}
}
void DepSet::addDependency(const RegRangeType ®_range) {
for (uint32_t regNum = reg_range.first; regNum <= reg_range.second;
regNum++) {
addGrf(regNum);
addToBucket(regNum);
}
// Using one of the special registers to add write dependency in to special
// bucket This way it will always check that implicit dependency
m_bucketList.push_back(m_DB.getBucketStart(RegName::ARF_CR));
}
void DepSet::addDependency(const RegRangeListType ®_range) {
for (RegRangeType pair : reg_range) {
// when range is max(), which means it's null, skip it
if (pair.first == std::numeric_limits<uint32_t>::max())
continue;
for (uint32_t regNum = pair.first; regNum <= pair.second; regNum++) {
addGrf(regNum);
addToBucket(regNum);
}
}
// Using one of the special registers to add read dependency into special
// bucket This way it will always check that implicit dependency
m_bucketList.push_back(m_DB.getBucketStart(RegName::ARF_CR));
}
// Returns true when the destination read-suppression workaround applies:
// only on XE_HPG, and only for math instructions or instructions for which
// isDF() is true.
bool DepSetBuilder::needDstReadSuppressionWA(const Instruction &inst) const {
  return mPlatformModel.platform == Platform::XE_HPG &&
         (inst.getOpSpec().is(Op::MATH) || inst.isDF());
}
// Returns the index of the first bucket belonging to the given register
// file. CR and SR share the "special" bucket; register files without a
// dedicated case all map to a single shared bucket.
uint32_t DepSetBuilder::getBucketStart(RegName regname) const {
  switch (regname) {
  case RegName::GRF_R:
    return getGRF_START() / getBYTES_PER_BUCKET();
  case RegName::ARF_A:
    return getARF_A_START() / getBYTES_PER_BUCKET();
  case RegName::ARF_ACC:
    return getARF_ACC_START() / getBYTES_PER_BUCKET();
  case RegName::ARF_F:
    return getARF_F_START() / getBYTES_PER_BUCKET();
  case RegName::ARF_CR:
  case RegName::ARF_SR:
    return getARF_SPECIAL_START() / getBYTES_PER_BUCKET();
  default:
    // putting rest of architecture registers into the same bucket
    // NOTE(review): this divides by a literal 32 while every other case
    // uses getBYTES_PER_BUCKET() — confirm the two agree on all platforms
    return getARF_F_START() / 32;
  }
}
// Returns how many suppression groups exist for the given dpas source:
// 1 for src1, 4 for src2 on XE_HPC and later, and 0 otherwise (meaning the
// source cannot be suppressed).
size_t DepSetBuilder::DpasMacroBuilder::getNumberOfSuppresionGroups(
    uint32_t srcIdx) const {
  switch (srcIdx) {
  case 1:
    return 1;
  case 2:
    return m_model.platform >= Platform::XE_HPC ? 4 : 0;
  default:
    return 0;
  }
}
// Tries to build a dpas macro around suppression of source `srcIdx`,
// starting at startIt. A candidate block is formed first; the instructions
// after it are then scanned for as long as they can be suppressed against
// it. When at least one such instruction is found, the accumulated register
// footprints are written into the DepSets.
// Returns the number of instructions that can be in the macro (candidate
// block size plus suppressed instructions), or 0 when no macro is formed.
size_t DepSetBuilder::DpasMacroBuilder::formSrcSuppressionBlock(
    InstListIterator startIt, uint32_t srcIdx) {
  // get the candidate block
  BitSet<> allDstBits(m_dsBuilder.getGRF_LEN());
  BitSet<> allSrcBits(m_dsBuilder.getGRF_LEN());
  BitSet<> allDstNoLastBits(m_dsBuilder.getGRF_LEN());
  BitSet<> allSrcNoLastBits(m_dsBuilder.getGRF_LEN());
  SuppressBlockPtrTy block =
      getSuppressionBlockCandidate(startIt, srcIdx, allDstBits, allSrcBits,
                                   allDstNoLastBits, allSrcNoLastBits);
  if (!block)
    return 0;

  // advance inst iterator to the next instruction following the block.
  // Note that this instruction must be a macro candidate, otherwise the
  // suppression block wouldn't have been formed
  InstListIterator cur = startIt;
  std::advance(cur, block->size());
  assert(cur != m_instList.end());

  // scan until the last instruction that can be suppressed
  size_t suppressedCnt = 0;
  while (cur != m_instList.end()) {
    if (!srcIsSuppressCandidate(**cur, srcIdx))
      break;

    SrcRegRangeType srcRange, srcExtraRange;
    DstRegRangeType dstRange;
    m_inps.getDpasSrcDependency(**cur, srcRange, srcExtraRange, m_model);
    m_inps.getDpasDstDependency(**cur, dstRange);

    const bool isDepth8 = GetDpasSystolicDepth((*cur)->getDpasFc()) == 8;
    if (hasInternalDep(dstRange, srcRange, isDepth8))
      break;

    const Operand &srcOp = (*cur)->getSource(srcIdx);
    // TODO: to simplify the implementation, stop looking if the src is null
    const bool srcIsGrf = srcOp.getDirRegName() == RegName::GRF_R;
    // Stop at the first instruction that can't be suppressed: its source is
    // not in the block, or it reads a register written earlier in the macro.
    if (!srcIsGrf || !block->contains(srcOp.getDirRegRef().regNum) ||
        hasProducerConsumerDep(dstRange, srcRange, allDstBits))
      break;

    // at this point, we can add this DPAS into the macro
    ++suppressedCnt;
    block->addRegRanges(srcRange, srcExtraRange, dstRange);
    allSrcNoLastBits = allSrcBits;
    allDstNoLastBits = allDstBits;
    setDstSrcBits(srcRange, dstRange, allSrcBits, allDstBits);

    const InstListIterator following = std::next(cur, 1);
    if (following == m_instList.end() ||
        nextIsNotMacroCandidate(**cur, **following))
      break;
    cur = following;
  }

  if (suppressedCnt == 0)
    return 0;

  // at least one instruction can be suppressed, so the candidate block can
  // be in the macro: update the register footprint into the DepSets
  updateRegFootprintsToDepSets(block->allSrcRange, block->allExtraSrcRange,
                               block->allDstRange);
  // return the total number of instructions that can be in the macro
  return block->size() + suppressedCnt;
}
// Builds the candidate suppression block for source `srcIdx` starting at
// startIt.
//   allDstBits / allSrcBits             - accumulate the dst/src footprint of
//                                         every instruction added to the block
//   allDstNoLastBits / allSrcNoLastBits - hold the same footprints as they
//                                         were before the last added
//                                         instruction
//   forceGroupNum - when non-negative, overrides the group count that
//                   getNumberOfSuppresionGroups would return
// Returns the block, or nullptr when the source cannot be suppressed or the
// instruction sequence cannot form a block.
DepSetBuilder::DpasMacroBuilder::SuppressBlockPtrTy
DepSetBuilder::DpasMacroBuilder::getSuppressionBlockCandidate(
    InstListIterator startIt, uint32_t srcIdx, BitSet<> &allDstBits,
    BitSet<> &allSrcBits, BitSet<> &allDstNoLastBits,
    BitSet<> &allSrcNoLastBits, int forceGroupNum) const {
  assert(srcIdx == 1 || srcIdx == 2);
  size_t maxGroupNum =
      forceGroupNum < 0 ? getNumberOfSuppresionGroups(srcIdx) : forceGroupNum;
  // return null if the given src can't be suppressed
  if (!maxGroupNum)
    return nullptr;

  SuppressBlockPtrTy sb(new SuppressBlock(maxGroupNum, srcIdx == 1 ? 8 : 4));
  // Starting from startIt, look at up to maxGroupNum dpas instructions to
  // find the first block of instructions that can potentially be suppressed
  InstListIterator it = startIt;
  for (size_t i = 0; i < maxGroupNum; ++i) {
    InstListIterator nextIt = it;
    ++nextIt;
    // if next instruction is not a suppression candidate, there's no chance to
    // form a suppression block, return nullptr directly
    if (nextIt == m_instList.end())
      return nullptr;
    if (nextIsNotMacroCandidate(**it, **nextIt))
      return nullptr;
    if (!srcIsSuppressCandidate(**it, srcIdx))
      return nullptr;

    SrcRegRangeType src_range, src_extra_range;
    DstRegRangeType dst_range;
    m_inps.getDpasSrcDependency(**it, src_range, src_extra_range, m_model);
    m_inps.getDpasDstDependency(**it, dst_range);
    if (hasInternalDep(dst_range, src_range,
                       GetDpasSystolicDepth((*it)->getDpasFc()) == 8))
      return nullptr;
    // reading a register written earlier in the block is not allowed
    if (hasProducerConsumerDep(dst_range, src_range, allDstBits))
      return nullptr;

    uint16_t reg = (*it)->getSource(srcIdx).getDirRegRef().regNum;
    if (sb->partialOverlapped(reg))
      return nullptr;
    // found the first duplicated register, the block is formed
    if (sb->contains(reg))
      break;

    sb->addRegs(reg);
    sb->addRegRanges(src_range, src_extra_range, dst_range);
    // snapshot the footprints before this instruction is folded in
    allSrcNoLastBits = allSrcBits;
    allDstNoLastBits = allDstBits;
    setDstSrcBits(src_range, dst_range, allSrcBits, allDstBits);
    ++it;
  }
  assert(sb->size());
  // return the local by name: std::move here would be redundant
  return sb;
}
// Tells whether source `srcIdx` of the given dpas can take part in source
// suppression. Only src1 and src2 ever qualify.
bool DepSetBuilder::DpasMacroBuilder::srcIsSuppressCandidate(
    const Instruction &inst, uint32_t srcIdx) const {
  // src1 always can be the candidate since all dpas depth must be the same
  // within the same macro
  if (srcIdx == 1)
    return true;
  if (srcIdx != 2)
    return false;

  // src2: DP dpas must have rep count 4, non-DP dpas must have rep count 8
  const auto requiredRepCount = inst.isDF() ? 4 : 8;
  return GetDpasRepeatCount(inst.getDpasFc()) == requiredRepCount;
}
bool DepSetBuilder::DpasMacroBuilder::hasProducerConsumerDep(
const DstRegRangeType &dst_range, const SrcRegRangeType &src_range,
const BitSet<> &target_dst_bits) const {
BitSet<> new_srcbits(m_dsBuilder.getGRF_LEN());
BitSet<> new_dstbits(m_dsBuilder.getGRF_LEN());
setDstSrcBits(src_range, dst_range, new_srcbits, new_dstbits);
// check if there is RAW dependency
if (target_dst_bits.intersects(new_srcbits))
return true;
return false;
}
// Records the given register ranges in the macro's DepSets: the source and
// extra-source ranges go into the input set, the destination range into the
// output set.
void DepSetBuilder::DpasMacroBuilder::updateRegFootprintsToDepSets(
    SrcRegRangeType &src_range, SrcRegRangeType &extra_src_range,
    DstRegRangeType &dst_range) {
  // reads
  m_inps.addDependency(src_range);
  m_inps.addDependency(extra_src_range);
  // writes
  m_oups.addDependency(dst_range);
}
// List overload: records lists of source, extra-source and destination
// register ranges in the macro's input and output DepSets.
void DepSetBuilder::DpasMacroBuilder::updateRegFootprintsToDepSets(
    RegRangeListType &src_range, RegRangeListType &extra_src_range,
    RegRangeListType &dst_range) {
  // reads
  m_inps.addDependency(src_range);
  m_inps.addDependency(extra_src_range);
  // writes
  m_oups.addDependency(dst_range);
}
// Forms a dpas macro beginning at m_firstDpasIt.
//   dpasCnt - receives the number of dpas instructions in the macro (1 when
//             no macro could be formed)
// Every instruction of the macro except the last gets InstOpt::ATOMIC and a
// cleared SWSB. Returns the last instruction of the macro (the first dpas
// itself when dpasCnt is 1).
const Instruction &DepSetBuilder::DpasMacroBuilder::formMacro(size_t &dpasCnt) {
  dpasCnt = 1;
  const InstListIterator first = m_firstDpasIt;

  SrcRegRangeType src_range, src_extra_range;
  DstRegRangeType dst_range;
  m_inps.getDpasSrcDependency(**first, src_range, src_extra_range, m_model);
  m_inps.getDpasDstDependency(**first, dst_range);

  // early exit if there is no following instruction or dpas depth is not 8
  if (std::next(first) == m_instList.end() ||
      GetDpasSystolicDepth((*first)->getDpasFc()) != 8) {
    updateRegFootprintsToDepSets(src_range, src_extra_range, dst_range);
    return **first;
  }

  // try suppression on src1 first, then src2, and keep the longer macro
  dpasCnt = std::max(dpasCnt, formSrcSuppressionBlock(m_firstDpasIt, 1));
  dpasCnt = std::max(dpasCnt, formSrcSuppressionBlock(m_firstDpasIt, 2));

  if (dpasCnt == 1) {
    updateRegFootprintsToDepSets(src_range, src_extra_range, dst_range);
    return **first;
  }

  // Set Atomic to all dpas in the macro except the last one.
  // Also clean up their SWSB if set from the input.
  InstListIterator last = m_firstDpasIt;
  for (size_t remaining = dpasCnt - 1; remaining > 0; --remaining, ++last) {
    (*last)->addInstOpt(InstOpt::ATOMIC);
    (*last)->setSWSB(SWSB());
  }
  // `last` now points at the final dpas of the macro
  assert(last != m_instList.end());
  return **last;
}
bool DepSetBuilder::DpasMacroBuilder::nextIsNotMacroCandidate(
const Instruction &dpas, const Instruction &next_inst) const {
if (!next_inst.getOpSpec().isDpasFormat())
return true;
// DPAS and DPASW should not be in the same macro
if (next_inst.getOp() != dpas.getOp())
return true;
// DPAS with different CtrlMask (NoMask and no NoMask) cannot be in the same
// macro
if (next_inst.getMaskCtrl() != dpas.getMaskCtrl())
return true;
// DPAS with different execution mask cannot be in the same macro
if (next_inst.getChannelOffset() != dpas.getChannelOffset())
return true;
// dpas with different depth should not be in the same macro
// Note that different repeat count is allowed in the same macro
uint32_t dpasSystolicDepth = GetDpasSystolicDepth(dpas.getDpasFc());
uint32_t nextSystolicDepth = GetDpasSystolicDepth(next_inst.getDpasFc());
if (dpasSystolicDepth != nextSystolicDepth)
return true;
// dpas in the same macro must have the same datatypes in all src and dst
assert(dpas.getSourceCount() == next_inst.getSourceCount());
for (size_t i = 0; i < dpas.getSourceCount(); ++i) {
if (dpas.getSource(i).getType() != next_inst.getSource(i).getType())
return true;
}
if (dpas.getDestination().getType() != next_inst.getDestination().getType())
return true;
return false;
}
// Marks the bytes of every register in the inclusive range
// [start_reg, upper_reg] in bit_set. A bound equal to max() means "no
// register", in which case nothing is set.
void DepSetBuilder::DpasMacroBuilder::setBits(BitSet<> &bit_set,
                                              uint32_t start_reg,
                                              uint32_t upper_reg) const {
  const uint32_t noReg = std::numeric_limits<uint32_t>::max();
  if (start_reg == noReg || upper_reg == noReg)
    return;

  for (uint32_t reg = start_reg; reg <= upper_reg; ++reg) {
    // byte address of the register within the GRF file
    const uint32_t regByteAddr = reg * m_dsBuilder.getGRF_BYTES_PER_REG();
    bit_set.set(regByteAddr, m_dsBuilder.getGRF_BYTES_PER_REG());
  }
}
// Sets every range of src_range in src_bits and dst_range in dst_bits.
// Bits already present in either set are left as they are.
void DepSetBuilder::DpasMacroBuilder::setDstSrcBits(
    const SrcRegRangeType &src_range, const DstRegRangeType &dst_range,
    BitSet<> &src_bits, BitSet<> &dst_bits) const {
  for (const auto &regs : src_range) {
    setBits(src_bits, regs.first, regs.second);
  }
  setBits(dst_bits, dst_range.first, dst_range.second);
}
// Returns true when the two register ranges share at least one register.
// Each range is expanded into a GRF-sized bit set and the sets are tested
// for intersection; a max() ("no register") range contributes no bits.
bool DepSetBuilder::DpasMacroBuilder::hasIntersect(
    const DepSet::RegRangeType &rr1, const DepSet::RegRangeType &rr2) const {
  BitSet<> firstBits(m_dsBuilder.getGRF_LEN());
  setBits(firstBits, rr1.first, rr1.second);

  BitSet<> secondBits(m_dsBuilder.getGRF_LEN());
  setBits(secondBits, rr2.first, rr2.second);

  return firstBits.intersects(secondBits);
}
// Returns true when rr1 and rr2 either do not overlap at all or cover
// exactly the same footprint. Returns false when they intersect but are not
// identical.
bool DepSetBuilder::DpasMacroBuilder::hasEntireOverlapOrNoOverlap(
    const DepSet::RegRangeType &rr1, const DepSet::RegRangeType &rr2) const {
  BitSet<> firstBits(m_dsBuilder.getGRF_LEN());
  BitSet<> secondBits(m_dsBuilder.getGRF_LEN());
  setBits(firstBits, rr1.first, rr1.second);
  setBits(secondBits, rr2.first, rr2.second);

  // no overlap
  if (!firstBits.intersects(secondBits))
    return true;
  // overlap: acceptable only when the two footprints are identical
  return firstBits.equal(secondBits);
}
// Checks whether a dpas instruction has an internal dst-to-src dependency,
// which disqualifies it from being in a macro.
//   - dst overlapping src1 or src2 is always an internal dependency;
//   - a null src0 (range marked with max()) is ignored;
//   - for depth 8 dpas, dst and src0 may overlap only when their footprints
//     are entirely the same; for other depths any overlap counts.
bool DepSetBuilder::DpasMacroBuilder::hasInternalDep(
    const DstRegRangeType &dst_range, const SrcRegRangeType &src_range,
    bool isDepth8) const {
  if (hasIntersect(dst_range, src_range[1]) ||
      hasIntersect(dst_range, src_range[2]))
    return true;

  // if src0 is null, there is nothing more to check
  const bool src0IsNull =
      src_range[0].first == std::numeric_limits<uint32_t>::max();
  if (src0IsNull)
    return false;

  return isDepth8 ? !hasEntireOverlapOrNoOverlap(dst_range, src_range[0])
                  : hasIntersect(dst_range, src_range[0]);
}
std::pair<DepSet *, DepSet *> DepSetBuilder::createDPASSrcDstDepSet(
const InstList &insList, InstListIterator instIt,
const InstIDs &inst_id_counter, size_t &dpasCnt,
SWSB_ENCODE_MODE enc_mode) {
// create DepSet for input
DepSet *inps = new DepSet(inst_id_counter, *this);
mAllDepSet.push_back(inps);
inps->setDepType(DEP_TYPE::READ);
setDEPPipeClass(enc_mode, *inps, **instIt, mPlatformModel);
// create DepSet for output
DepSet *oups = new DepSet(inst_id_counter, *this);
mAllDepSet.push_back(oups);
oups->setDepType(DEP_TYPE::WRITE);
setDEPPipeClass(enc_mode, *oups, **instIt, mPlatformModel);
// identify dpas macro
DpasMacroBuilder dmb(*this, mPlatformModel, insList, instIt, *inps, *oups);
const Instruction &lastDpas = dmb.formMacro(dpasCnt);
// let the last instruciton in the macro represent this DepSet
inps->m_instruction = &lastDpas;
oups->m_instruction = &lastDpas;
return std::make_pair(inps, oups);
}
// Creates the READ DepSet for instruction `i`, appends it to mAllDepSet and
// fills in its flag, send-descriptor (send formats only) and source
// dependencies. Returns the new set.
DepSet *DepSetBuilder::createSrcDepSet(const Instruction &i,
                                       const InstIDs &inst_id_counter,
                                       SWSB_ENCODE_MODE enc_mode) {
  DepSet *srcSet = new DepSet(inst_id_counter, *this);
  mAllDepSet.push_back(srcSet);

  srcSet->m_instruction = &i;
  srcSet->setDepType(DEP_TYPE::READ);
  setDEPPipeClass(enc_mode, *srcSet, i, mPlatformModel);

  srcSet->setInputsFlagDep();
  const bool isSend = i.getOpSpec().isAnySendFormat();
  if (isSend)
    srcSet->setInputsSendDescDep();
  srcSet->setInputsSrcDep();
  return srcSet;
}
// Adds the whole of GRF register `reg` (from byte offset 0 for the full
// register size) to this set's footprint.
void DepSet::addGrf(size_t reg) {
  const auto wholeRegBytes = m_DB.getGRF_BYTES_PER_REG();
  addGrfBytes(reg, 0, wholeRegBytes);
}
void DepSet::setInputsFlagDep() {
// does it read the flag register
// predication does this
// conditional modifier on 'sel' does this
const Predication &pred = m_instruction->getPredication();
const FlagModifier fm = m_instruction->getFlagModifier();
bool readsFlagRegister =
pred.function != PredCtrl::NONE ||
m_instruction->getOp() == Op::SEL && fm != FlagModifier::NONE;
if (readsFlagRegister) {
// add the ARF offset from ExecMaskOffset
// E.g.
// (f1.0) op (16|M16) ...
// is touching f1.1
const RegRef &fr = m_instruction->getFlagReg();
size_t fByteOff =
(size_t)fr.regNum * m_DB.getARF_F_BYTES_PER_REG() +
(size_t)fr.subRegNum * 2; // FIXME: magic number (needs some thought
// should be bytes per subreg)
size_t execOff =
4 * (static_cast<size_t>(m_instruction->getChannelOffset()));
fByteOff += execOff / 8; // move over by ARF offset
size_t execSize = static_cast<size_t>(m_instruction->getExecSize());
size_t addr = (size_t)m_DB.getARF_F_START() + fByteOff;
addFBytes(addr, execSize);
m_bucketList.push_back(addr / m_DB.getBYTES_PER_BUCKET());
}
}
// Adds read dependencies for the message descriptor and the extended
// message descriptor of a send-format instruction, for whichever of the two
// is held in a register. Immediate descriptors add nothing.
void DepSet::setInputsSendDescDep() {
  IGA_ASSERT(m_instruction->getOpSpec().isAnySendFormat(),
             "DepSet::setInputsSendDescDep: must be send format");

  // register footprint of desc, when it is a register
  auto msgDesc = m_instruction->getMsgDescriptor();
  if (msgDesc.isReg()) {
    addABytesAndBukets(msgDesc.reg.regNum);
  }

  // register footprint of exdesc, when it is a register
  auto extMsgDesc = m_instruction->getExtMsgDescriptor();
  if (extMsgDesc.isReg()) {
    addABytesAndBukets(extMsgDesc.reg.regNum);
  }
}
void DepSet::setInputsSrcDep() {
uint32_t execSize = static_cast<uint32_t>(m_instruction->getExecSize());
// mac/mach has implicitly read to acc0
if (m_instruction->getOp() == Op::MAC || m_instruction->getOp() == Op::MACH ||
m_instruction->getOp() == Op::MACL) {
setSrcRegion(RegName::ARF_ACC, RegRef(0, 0), Region::SRC110, execSize,
16); // assume it's :w, though for acc access it actually does
// not matter, because its footprint will always count as
// acc0/acc1 pair
}
// For the instruction having no srcs, we still need to add ARF_CR to mark the
// dependency This is for the case that:
// and (1|M0) cr0.0<1>:ud cr0.0<0;1,0>:ud 0xFFFFFFCF:ud
// {A@1} nop {A@1}
// It's requested that the instruction following the one having architecture
// register (CR/CE/SR) access, It must mark to sync with all pipes, even if
// it's a nop. nop has no src and dst so we mark it here to force setting swsb
// if required
//