-
Notifications
You must be signed in to change notification settings - Fork 155
/
SWSBSetter.cpp
1363 lines (1246 loc) · 60.2 KB
/
SWSBSetter.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "RegDeps.hpp"
#include "Traversals.hpp"
#include "BitSet.hpp"
#include <iterator>
#include <limits>
using namespace iga;
/**
* RAW: R kill W R-->live explicit dependence
* WAW: different pipelines W2 kill W1 W2-->live explicit dependence
* WAR: different pipelines W kill R W-->live explicit dependence
* WAR: same pipeline W kill R W-->live implicit dependence
* RAR: same pipeline R2 kill R1 R2-->live no dependence
* RAR: different pipelines R1,R2-->live no dependence
*
* Different pipeline
* send, math, control flow, long/short (type df)
*
* add (8) r10 r20 r30
* add (8) r11 r21 r22
* if (..)
* // if instruction doesn't count in if calculations, but it takes about 6
* // cycles to resolve for fall through can treat it as continue BB.
* // Only when this BB has one predecessor
* add r40 r10 r50 {@2}
* else
* add r60 r70 r80
* endif
* //Both control flows converge on this. Conservative analysis start with 1.
* //By the time jmp happens counter should be at 0 anyway.
* add r90 r100 r110 {@1}
* add r91 r101 r111 {@2}
*
*
* Types of Dependencies
* dst src0 src1
* grf ind grf // set distance to 1. If SBID list not empty insert test instruction. Optimization if SBID == 1 AND grf depends on it, set SBID, clear SBIDList
*/
/**
Bucket - represents a GRF line
ID - sequential numbering of instructions. Resets at 0 with new BB
Implicit assumption. Various data structures have pointers to Dependencies.
For each BB scan instructions down
if new BB
reset buckets //can use bit mask
reset distanceTracker
reset ID
For each instruction
Calculate dependency on srcs and destination
if currDistance < MAX_DIST
record distance = currDistance // for a case where we at new BB
For each dependency and bucket it touches look in to a bucket
if bucket not empty
find potential dependencies //bit mask intersects
for each dependency found
if appropriate (WAW, RAR, RAW) Dependence exists
Clear dependency bits from bucket dependency
if dep empty remove from bucket
if DistanceDependency //no out of order
if instDistance > (currDistance - depID)
//We found dependence closer
record distance = currDistance - depID //CurrDistance > depID AND min(currDist - depID, 1)
else //sbid
record SBID ID
if dependencyRecord NOT empty
Generate appropriate SWSB/test instruction
IF SBID
if all dependencies are clear
add SBID to free list
remove entry SBID -> dependencies
Remove MAX_DIST DEP from buckets
Add current instruction Dependencies to buckets
if instruction isVariableExecTime //send, math.idiv
if freeSBITLIst IS empty
pick one SBID
generate test instruction
move SBID to free list
clear dependency from bucket/sbidList
assign SBID from free list
if end of block AND SBID list NOT empty
generate test instructions
*/
#include "SWSBSetter.hpp"
/*
WAW
explicit dependence
math.fc (except idiv) r10 ...
add r10 ....
add r10 ... //long: type DF/Q
add r10 ... //short:
WAW
no dependence
add r10 ...
add r10 ...
Math.sin r10 r20 r30
Math.cos r20 r40 r50
Not required - same pipe
Math.sin r20 r10 r30
Math.cos r20 r40 r50
Not required - same pipe
FPU_long r20 r10 r30
Math.sin r20 r40 r50
Explicit dep required as math can overtake FPU_long - since they are in different pipes.
RAW
add r10 ...
add r20 ...
add ... r20 ... {@1}
add ... r10 {@3} <--- technically speaking this dependency is not necessary
since they are in same pipe and previous instruction will stall
so last instruction dependence is cleared.
But in terms of runtime there is no impact so not worth special handling
assuming two grfs are written/read
send r10
send r11
add (16) ... r10 ...
second send has dependency on first send
add has dependency on second send
if sends written 1 grf, and add still read two grfs it will have dependence on both sends
send r10 //set$1 writes r10/r11
add(8) r10 {$1.dst}
add(8) r11 {}
*/
// Detaches depMatch from every bucket it is registered in and then resets
// depMatch itself.
//
// A bucket slot is cleared only when it refers to the same instruction (same
// global instruction id) AND the same dependency type as depMatch. Matching on
// register-footprint intersection is deliberately NOT used: when two
// instructions touch the same register (e.g. two consecutive writes to r0),
// the dependency of the first one may already be gone, and an intersection
// test would then wrongly remove the entry of the second one.
void SWSBAnalyzer::clearDepBuckets(DepSet &depMatch)
{
    for (auto bucketID : depMatch.getBuckets()) {
        auto &bucket = m_buckets[bucketID];
        // slot count is sampled once, before any slot is cleared
        const auto slotCount = bucket.getNumDependencies();

        for (uint32_t slot = 0; slot < slotCount; ++slot) {
            DepSet *candidate = bucket.getDepSet(slot);
            if (!candidate) {
                continue; // empty slot
            }
            if (depMatch.getInstGlobalID() != candidate->getInstGlobalID()) {
                continue; // belongs to another instruction
            }
            if (candidate->getDepType() != depMatch.getDepType()) {
                continue; // same instruction, but the other (read/write) side
            }
            bucket.clearDepSet(slot);
        }
    }

    depMatch.reset();
}
/**
* This function takes in a current instruction dependency,
* either SRC or DST.
* It then checks it against previous dependencies.
* It sets the minimum valid distance
* and creates an active list of SBIDs this instruction depends on.
* It clears and removes previous dependencies.
* The approach is bucket based.
* Each bucket is one GRF.
* So if an instruction writes into more than one GRF then multiple buckets will have the dependency.
*/
// Compares one dependency set of the current instruction against every
// dependency set stored in the buckets it touches and accumulates the
// synchronization the current instruction needs.
//
// currDep                  - dependency set (src or dst side) being checked
// distanceDependency       - in/out: minDist and distType are updated whenever
//                            an in-order producer has to be waited on
// currInst                 - instruction being processed; used for the warning
//                            PC, for its source operand types, and forwarded to
//                            setSbidDependency
// activeSBID               - in/out: filled (through setSbidDependency) with the
//                            SBIDs of out-of-order producers
// needSyncForShootDownInst - out: cleared on entry, may be raised by
//                            setSbidDependency
void SWSBAnalyzer::calculateDependence(DepSet &currDep, SWSB &distanceDependency,
const Instruction &currInst, std::vector<SBID>& activeSBID, bool &needSyncForShootDownInst)
{
needSyncForShootDownInst = false;
auto currDepType = currDep.getDepType();
auto currDepPipe = currDep.getDepPipe();
for (auto bucketID : currDep.getBuckets())
{
//iterates over Dependencies in a GRF bucket
//Assumption there shouldn't be more then 1-2
Bucket* bucket = &m_buckets[bucketID];
size_t numDepSets = bucket->getNumDependencies();
// The slots are visited from the last one down to the first one
// (index = numDepSets - 1 - i).
for (uint32_t i = 0; i < numDepSets; ++i)
{
uint32_t index = static_cast<uint32_t>(numDepSets -1 - i);
auto dep = bucket->getDepSet(index);
// "Always interfere" entries force a sync regardless of whether the
// register footprints overlap.
if (dep && (dep->getDepType() == DEP_TYPE::WRITE_ALWAYS_INTERFERE ||
dep->getDepType() == DEP_TYPE::READ_ALWAYS_INTERFERE))
{
// force to sync with dep
if (dep->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
{
setSbidDependency(*dep, currInst, needSyncForShootDownInst, activeSBID);
}
else
{
// Set to sync with all in-order-pipes. WRITE/READ_ALWAYS_INTERFERE
// could be used to mark arf dependency, which is required to be all pipes
// instead of dep's pipe only
distanceDependency.minDist = 1;
if (getNumOfDistPipe() == 1)
distanceDependency.distType = SWSB::DistType::REG_DIST;
else
distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
bucket->clearDepSet(index);
}
}
// NOTE(review): dep is still dereferenced below even when its slot was just
// cleared above; presumably clearDepSet only detaches the slot and does not
// destroy the DepSet - confirm.
//See if anything matches for this GRF bucket.
if (dep && dep->getBitSet().intersects(currDep.getBitSet()))
{
/*
* RAW: R kill W R-->live explict dependence
* WAW: different pipelines W2 kill W1 W2-->live explict dependence
* WAR: different pipelines W kill R W-->live explict dependence
* WAR: same pipeline W kill R W-->live implict dependence
* AR: sample pipeline R2 kill R1 R2-->live no dependence
* RAR: different pipelines R1,R2-->live no dependence
*/
//RAW: R kill W R-->live explict dependence
DEP_TYPE prevDepType = dep->getDepType();
DEP_PIPE prevDepPipe = dep->getDepPipe();
DEP_CLASS prevDepClass = dep->getDepClass();
// Send with different SFID could write to different pipes
bool sendInDiffPipe = false;
if (dep->getInstruction()->getOpSpec().isSendFamily() &&
currDep.getInstruction()->getOpSpec().isSendFamily())
{
sendInDiffPipe =
(dep->getInstruction()->getSendFc() !=
currDep.getInstruction()->getSendFc());
// for send in unknown pipe, always treated as different pipe
if (!sendInDiffPipe) {
sendInDiffPipe =
dep->getDepPipe() == DEP_PIPE::SEND_UNKNOWN ||
currDep.getDepPipe() == DEP_PIPE::SEND_UNKNOWN;
}
}
bool isRAW = currDepType == DEP_TYPE::READ &&
prevDepType == DEP_TYPE::WRITE;
//WAW: different pipelines W2 kill W1 W2-->live explict dependence
bool isWAW = (currDepType == DEP_TYPE::WRITE &&
prevDepType == DEP_TYPE::WRITE &&
(currDepPipe != prevDepPipe || sendInDiffPipe));
//WAR: different pipelines W kill R W-->live explict dependence
bool isWAR = currDepType == DEP_TYPE::WRITE &&
prevDepType == DEP_TYPE::READ &&
(currDepPipe != prevDepPipe || sendInDiffPipe);
// A write after an out-of-order write is a hazard irrespective of the pipes.
bool isWAW_out_of_order
= (currDepType == DEP_TYPE::WRITE &&
prevDepType == DEP_TYPE::WRITE &&
prevDepClass == DEP_CLASS::OUT_OF_ORDER);
// Special case handling for acc/flag dependency:
// if the RAW dependency on acc and it's whithin the same pipe,
// HW can handle it that we don't need to set swsb
if (isRAW && currDepPipe == prevDepPipe) {
// Each helper below tests whether in_dep and currDep overlap inside one
// register-file range of the bit set (GRF, address, acc, flag, special).
auto check_dep_reg = [&](DepSet* in_dep, uint32_t reg_start, uint32_t reg_len) {
return in_dep->getBitSet().intersects(currDep.getBitSet(),
reg_start, reg_len);
};
auto has_grf_dep = [&](DepSet* in_dep) {
return check_dep_reg(in_dep, m_DB->getGRF_START(), m_DB->getGRF_LEN());
};
auto has_arf_a_dep = [&](DepSet* in_dep) {
return check_dep_reg(in_dep, m_DB->getARF_A_START(), m_DB->getARF_A_LEN());
};
auto has_acc_dep = [&](DepSet* in_dep) {
return check_dep_reg(in_dep, m_DB->getARF_ACC_START(), m_DB->getARF_ACC_LEN());
};
auto has_flag_dep = [&](DepSet* in_dep) {
return check_dep_reg(in_dep, m_DB->getARF_F_START(), m_DB->getARF_F_LEN());
};
auto has_sp_dep = [&](DepSet* in_dep) {
return check_dep_reg(in_dep, m_DB->getARF_SPECIAL_START(), m_DB->getARF_SPECIAL_LEN());
};
// is acc dependecy
if (has_acc_dep(dep)) {
// and no dependency on other registers
if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_flag_dep(dep) || has_sp_dep(dep)))
isRAW = false;
}
// is flag dependency
if (has_flag_dep(dep)) {
// and no dependency on other registers
if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_acc_dep(dep) || has_sp_dep(dep)))
isRAW = false;
// flag and acc only
if (has_acc_dep(dep))
if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_sp_dep(dep)))
isRAW = false;
}
}
if (isWAR ||
isWAW ||
isRAW ||
isWAW_out_of_order)
{
// clearing previous dependence
if (dep->getBitSet().empty())
{
m_errorHandler.reportWarning(
currInst.getPC(),
"Dependency in bucket with no bits set");
}
// removing from bucket if there is nothing
// NOTE(review): the start offset uses the literal 32 while the length comes
// from getGRF_BYTES_PER_REG(); confirm both agree on every supported platform.
if (!dep->getBitSet().testAny(bucketID * 32, m_DB->getGRF_BYTES_PER_REG()))
{
bucket->clearDepSet(index);
}
if (prevDepClass == DEP_CLASS::IN_ORDER)
{
if (getNumOfDistPipe() == 1) {
// FOR WAW if PREV is SHORT and curr is LONG then write will finish
// before current write, no need to set swsb
bool isWAWHazard = (prevDepPipe == DEP_PIPE::SHORT && currDepPipe == DEP_PIPE::LONG ||
prevDepPipe == DEP_PIPE::SHORT && currDepPipe == DEP_PIPE::SHORT)
&& isWAW;
// require swsb for all the other kinds of dependency
if (!isWAWHazard)
{
// setting minimum distance
uint32_t newDistance = m_InstIdCounter.inOrder - dep->getInstIDs().inOrder;
// minDist == 0 means "no distance recorded yet"
distanceDependency.minDist =
distanceDependency.minDist == 0 ?
newDistance :
std::min(distanceDependency.minDist, newDistance);
// clamp the distance to max distance
distanceDependency.minDist = std::min(distanceDependency.minDist, (uint32_t)MAX_VALID_DISTANCE);
distanceDependency.distType = SWSB::DistType::REG_DIST;
}
} else {
// For multiple in-order pipeline architecuture, all cases should be considered
// The distance is depended on the previous instruction's pipeline
uint32_t newDistance = 0;
SWSB::DistType newDepPipe = SWSB::DistType::NO_DIST;
// The distance is measured with the per-pipe counter of the producer's pipe.
switch (prevDepPipe) {
case DEP_PIPE::FLOAT:
newDistance = m_InstIdCounter.floatPipe - dep->getInstIDs().floatPipe;
newDepPipe = SWSB::DistType::REG_DIST_FLOAT;
break;
case DEP_PIPE::INTEGER:
newDistance = m_InstIdCounter.intPipe - dep->getInstIDs().intPipe;
newDepPipe = SWSB::DistType::REG_DIST_INT;
break;
case DEP_PIPE::LONG64:
newDistance = m_InstIdCounter.longPipe - dep->getInstIDs().longPipe;
newDepPipe = SWSB::DistType::REG_DIST_LONG;
break;
case DEP_PIPE::MATH_INORDER:
newDistance = m_InstIdCounter.mathPipe - dep->getInstIDs().mathPipe;
newDepPipe = SWSB::DistType::REG_DIST_MATH;
break;
default:
IGA_ASSERT(0, "Unsupported DEP_PIPE for in-order instructions");
break;
}
// the instruction already has dependency to others
if (distanceDependency.minDist) {
newDistance = std::min(distanceDependency.minDist, newDistance);
// if the type is REG_DIST_ALL or is the same with the new pipe type,
// then remains it. Otherwise update the swsb type
if ((distanceDependency.distType != newDepPipe) && (distanceDependency.distType != SWSB::DistType::REG_DIST_ALL)) {
// get the pipe_type from opnd type
auto op_pipe_type = [](Type op_type) {
if (TypeIs64b(op_type))
return SWSB::DistType::REG_DIST_LONG;
if (TypeIsFloating(op_type))
return SWSB::DistType::REG_DIST_FLOAT;
return SWSB::DistType::REG_DIST_INT;
};
// check if the given pipe type is the same with one of the src type
auto haveTypeInSrc = [&](SWSB::DistType swsb_type) {
// HW restriction (WA): Cannot use @1 on XeHPC-XT, must explicitly set pipe type
// A@1 or L@1, ... Always return false so that we won't use @1
// Note that if there isn't this restriction, we should also update op_pipe_type
// for FourDistPipeReduction mode that non-float-64-bit type should be in INT pipe
if (m_swsbMode == SWSB_ENCODE_MODE::FourDistPipeReduction) {
return false;
}
for (size_t i = 0; i < currInst.getSourceCount(); ++i) {
if (op_pipe_type(currInst.getSource(i).getType()) == swsb_type)
return true;
}
return false;
};
if ((distanceDependency.distType == SWSB::DistType::REG_DIST_MATH) ||
(newDepPipe == SWSB::DistType::REG_DIST_MATH)) {
// either current of prev dep is MATH, it's not possible to combine them to REG_DIST
distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
} else if ((distanceDependency.distType != SWSB::DistType::REG_DIST)) {
// check if both previous and current dep pipe can be satisfied by currInst src type
if (haveTypeInSrc(distanceDependency.distType) && haveTypeInSrc(newDepPipe))
distanceDependency.distType = SWSB::DistType::REG_DIST;
else
distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
} else {
// if previous one is REG_DIST, set the type to REG_DIST_ALL if
// current one cannot be satisfied by src type
if (!haveTypeInSrc(newDepPipe))
distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
}
}
} else {
// first distance recorded for this instruction: take the producer's pipe
distanceDependency.distType = newDepPipe;
}
assert(distanceDependency.distType != SWSB::DistType::NO_DIST);
// clamp the distance to max distance
distanceDependency.minDist = std::min(newDistance, (uint32_t)MAX_VALID_DISTANCE);
} // end of if (getNumOfDistPipe() == 1) ... else
// clear this instruction's dependency since it is satisfied
clearDepBuckets(*dep);
// clear its companion because when an in-order instruction is synced, both its
// input and output dependency are satisfied. The only case is that if it has
// read/write_always_interfere dependency, it should be reserved.
// The restriction is that:
// When certain Arch Registers (sr, cr, ce) are used,
// the very next instruction requires dependency to be set on all pipes {A@1}
// e.g.
// mov (1|M0) r104.0<1>:ud sr0.1<0;1,0>:ud
// cmp(16 | M0) (ne)f0.0 null:ud r104.0<0; 1, 0> : ub r62.4<0; 1, 0> : uw
// A@1 is required for cmp instead of I@1
if (dep->getCompanion() != nullptr) {
// In the case that this DepSet is generated from math_wa_info, it won't have companion
if (dep->getCompanion()->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
dep->getCompanion()->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE) {
clearDepBuckets(*dep->getCompanion());
}
}
} // end of if (prevDepClass == DEP_CLASS::IN_ORDER)
else if (prevDepClass == DEP_CLASS::OUT_OF_ORDER) // prev is out of order
{
setSbidDependency(*dep, currInst, needSyncForShootDownInst, activeSBID);
}
// for the instruction in "OTHER" DEP_CLASS, such as sync, we don't need
// to consider their dependency that is implied by hardware
}
}
}
}
}
// Records that currInst has to wait on the SBID (token) owned by the
// out-of-order dependency `dep`.
//
// The latency of an out-of-order producer is unknown, so the consumer must
// test the producer's token. One instruction may end up waiting on several
// tokens, e.g.
//     send r10
//     send r20
//     send r30
//     ...
//     add  r10 r20 r30
//
// dep                      - producer dependency; it is removed from all
//                            buckets here
// currInst                 - consumer instruction
// needSyncForShootDownInst - raised when currInst is predicated or its exec
//                            size / channel offset differs from the producer's
// activeSBID               - in/out list of tokens currInst depends on; holds
//                            at most one entry per token id and is consumed
//                            later by processActiveSBID
void SWSBAnalyzer::setSbidDependency(DepSet& dep, const Instruction& currInst,
    bool& needSyncForShootDownInst, std::vector<SBID>& activeSBID)
{
    SBID pendingToken = dep.getSBID();
    if (pendingToken.isFree) {
        m_errorHandler.reportError((int)dep.getInstGlobalID(), "SBID SHOULDN'T BE FREE!");
    }

    // drop the producer from every bucket it is registered in
    clearDepBuckets(dep);

    // If currInst may be shot down, a separate sync has to preserve the token
    // wait so that clearing the dependency above stays safe.
    const bool mayBeShotDown =
        currInst.hasPredication() ||
        (currInst.getExecSize() != dep.getInstruction()->getExecSize()) ||
        (currInst.getChannelOffset() != dep.getInstruction()->getChannelOffset());
    if (mayBeShotDown) {
        needSyncForShootDownInst = true;
    }

    // remember whether the wait is on the producer's read or on its write
    pendingToken.dType = dep.getDepType();

    // Keep a single entry per token id. A write completes after the read, so
    // an entry that currently waits on READ takes over the new type (which
    // either upgrades it to a write or leaves it a read); any other entry is
    // left untouched.
    for (auto& tracked : activeSBID) {
        if (tracked.sbid != pendingToken.sbid) {
            continue;
        }
        if (tracked.dType == DEP_TYPE::READ) {
            tracked.dType = pendingToken.dType;
        }
        return; // already tracked, nothing to append
    }

    // first time this token is seen for currInst
    activeSBID.push_back(pendingToken);
}
// Inserts a "sync all reads" instruction followed by a "sync all writes"
// instruction into bb. Both are created with a default-constructed SWSB.
// When insertPoint is the end of the block the pair is appended; otherwise
// both are inserted in front of insertPoint (read sync first).
void SWSBAnalyzer::insertSyncAllRdWr(InstList::iterator insertPoint, Block *bb)
{
    SWSB noDependency;
    auto syncAllRd = m_kernel.createSyncAllRdInstruction(noDependency);
    auto syncAllWr = m_kernel.createSyncAllWrInstruction(noDependency);

    const bool appendAtEnd = (insertPoint == bb->getInstList().end());
    if (!appendAtEnd) {
        bb->insertInstBefore(insertPoint, syncAllRd);
        bb->insertInstBefore(insertPoint, syncAllWr);
        return;
    }

    auto &blockInsts = bb->getInstList();
    blockInsts.push_back(syncAllRd);
    blockInsts.push_back(syncAllWr);
}
//TODO this should also clear up grf dependency to handle this case:
/*
call (16|M0) r8.0:ud 32
sendc.rc (16|M0) null r118 null 0x0 0x140B1000 {} // wr:10h, rd:0, Render Target Write msc:16, to #0
(W) mov (1|M0) a0.0<1>:ud r7.0<0;1,0>:ud
sendc.rc (16|M0) null r100 null 0x0 0x140B1000 {} // wr:10h, rd:0, Render Target Write msc:16, to #0
sendc.rc (16|M0) null r118 null 0x0 0x140B1000 {} // wr:10h, rd:0, Render Target Write msc:16, to #0
(W) mov (16|M0) r118.0<1>:ud r6.0<8;8,1>:ud
(W) send.dc0 (16|M0) r38 r118 null 0x0 a0.0
ret (16|M0)
Right now mov will have a false dependence on the first send.
*/
// Releases every SBID and, if any of them was still in use, inserts the
// sync-all-read / sync-all-write pair at insertPoint so that outstanding
// dependencies cannot leak past this point.
//
// insertPoint - where the sync pair goes (see insertSyncAllRdWr)
// lastInst    - last instruction of the block, may be null; when it is a send
//               carrying EOT no sync is generated because hardware takes care
//               of it
// bb          - block that receives the sync pair
void SWSBAnalyzer::clearSBIDDependence(InstList::iterator insertPoint, Instruction *lastInst, Block *bb)
{
    // Reset every slot, remembering whether any token was still allocated:
    // such a token may guard a dependency used outside of this basic block.
    bool anyTokenBusy = false;
    for (uint32_t id = 0; id < m_SBIDCount; ++id) {
        auto &slot = m_freeSBIDList[id];
        anyTokenBusy = anyTokenBusy || !slot.isFree;
        slot.reset();
    }

    const bool blockEndsWithEOT =
        lastInst &&
        lastInst->getOpSpec().isSendFamily() &&
        lastInst->hasInstOpt(InstOpt::EOT);

    if (anyTokenBusy && !blockEndsWithEOT) {
        insertSyncAllRdWr(insertPoint, bb);
    }
}
// Keeping track of dependencies that need to be cleared because they are no longer relevant
// right now each BB ends with control flow instruction, and we reset at each BB
// Registers the (input, output) dependency pair of an in-order instruction in
// m_distanceTracker and retires older pairs of the same pipe that are already
// far enough behind.
//
// Pairs whose input is not IN_ORDER are ignored. The very first pair after
// m_initPoint was set is only recorded. For every later pair, each tracked
// node on the same pipe whose instruction-id distance to `input` has reached
// the latency of that pipe is removed from the tracker and both of its
// dependency sets are removed from the buckets.
void SWSBAnalyzer::clearBuckets(DepSet* input, DepSet* output) {
    if (input->getDepClass() != DEP_CLASS::IN_ORDER)
        return;

    // the new pair is always tracked
    m_distanceTracker.emplace_back(input, output);

    if (m_initPoint) {
        // first tracked instruction: nothing older to retire
        m_initPoint = false;
        return;
    }

    // Instruction id of a dependency set as counted on the given pipe; with a
    // single distance pipe the global in-order id is used.
    auto pipeLocalId = [&](DEP_PIPE pipe, DepSet& depSet) -> uint32_t {
        if (getNumOfDistPipe() == 1)
            return depSet.getInstIDs().inOrder;

        switch (pipe) {
        case DEP_PIPE::FLOAT:
            return depSet.getInstIDs().floatPipe;
        case DEP_PIPE::INTEGER:
            return depSet.getInstIDs().intPipe;
        case DEP_PIPE::LONG64:
            return depSet.getInstIDs().longPipe;
        case DEP_PIPE::MATH_INORDER:
            return depSet.getInstIDs().mathPipe;
        default:
            IGA_ASSERT(0, "SWSB: unhandled in-order DEP_PIPE for XeHP+ encoding");
            break;
        }
        return (uint32_t)0;
    };

    const DEP_PIPE currentPipe = input->getDepPipe();

    // maximum back-to-back latency of the current pipe
    size_t retireDistance;
    if (currentPipe == DEP_PIPE::LONG64) {
        retireDistance = m_LatencyLong64Pipe;
    } else if (currentPipe == DEP_PIPE::MATH_INORDER) {
        retireDistance = m_LatencyInOrderMath;
    } else {
        retireDistance = m_LatencyInOrderPipe;
    }

    m_distanceTracker.remove_if(
        [&](const distanceTrackerNode& tracked) {
            // nodes of other pipes are not affected
            if (tracked.input->getDepPipe() != currentPipe)
                return false;

            const size_t currentId = pipeLocalId(currentPipe, *input);
            const size_t trackedId = pipeLocalId(currentPipe, *tracked.input);
            if ((currentId - trackedId) < retireDistance)
                return false; // latency not yet covered, keep tracking

            // latency satisfied: the pair no longer needs to be considered
            clearDepBuckets(*tracked.input);
            clearDepBuckets(*tracked.output);
            return true;
        }
    );
}
// Turns the list of tokens the current instruction depends on into SWSB
// information.
//
// The first token that is still allocated is placed in distanceDependency;
// every further one is carried by a sync.nop inserted in front of instIter.
// Afterwards, if the resulting distance/token combination does not verify for
// this instruction, the token is moved out into one more sync.nop.
//
// Tokens that are already free are skipped. This happens, for instance, when
// the producing send lives in another basic block: all SBIDs were released
// (and syncs generated) at the end of that block, yet the buckets still report
// the dependency here.
//
// distanceDependency - in/out SWSB being assembled for the instruction
// input              - input dependency set of the instruction
// bb / instIter      - block and position in front of which syncs are inserted
// activeSBID         - tokens collected by setSbidDependency
void SWSBAnalyzer::processActiveSBID(SWSB &distanceDependency, const DepSet* input,
    Block *bb, InstList::iterator instIter, std::vector<SBID>& activeSBID)
{
    // inserts "sync.nop {token}" right before the current instruction
    auto emitTokenSync = [&](SWSB::TokenType tokenKind, const auto tokenId) {
        SWSB sync_swsb(SWSB::DistType::NO_DIST, tokenKind, 0, tokenId);
        auto nopInst = m_kernel.createSyncNopInstruction(sync_swsb);
        bb->insertInstBefore(instIter, nopInst);
    };

    for (const auto &pending : activeSBID) {
        if (m_freeSBIDList[pending.sbid].isFree) {
            continue;
        }

        const bool waitsOnRead =
            pending.dType == DEP_TYPE::READ ||
            pending.dType == DEP_TYPE::READ_ALWAYS_INTERFERE;
        const SWSB::TokenType tokenKind =
            waitsOnRead ? SWSB::TokenType::SRC : SWSB::TokenType::DST;

        if (!waitsOnRead) {
            // The write is the last thing the producer does: once we wait on
            // it the read is done as well (but not vice versa), so the token
            // returns to the free pool and its dependencies are dropped.
            m_freeSBIDList[pending.sbid].reset();

            assert(m_IdToDepSetMap.find(pending.sbid) != m_IdToDepSetMap.end());
            auto &tokenDeps = m_IdToDepSetMap[pending.sbid];
            assert(tokenDeps.first->getDepClass() == DEP_CLASS::OUT_OF_ORDER);
            clearDepBuckets(*tokenDeps.first);
            clearDepBuckets(*tokenDeps.second);
        }

        // TODO: Is it safe to clear SBID here?
        if (distanceDependency.tokenType != SWSB::TokenType::NOTOKEN) {
            // the instruction already carries a token; use a sync for this one
            emitTokenSync(tokenKind, pending.sbid);
            continue;
        }
        distanceDependency.tokenType = tokenKind;
        distanceDependency.sbid = pending.sbid;
    }

    // If token and distance cannot be encoded together on this instruction,
    // move the token out into its own sync.
    const auto instKind = input->getInstruction()->getSWSBInstType(m_swsbMode);
    if (!distanceDependency.verify(m_swsbMode, instKind)) {
        emitTokenSync(distanceDependency.tokenType, distanceDependency.sbid);
        distanceDependency.tokenType = SWSB::TokenType::NOTOKEN;
        distanceDependency.sbid = 0;
    }
    assert(distanceDependency.verify(m_swsbMode, instKind));
}
// Returns the number of in-order distance pipes for the current SWSB encoding
// mode: 1 for SingleDistPipe, 3 for ThreeDistPipe, 4 for FourDistPipe and
// FourDistPipeReduction, and 0 for any other mode.
uint32_t SWSBAnalyzer::getNumOfDistPipe()
{
    uint32_t pipeCount = 0;
    switch (m_swsbMode) {
    case SWSB_ENCODE_MODE::SingleDistPipe:
        pipeCount = 1;
        break;
    case SWSB_ENCODE_MODE::ThreeDistPipe:
        pipeCount = 3;
        break;
    case SWSB_ENCODE_MODE::FourDistPipe:
    case SWSB_ENCODE_MODE::FourDistPipeReduction:
        pipeCount = 4;
        break;
    default:
        break;
    }
    return pipeCount;
}
// Advances the in-order instruction counters for an instruction issued on
// dep_pipe. The global in-order counter is always incremented; when more than
// one distance pipe exists, the counter of the matching pipe is incremented as
// well. Any pipe other than FLOAT/INTEGER/LONG64/MATH_INORDER trips an
// assertion in that case.
void SWSBAnalyzer::advanceInorderInstCounter(DEP_PIPE dep_pipe)
{
    ++m_InstIdCounter.inOrder;

    const bool hasPerPipeCounters = (getNumOfDistPipe() != 1);
    if (hasPerPipeCounters) {
        switch (dep_pipe) {
        case DEP_PIPE::FLOAT:
            ++m_InstIdCounter.floatPipe;
            break;
        case DEP_PIPE::INTEGER:
            ++m_InstIdCounter.intPipe;
            break;
        case DEP_PIPE::LONG64:
            ++m_InstIdCounter.longPipe;
            break;
        case DEP_PIPE::MATH_INORDER:
            ++m_InstIdCounter.mathPipe;
            break;
        default:
            IGA_ASSERT(0, "unhandled in-order DEP_PIPE for XE_HP encoding");
            break;
        }
    }
}
// Widens the dependency footprint of an instruction whose destination is a
// direct, byte-typed GRF operand.
//
// Such a write is a read-modify-write: the word is read, the byte is changed
// and the whole word is written back. To keep the logic simple the instruction
// is treated as reading and writing every GRF register it touches in full.
// The touched registers are taken from the bucket indices already recorded in
// `output`; for each of them the full register is added to both `input` and
// `output`, and the bucket is additionally registered on `input`.
// Instructions with any other kind of destination are left unchanged.
void SWSBAnalyzer::addRMWDependencyIfReqruied(DepSet& input, DepSet& output) {
    const Operand& dst = input.getInstruction()->getDestination();

    const bool isByteGrfDst =
        dst.getKind() == Operand::Kind::DIRECT &&
        dst.getDirRegName() == RegName::GRF_R &&
        TypeSizeInBitsWithDefault(dst.getType(), 32) == 8;
    if (!isByteGrfDst)
        return;

    for (const size_t bucketIdx : output.getBuckets()) {
        // buckets from the ARF_A start onwards are not GRF buckets
        if (bucketIdx < m_DB->getBucketStart(RegName::ARF_A)) {
            input.addGrf(bucketIdx);
            input.addToBucket((uint32_t)bucketIdx);
            output.addGrf(bucketIdx);
        }
    }
}
// Merges `swsb` into the SWSB already set on `inst`.
//
// inst    - instruction receiving the additional dependency
// swsb    - dependency to add (distance and/or token)
// block   - block containing inst; sync.nop instructions are inserted here
// inst_it - position of inst in block; syncs are inserted in front of it
//
// Distance: if inst has none, swsb's distance is taken. If both have one, the
// smaller distance wins and the type becomes REG_DIST_ALL unless both types
// are equal.
// Token: if inst has none, swsb's token is taken. If both have one and they
// differ (type or id), the new token is carried by a sync.nop instead.
// Finally, when the merged distance/token pair cannot be encoded on inst, the
// distance is moved out to a sync.nop and removed from inst.
void SWSBAnalyzer::addSWSBToInst(Instruction& inst,
    const SWSB& swsb,
    Block& block,
    InstListIterator inst_it)
{
    const SWSB existing = inst.getSWSB();
    SWSB new_swsb(existing);

    // handling distance
    if (swsb.hasDist()) {
        if (!existing.hasDist()) {
            new_swsb.distType = swsb.distType;
            new_swsb.minDist = swsb.minDist;
        } else {
            // for single dist pipe platform, distType must be REG_DIST, so won't
            // be set to REG_DIST_ALL
            new_swsb.distType = (existing.distType == swsb.distType) ?
                swsb.distType : SWSB::DistType::REG_DIST_ALL;
            new_swsb.minDist = std::min(existing.minDist, swsb.minDist);
        }
    }

    // handling token
    if (swsb.hasToken()) {
        if (!existing.hasToken()) {
            new_swsb.tokenType = swsb.tokenType;
            new_swsb.sbid = swsb.sbid;
        } else if ((existing.tokenType != swsb.tokenType) ||
                   (existing.sbid != swsb.sbid)) {
            // both have a token and they differ: a sync carries the new one;
            // identical tokens need nothing
            SWSB tmp_swsb(SWSB::DistType::NO_DIST, swsb.tokenType,
                0, swsb.sbid);
            Instruction* sync_inst = m_kernel.createSyncNopInstruction(tmp_swsb);
            block.insertInstBefore(inst_it, sync_inst);
        }
    }

    // check if the new swsb combination is valid, if not, move the dist out to a sync
    // FIXME: move the dist out here to let the sbid set on the instruction could have better
    // readability, but a potential issue is that A@1 is required to be set on the instruction having
    // architecture read/write. This case A@1 will be moved out from the instruction
    if (!new_swsb.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode))) {
        // The sync must carry the MERGED distance: that is exactly what gets
        // stripped from the instruction below. Using only the incoming swsb
        // here would silently drop a distance the instruction already had
        // (or replace it with a looser one).
        SWSB tmp_swsb(new_swsb.distType, SWSB::TokenType::NOTOKEN,
            new_swsb.minDist, 0);
        Instruction* sync_inst = m_kernel.createSyncNopInstruction(tmp_swsb);
        block.insertInstBefore(inst_it, sync_inst);
        new_swsb.distType = SWSB::DistType::NO_DIST;
        new_swsb.minDist = 0;
    }

    inst.setSWSB(new_swsb);
    IGA_ASSERT(inst.getSWSB().verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode)),
        "Invalid swsb dist/token combination after merge");
}
// True when the instruction is a SYNC whose sync function is NOP.
static bool isSyncNop(const Instruction &i) {
    if (!i.is(Op::SYNC)) {
        return false;
    }
    return i.getSyncFc() == SyncFC::NOP;
}
// Final clean-up over all blocks of the kernel, done in two passes:
//  1. For {Atomic} instructions with a byte-typed GRF destination (on models
//     with read-modify-write on byte dst), the SWSB of the instruction that
//     follows is merged onto the {Atomic} instruction, and any sync
//     instructions sitting between the two are moved in front of it.
//  2. sync.nop instructions whose token equals the token SET by a following
//     instruction have their SWSB cleared, and every sync.nop left without
//     SWSB is removed from the block.
void SWSBAnalyzer::postProcess()
{
// revisit all instructions
for (Block* bb : m_kernel.getBlockList())
{
InstList& instList = bb->getInstList();
for (auto inst_it = instList.begin(); inst_it != instList.end(); ++inst_it)
{
Instruction* inst = *inst_it;
// move all swsb set on the second instruction to the first for
// "instruction combined" case on byte type dst. e.g.
// (W) mov (32|M0) r13.0<2>:ub r11.0<1;1,0>:uw {Atomic}
// (W) mov (32|M0) r13.1<2>:ub r10.0<1;1,0>:uw
if (m_kernel.getModel().hasReadModifiedWriteOnByteDst() &&
inst->hasInstOpt(InstOpt::ATOMIC) &&
!inst->getOpSpec().isDpasFamily() &&
!inst->getOpSpec().isSendOrSendsFamily() &&
inst->getDestination().getDirRegName() == RegName::GRF_R &&
TypeSizeInBitsWithDefault(inst->getDestination().getType(), 32) == 8)
{
auto next_it = inst_it;
++next_it;
// NOTE(review): when asserts are compiled out, next_it is dereferenced
// below without an end() check; confirm an {Atomic} instruction can never
// be the last one in a block.
assert(next_it != instList.end());
Instruction* next_inst = *next_it;
// in case the next instructions have sync carrying its swsb, move
// sync to before current instruction
// - Make sure current inst is not the last inst other than sync
InstList sync_insts;
// collect the run of sync instructions that directly follows inst
while (next_inst->is(Op::SYNC)) {
sync_insts.push_back(next_inst);
++next_it;
if (next_it == instList.end())
break;
next_inst = *next_it;
}
if (next_it == instList.end()) {
// An unexpected instruction with {Atomic} set but has no following
// instruction that can be combined with it
assert(next_it != instList.end());
continue;
}
// - move sync to before current inst
if (!sync_insts.empty()) {
// erase the syncs in (inst_it, next_it) and re-insert them before inst_it
auto remove_start = inst_it;
++remove_start;
instList.erase(remove_start, next_it);
instList.insert(inst_it, sync_insts.begin(), sync_insts.end());
}
// the following instruction must not have Atomic set, or we do not
// know what should do
IGA_ASSERT((!next_inst->hasInstOpt(InstOpt::ATOMIC)),
"Atomic followed by Atomic on fixed latency instructions");
SWSB next_swsb = next_inst->getSWSB();
if (next_swsb.hasSWSB()) {
// merge onto the {Atomic} instruction and clear it on the follower
addSWSBToInst(*inst, next_swsb, *bb, inst_it);
next_inst->setSWSB(SWSB());
}
}
}
}
// revisit all instructions to remove redundant sync.nop
// sync.nop carry the sbid the same as the sbid set on the following instruction can be
// removed since it'll automatically be sync-ed when sbid is reused. For example:
// sync.nop null {$0.dst} // can be removed
// math.exp(8|M0) r12.0<1>:f r10.0<8;8,1>:f {$0}
for (Block* bb : m_kernel.getBlockList())
{
InstList& instList = bb->getInstList();
if (instList.empty())
continue;
auto inst_it = instList.begin();
// skip the first instruction, which must not be sync
++inst_it;
for (; inst_it != instList.end(); ++inst_it)
{
Instruction* inst = *inst_it;
if (isSyncNop(*inst))
continue;
SWSB cur_swsb = inst->getSWSB();
if (cur_swsb.hasToken() && (cur_swsb.tokenType == SWSB::TokenType::SET)) {
// iterate through the previous sync
auto sync_it = inst_it;
--sync_it;
// walk backwards over the sync.nop run in front of inst; the walk stops
// at the first non-sync.nop and never examines instList.begin() itself
while (sync_it != instList.begin()) {
Instruction* sync_inst = *sync_it;
if (!isSyncNop(*sync_inst))
break;
SWSB sync_swsb = sync_inst->getSWSB();
// if the sync has sbid set, it could be the reserved sbid for shoot down
// instructions, we should keep it.
if (sync_swsb.hasToken() && sync_swsb.tokenType != SWSB::TokenType::SET &&
sync_swsb.sbid == cur_swsb.sbid) {
// clean the swsb so that we can remove this instruction later
sync_inst->setSWSB(SWSB());
}
--sync_it;
}
}
}
// remove the redundant sync.nop (sync.nop with no swsb)
instList.remove_if([](const Instruction* inst) {
return isSyncNop(*inst) && !inst->getSWSB().hasSWSB();
});
}
}
SBID& SWSBAnalyzer::assignSBID(DepSet* input, DepSet* output, Instruction& inst, SWSB& distanceDependency,
InstList::iterator insertPoint, Block *curBB, bool needSyncForShootDown)
{
bool foundFree = false;
SBID *sbidFree = nullptr;
for (uint32_t i = 0; i < m_SBIDCount; ++i)
{
if (m_freeSBIDList[i].isFree)
{
foundFree = true;
sbidFree = &m_freeSBIDList[i];
m_freeSBIDList[i].sbid = i;
break;
}
}
// no free SBID.
if (!foundFree)
{
unsigned int index = (m_SBIDRRCounter++) % m_SBIDCount;