-
Notifications
You must be signed in to change notification settings - Fork 91
/
ar.c
1084 lines (1016 loc) · 57.7 KB
/
ar.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright (c) 1990-2024, Jsoftware Inc. All rights reserved. */
/* Licensed use only. Any other use is in violation of copyright. */
/* */
/* Adverbs: Reduce (Insert), Outer Product, and Fold */
#include "j.h"
#include "vasm.h"
#include "ve.h"
#include "vcomp.h"
#include "ar.h"
static DF1(jtreduce);
#define PARITYW(s) (s=BW>32?s^s>>32:s, s^=s>>16, s^=s>>8) // parity of a word full of Bs. Upper bits are garbage
// vdo for eq/ne. take parity of *x over n bytes, store into *z; invert if !pc; repeat for m cells
static void vdone(I m,I n,B*x,B*z,B pc){
I nfull=(n-1)>>LGSZI; I ndisc=(-n)&(SZI-1); // number of full Is, and number of bytes to discard from the next I
DQ(m, I s=pc^1; I *y=(I*)x; DQ(nfull, s^=*y++;); s^=(*y<<(ndisc<<3)); PARITYW(s); *z++=s; x+=n;); // parity, discarding trailing bytes of overfetch
}
#if SY_ALIGN
#define RBFXODDSIZE(pfx,bpfx) RBFXLOOP(C,bpfx)
#define REDUCECFX REDUCEBFX
#else
// m is # cells to operate on; n is # items in 1 such cell; d is # atoms in one such item
#define RCFXLOOP(T,pfx) \
{T* RESTRICT xx=(T*)x,* RESTRICT yy,*z0,* RESTRICT zz=(T*)z; \
q=d/sizeof(T); \
for(j=0;j<m;++j){ \
yy=xx; xx+=q; z0=zz; DQ(q, *zz++=pfx(*yy,*xx); ++xx; ++yy;); \
DQ(n-2, zz=z0; DQ(q, *zz++=pfx(*zz,*xx); ++xx; );); \
}} /* commutative */
#define RBFXODDSIZE(pfx,bpfx) \
{B*zz;I r,t,*xi,*yi,*zi; \
q=d>>LGSZI; r=d&(SZI-1); xi=(I*)x; zz=z; \
for(j=0;j<m;++j,zz-=d){ \
yi=xi; xi=(I*)((B*)xi-d); zi=(I*)zz; \
DQ(q, --xi; --yi; *--zi=pfx(*xi,*yi);); \
xi=(I*)((B*)xi-r); yi=(I*)((B*)yi-r); t=pfx(*xi,*yi); MC((B*)zi-r,&t,r); \
DQ(n-2, zi=(I*)zz; DQ(q, --xi; --zi; *zi=pfx(*xi,*zi);); \
xi=(I*)((B*)xi-r); zi=(I*)((B*)zi-r); t=pfx(*xi,*zi); MC( zi, &t,r);); \
}} /* non-commutative */
#define RCFXODDSIZE(pfx,bpfx) \
{I r,t,*xi,*yi,*zi; \
q=d>>LGSZI; r=d&(SZI-1); \
for(j=0;j<m;++j,x+=d,z+=d){ \
yi=(I*)x; x+=d; xi=(I*)x; zi=(I*)z; DQ(q, *zi++=pfx(*yi,*xi); ++xi; ++yi;); t=pfx(*yi,*xi); MC(zi,&t,r); \
DQ(n-2, x+=d; xi=(I*)x; zi=(I*)z; DQ(q, *zi++=pfx(*zi,*xi); ++xi; ); t=pfx(*zi,*xi); MC(zi,&t,r);); \
}} /* commutative */
#define REDUCECFX(f,pfx,ipfx,spfx,bpfx,vdo) \
AHDRP(f,B,B){B*y=0;I j,q; \
if(d==1){vdo; R;} \
if(1==n)DQ(d, *z++=*x++;) \
else if(0==d%sizeof(UI ))RCFXLOOP(UI, pfx) \
else if(0==d%sizeof(UINT))RCFXLOOP(UINT,ipfx) \
else if(0==d%sizeof(US ))RCFXLOOP(US, spfx) \
else RCFXODDSIZE(pfx,bpfx) \
} /* commutative */
#endif
// no errors possible here
// this version, which processes by column and doesn't write out intermediates, will be better unless the arg is huge and blows out of cache. Solution to that would be to block the computation.
// m is # cells to operate on; n is # items in 1 such cell; d is # atoms in one such item
#define REDUCEBFX(f,pfx,ipfx,spfx,bpfx,vdo) \
AHDRP(f,B,B){ \
if(d==1){vdo; R EVOK;} /* if arg is byte list, do it */ \
I dipercell=(d+SZI-1)>>LGSZI; \
DQ(m, B *xx=x+(n-1)*d; B *zz=z; /* end-of-column scanner, start at end of first column; row result pointer */ \
DQ(dipercell, B *xxc=xx; I r=*(I*)xxc; /* init to end of column */ \
DQ(n-1, xxc-=d; r=pfx(*(I*)xxc,r);) /* roll up the column */ \
if(i){*(I*)zz=r; xx+=SZI; zz+=SZI;}else STOREBYTES(zz,r,-d&(SZI-1)); /* store valid bytes, advance to next column if there is one */ \
) x+=n*d; z+=d; /* advance to next cell */ \
) \
R EVOK; \
} /* non-commutative */
REDUCECFX( eqinsB, EQ, IEQ, SEQ, BEQ, vdone(m,n,x,z,(B)(n&1)))
REDUCECFX( neinsB, NE, INE, SNE, BNE, vdone(m,n,x,z,1 ))
REDUCECFX( orinsB, OR, IOR, SOR, BOR, {DQ(m, *z++=1&&memchr(x,C1,n); x+=n;)})
REDUCECFX( andinsB, AND, IAND, SAND, BAND, {DQ(m, *z++=! memchr(x,C0,n); x+=n;)})
REDUCEBFX( ltinsB, LT, ILT, SLT, BLT, {DQ(m, *z++= x[n-1]&&!memchr(x,C1,n-1)?1:0; x+=n;)})
REDUCEBFX( leinsB, LE, ILE, SLE, BLE, {DQ(m, *z++=!x[n-1]&&!memchr(x,C0,n-1)?0:1; x+=n;)})
REDUCEBFX( gtinsB, GT, IGT, SGT, BGT, {DQ(m, B *y=memchr(x,C0,n); *z++=1&&(y?1&(y-x):1&n); x+=n;)})
REDUCEBFX( geinsB, GE, IGE, SGE, BGE, {DQ(m, B *y=memchr(x,C1,n); *z++=! (y?1&(y-x):1&n); x+=n;)})
REDUCEBFX( norinsB, NOR, INOR, SNOR, BNOR, {DQ(m, B *y=memchr(x,C1,n); d=y?y-x:n; *z++=(1&d)==d<n-1; x+=n;)})
REDUCEBFX(nandinsB, NAND,INAND,SNAND,BNAND,{DQ(m, B *y=memchr(x,C0,n); d=y?y-x:n; *z++=(1&d)!=d<n-1; x+=n;)})
#if SY_ALIGN
AHDRR(plusinsB,I,B){
if(d==1){
for(;m;--m,x+=n){
*z++=bsum(n,x);
}
}else{
for(;m;--m,x+=n*d,z+=d){
// Adding items with more than 1 boolean. Process a columnar swath with 1 accumulator
UI *xu=(UI*)x; I dd; // start position in column swath. nn is # columns left
I *zz=z; // output pointer, switched if not valid
// Now do the SZI-byte swaths, possibly with overfetch
for(dd=d;dd>0;dd-=SZI,xu+=1){ // for all columns...
I acc20=0; I acc21=0; I acc22=0; I acc23=0; I acc24=0; I acc25=0; I acc26=0; I acc27=0;
UI *xu2=xu; I nn=n; // accumulator, address moving down the swath, number of rows left in swath
while(nn){ // till swath finished
I nloops=nn; nloops=nloops>255?255:nloops; // max 255 loops, each 4 words
nn-=nloops; // keep nn = # bytes remaining
I acc0=0;
DQ(nloops, acc0+=xu2[0]; xu2=(I*)((I)xu2+d);); // add up each byte-lane in the swatch
acc20+=acc0&255; acc0>>=8; acc21+=acc0&255; acc0>>=8; acc22+=acc0&255; acc0>>=8; acc23+=acc0&255;
#if SY_64
acc0>>=8; acc24+=acc0&255; acc0>>=8; acc25+=acc0&255; acc0>>=8; acc26+=acc0&255; acc0>>=8; acc27+=acc0&255;
#endif
}
// column is added. Write out the valid totals, divert the rest to the sink
*zz++=acc20; zz=dd<=1?&jt->shapesink[0]:zz; *zz++=acc21; zz=dd<=2?&jt->shapesink[0]:zz; *zz++=acc22; zz=dd<=3?&jt->shapesink[0]:zz; *zz++=acc23;
#if SY_64
zz=dd<=4?&jt->shapesink[0]:zz; *zz++=acc24; zz=dd<=5?&jt->shapesink[0]:zz; *zz++=acc25; zz=dd<=6?&jt->shapesink[0]:zz; *zz++=acc26; zz=dd<=7?&jt->shapesink[0]:zz; *zz++=acc27;
#endif
}
}
}
R EVOK;
}
#else
AHDRR(plusinsB,I,B){I dw,i,p,q,r,r1,s;UC*tu;UI*v;
if(d==1&n<SZI)DQ(m, s=0; DQ(n, s+=*x++;); *z++=s;)
else if(d==1){UI t;
p=n>>LGSZI; q=p/255; r=p%255; r1=n&(SZI-1); tu=(UC*)&t;
for(i=0;i<m;++i){
s=0; v=(UI*)x;
DO(q, t=0; DO(255, t+=*v++;); DO(SZI, s+=tu[i];));
t=0; DO(r, t+=*v++;); DO(SZI, s+=tu[i];);
x=(B*)v; DQ(r1, s+=*x++;);
*z++=s;
}}else{A t;UI*tv;
dw=(d+SZI-1)>>LGSZI; p=dw*SZI; mvc(m*d*SZI,z,1,MEMSET00);
q=n/255; r=n%255;
GA10(t,INT,dw); if(!t)R;
tu=UAV(t); tv=(UI*)tu; v=(UI*)x;
for(i=0;i<m;++i,z+=d){
DO(q, mvc(p,tv,1,MEMSET00); DO(255, DO(dw,tv[i]+=v[i];); x+=d; v=(UI*)x;); DO(d,z[i]+=tu[i];));
mvc(p,tv,1,MEMSET00); DO(r, DO(dw,tv[i]+=v[i];); x+=d; v=(UI*)x;); DO(d,z[i]+=tu[i];) ;
}}} /* +/"r w on boolean w, originally by Roger Moore */
#endif
#define PLUSI1(x) if(unlikely(__builtin_add_overflow((x),t,&t)))R EWOV;
#define MINUSI1(x) if(unlikely(__builtin_sub_overflow((x),t,&t)))R EWOV;
#define TYMESI1(x) if(unlikely(__builtin_mul_overflow((x),t,&t)))R EWOV;
// m is #cells, n is #items per cell, d is #atoms in item
#define REDUCEOVF(f,neut,ft1) \
AHDRR(f,I,I){I i; \
if(d==1){DQ(m, \
UI dlct=(n+3)>>2; \
UI dlct0=dlct; x+=(dlct-1)*(1LL<<2); I t=neut; switch(n&3){do{case 0: ft1(x[3]) case 3: ft1(x[2]) case 2: ft1(x[1]) case 1: ft1(x[0]) x-=4; }while(--dlct0);} \
*z++=t; x+=n+4; \
) R EVOK; \
} \
UI dlct=(n+3)>>2; I xstride1=d*SZI; I xstride3=3*xstride1; \
DQ(m, \
DQ(d, x+=d*(dlct-1)*(1LL<<2); \
UI dlct0=dlct; I t=neut; switch(n&3){do{case 0: ft1(*(I*)((C*)x+xstride3)) case 3: ft1(*(I*)((C*)x+2*xstride1)) case 2: ft1(*(I*)((C*)x+xstride1)) case 1: ft1(x[0]) x=(I*)((C*)x-4*xstride1); }while(--dlct0);} \
*z++=t; x=(I*)((C*)x+4*xstride1)+1; \
) \
x+=(n-1)*d; \
) \
R EVOK; \
}
REDUCEOVF( plusinsI, 0, PLUSI1)
REDUCEOVF(minusinsI, 0, MINUSI1)
REDUCEOVF(tymesinsI, 1, TYMESI1)
REDUCCPFX( plusinsO, D, I, PLUSO)
REDUCCPFX(minusinsO, D, I, MINUSO)
REDUCCPFX(tymesinsO, D, I, TYMESO)
#define redprim256rk1(prim,identity) \
__m256i endmask; /* length mask for the last word */ \
/* prim/ vectors */ \
__m256d idreg=_mm256_broadcast_sd(&identity); \
endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */ \
DQ(m, __m256d acc0=idreg; __m256d acc1=idreg; __m256d acc2=idreg; __m256d acc3=idreg; __m256d acc4=idreg; __m256d acc5=idreg; __m256d acc6=idreg; __m256d acc7=idreg; \
UI n2=DUFFLPCT(n-1,3); /* # turns through duff loop */ \
if(n2>0){ \
UI backoff=DUFFBACKOFF(n-1,3); \
x+=(backoff+1)*NPAR; \
switch(backoff){ \
do{ \
case -1: acc0=prim(acc0,_mm256_loadu_pd(x)); \
case -2: acc1=prim(acc1,_mm256_loadu_pd(x+1*NPAR)); \
case -3: acc2=prim(acc2,_mm256_loadu_pd(x+2*NPAR)); \
case -4: acc3=prim(acc3,_mm256_loadu_pd(x+3*NPAR)); \
case -5: acc4=prim(acc4,_mm256_loadu_pd(x+4*NPAR)); \
case -6: acc5=prim(acc5,_mm256_loadu_pd(x+5*NPAR)); \
case -7: acc6=prim(acc6,_mm256_loadu_pd(x+6*NPAR)); \
case -8: acc7=prim(acc7,_mm256_loadu_pd(x+7*NPAR)); \
x+=(1LL<<3)*NPAR; \
}while(--n2!=0); \
} \
} \
acc0=prim(acc0,_mm256_blendv_pd(idreg,_mm256_maskload_pd(x,endmask),_mm256_castsi256_pd(endmask))); x+=((n-1)&(NPAR-1))+1; \
acc1=prim(acc1,acc5); acc2=prim(acc2,acc6); acc3=prim(acc3,acc7); acc0=prim(acc0,acc4); \
acc2=prim(acc2,acc3); acc0=prim(acc0,acc1); acc0=prim(acc0,acc2); /* combine accumulators vertically */ \
acc0=prim(acc0,_mm256_permute4x64_pd(acc0,0b11111110)); acc0=prim(acc0,_mm256_permute_pd(acc0,0xf)); /* combine accumulators horizontally 01+=23, 0+=1 */ \
*(I*)z=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); /* AVX2 *z=_mm256_cvtsd_f64(acc0); */ ++z; /* store the single result */ \
)
// f/ on rank>1, going down columns to save bandwidth
#define redprim256rk2(prim,identity) \
__m256i endmask; /* length mask for the last word */ \
__m256d idreg=_mm256_broadcast_sd(&identity); \
endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-d)&(NPAR-1)))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */ \
I xstride1=d*SZD; I xstride3=3*xstride1; I xstride5=5*xstride1; I xstride7=xstride5+2*xstride1; \
DQ(m, D *x0; \
DQ((d-1)>>LGNPAR, \
x0=(D*)((C*)x-((-n)&((1LL<<3)-1))*xstride1); UI n0=(n+(1LL<<3)-1)>>3; __m256d acc0=idreg; __m256d acc1=idreg; __m256d acc2=idreg; __m256d acc3=idreg; __m256d acc4=idreg; __m256d acc5=idreg; __m256d acc6=idreg; __m256d acc7=idreg; \
switch(n&((1LL<<3)-1)){ \
do{ \
case 0: acc0=prim(acc0,_mm256_loadu_pd(x0)); \
case 7: acc1=prim(acc1,_mm256_loadu_pd((D*)((C*)x0+xstride1))); \
case 6: acc2=prim(acc2,_mm256_loadu_pd((D*)((C*)x0+2*xstride1))); \
case 5: acc3=prim(acc3,_mm256_loadu_pd((D*)((C*)x0+xstride3))); \
case 4: acc4=prim(acc4,_mm256_loadu_pd((D*)((C*)x0+4*xstride1))); \
case 3: acc5=prim(acc5,_mm256_loadu_pd((D*)((C*)x0+xstride5))); \
case 2: acc6=prim(acc6,_mm256_loadu_pd((D*)((C*)x0+2*xstride3))); \
case 1: acc7=prim(acc7,_mm256_loadu_pd((D*)((C*)x0+xstride7))); \
x0=(D*)((C*)x0+(1LL<<3)*xstride1); \
}while(--n0); \
} \
acc0=prim(acc0,acc4); acc1=prim(acc1,acc5); acc2=prim(acc2,acc6); acc3=prim(acc3,acc7); \
acc0=prim(acc0,acc1); acc2=prim(acc2,acc3); acc0=prim(acc0,acc2); _mm256_storeu_pd(z,acc0); \
x+=NPAR; z+=NPAR; \
) \
x0=(D*)((C*)x-((-n)&((1LL<<3)-1))*xstride1); UI n0=(n+(1LL<<3)-1)>>3; __m256d acc0=idreg; __m256d acc1=idreg; __m256d acc2=idreg; __m256d acc3=idreg; __m256d acc4=idreg; __m256d acc5=idreg; __m256d acc6=idreg; __m256d acc7=idreg; \
switch(n&((1LL<<3)-1)){ \
do{ \
case 0: acc0=prim(acc0,_mm256_maskload_pd(x0,endmask)); \
case 7: acc1=prim(acc1,_mm256_maskload_pd((D*)((C*)x0+xstride1),endmask)); \
case 6: acc2=prim(acc2,_mm256_maskload_pd((D*)((C*)x0+2*xstride1),endmask)); \
case 5: acc3=prim(acc3,_mm256_maskload_pd((D*)((C*)x0+xstride3),endmask)); \
case 4: acc4=prim(acc4,_mm256_maskload_pd((D*)((C*)x0+4*xstride1),endmask)); \
case 3: acc5=prim(acc5,_mm256_maskload_pd((D*)((C*)x0+xstride5),endmask)); \
case 2: acc6=prim(acc6,_mm256_maskload_pd((D*)((C*)x0+2*xstride3),endmask)); \
case 1: acc7=prim(acc7,_mm256_maskload_pd((D*)((C*)x0+xstride7),endmask)); \
x0=(D*)((C*)x0+(1LL<<3)*xstride1); \
}while(--n0); \
} \
acc0=prim(acc0,acc4); acc1=prim(acc1,acc5); acc2=prim(acc2,acc6); acc3=prim(acc3,acc7); \
acc0=prim(acc0,acc1); acc2=prim(acc2,acc3); acc0=prim(acc0,acc2); _mm256_maskstore_pd(z,endmask,acc0); \
x=x0-((d-1)&-NPAR); z+=((d-1)&(NPAR-1))+1; \
)
// prim for +/ II primplusII(x,y) _mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(x),_mm256_castpd_si256(y))); oflo=_mm256_or_pd(oflo,_mm256_andnot_pd(_mm256_xor_pd(x,y),_mm256_xor_pd(x,zz)));,
AHDRR(plusinsD,D,D){I i;D* RESTRICT y;
NAN0;
// latency of add is 2, so use 4 accumulators for 2 reads per cycle
if(d==1){
#if C_AVX2 || EMU_AVX2
redprim256rk1(_mm256_add_pd,dzero)
#else
DQ(m, I n0=n; D acc0=0.0; D acc1=0.0; D acc2=0.0; D acc3=0.0;
switch(n0&3){
loopback2:
case 0: acc0+=*x++; case 3: acc1+=*x++; case 2: acc2+=*x++; case 1: acc3+=*x++;
if((n0-=4)>0)goto loopback2;
}
acc0+=acc1; acc2+=acc3; *z++=acc0+acc2;
)
#endif
}
else{
#if C_AVX2 || EMU_AVX2
redprim256rk2(_mm256_add_pd,dzero)
#elif 1
// add down the columns to reduce memory b/w. 4 accumulators
DQ(m, D *x0;
DQ(d, x0=x;
I n0=n; D acc0=0.0; D acc1=0.0; D acc2=0.0; D acc3=0.0;
switch(n0&3){
loopback:
case 0: acc0+=*x0; x0+=d;
case 3: acc1+=*x0; x0+=d;
case 2: acc2+=*x0; x0+=d;
case 1: acc3+=*x0; x0+=d;
if((n0-=4)>0)goto loopback;
}
acc0+=acc1; acc2+=acc3; acc0+=acc2; *z++=acc0;
++x;
)
x=x0-(d-1);
)
#endif
}
R NANTEST?EVNAN:EVOK;
}
REDUCENAN( plusinsZ, Z, Z, zplus, plusZZ )
REDUCEPFX( plusinsX, X, X, xplus, plusXX, plusXX )
static OP1XYZ(plus,I,I2,I2,pfxplus)
OP1XYZ(plus,I,I2,I,pfxplus)
REDUCEPFX( plusinsI2, I, I2, pfxplus, plus1I2I2I, plus1I2II )
static OP1XYZ(plus,I,I4,I4,pfxplus)
OP1XYZ(plus,I,I4,I,pfxplus)
REDUCEPFX( plusinsI4, I, I4, pfxplus, plus1I4I4I, plus1I4II )
#if C_AVX2 || EMU_AVX2
// version for QP with AVX2. Bandwidth is not an issue, so we accumulate into memory for rank > 1
I plusinsE(I d,I n,I m,E* RESTRICTI x,E* RESTRICTI z,J jt){I i; // m is # cells to operate on; n is # items in 1 such cell; d is # atoms in one such item
if(d==1){x += m*n; z+=m;
__m256d sgnbit=_mm256_broadcast_sd((D*)&Iimin); __m256d mantmask=_mm256_broadcast_sd((D*)&(I){0x000fffffffffffff}); /* needed masks: sign, mantissa */
/* we will read twice, masking. We may read the first half twice to avoid overfetch */
__m256i rdmask=_mm256_castps_si256(_mm256_permutevar_ps(_mm256_castpd_ps(_mm256_broadcast_sd((D*)&maskec4123[n&(NPAR-1)])),_mm256_loadu_si256((__m256i*)&validitymask[2]))); /* masks for the 2 reads */
for(i=m;i;--i){ /* for each accumulated row */
I rematom; /* # atoms to accumulate */
__m256d x0,x1,y0,y1,z0,z1;
x -= ((n-1)&(NPAR-1))+1; /* back up to beginning of last NPAR-atom section */
x0=_mm256_maskload_pd((D*)(x),rdmask);
x1=_mm256_maskload_pd((D*)(x+((n-1)&(NPAR/2))),_mm256_slli_epi64(rdmask,1));
SHUFIN(0,x0,x1,z0,z1);
for(rematom=(n-1)&-NPAR;rematom;rematom-=NPAR){ /* for each full batch */
x-=NPAR; x0=_mm256_loadu_pd((D*)x); x1=_mm256_loadu_pd((D*)(x+NPAR/2)); SHUFIN(0,x0,x1,y0,y1); /* read 4 values, put into lanes */
PLUSEE(y0,y1,z0,z1,z0,z1); /* add to total */
}
y0=_mm256_permute_pd(z0,0b1111); y1=_mm256_permute_pd(z1,0b1111); /* atoms 0+1, 2+3 */
PLUSEE(y0,y1,z0,z1,z0,z1);
y0=_mm256_permute4x64_pd(z0,0b10101010); y1=_mm256_permute4x64_pd(z1,0b10101010); /* atoms 0+2 */
PLUSEE(y0,y1,z0,z1,z0,z1);
CANONE(z0,z1)
--z; *(I*)&z->hi=_mm256_extract_epi64(_mm256_castpd_si256(z0),0x0); *(I*)&z->lo=_mm256_extract_epi64(_mm256_castpd_si256(z1),0x0);
}
}else{z+=(m-1)*d; x+=(m*n-1)*d;
for(i=0;i<m;++i,z-=d){I rc;
E* RESTRICT y=x; x-=d; if(255&(rc=plusEE(1,d,x,y,z,jt), rc=rc<0?EWOVIP+EWOVIPMULII:rc))R rc; x-=d;
DQ(n-2, if(255&(rc=plusEE(1,d,x,z,z,jt), rc=rc<0?EWOVIP+EWOVIPMULII:rc))R rc; x-=d;);
}
}
R EVOK;
}
#endif
REDUCEPFX(minusinsB, I, B, MINUS, minusBB, minusBI )
REDUCENAN(minusinsD, D, D, MINUS, minusDD )
REDUCENAN(minusinsZ, Z, Z, zminus, minusZZ)
REDUCEPFX(tymesinsD, D, D, TYMES, tymesDD, tymesDD )
REDUCEPFX(tymesinsZ, Z, Z, ztymes, tymesZZ, tymesZZ)
REDUCENAN( divinsD, D, D, DIV, divDD )
REDUCENAN( divinsZ, Z, Z, zdiv, divZZ )
REDUCEPFXIDEM2( maxinsI, I, I, MAX, maxII )
REDUCEPFXIDEM2PRIM256( maxinsD, D, D, MAX, maxDD, _mm256_max_pd, infm )
REDUCEPFX( maxinsX, X, X, XMAX, maxXX , maxXX )
REDUCEPFX( maxinsS, SB,SB,SBMAX, maxSS, maxSS )
REDUCEPFXIDEM2( mininsI, I, I, MIN, minII )
REDUCEPFXIDEM2PRIM256( mininsD, D, D, MIN, minDD, _mm256_min_pd, inf )
REDUCEPFX( mininsX, X, X, XMIN, minXX, minXX )
REDUCEPFX( mininsS, SB,SB,SBMIN, minSS, minSS )
// +/!.0"r, compensated summation
static DF1(jtreduce); // forward declaration
DF1(jtcompsum){
ARGCHK1(w)
I wr=AR(w); I *ws=AS(w);
// Create r: the effective rank; f: length of frame; n: # items in a CELL of w
I r=(RANKT)jt->ranks; r=wr<r?wr:r; I f=wr-r; I n; SETICFR(w,f,r,n); // no RESETRANK
// if the argument is not float, or if there are not more than 2 items, process as normal +/
if(unlikely((-(AT(w)&FL)&(2-n))>=0))R reduce(w,FAV(self)->fgh[0]);
// calculate cell sizes and allocate the result
I d; PROD(d,r-1,f+ws+1); // */ }. $ cell
// m=*/ frame (i. e. #cells to operate on)
// r cannot be 0 (would be handled above). Calculate low part of zn first
I m; PROD(m,f,ws);
// Allocate the result area
A z; GATV(z,FL,m*d,MAX(0,wr-1),ws); if(1<r)MCISH(f+AS(z),f+1+ws,r-1); // allocate, and install shape below the frame
if(unlikely(m*d==0)){RETF(z);} // mustn't call the function on an empty argument!
// Do the operation
NAN0;
D *wv=DAV(w), *zv=DAV(z);
#if C_AVX2 || EMU_AVX2
__m256d __attribute__((aligned(64))) accc[2][8]; // accumulators and error terms
__m256i endmask; /* length mask for the last word */
if(d==1){
// rank-1 case: operate across the row, with 4 accumulators
endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */
for(;m>0;--m){
__m256d acc0; __m256d acc1; __m256d acc2; __m256d acc3;
__m256d c0; __m256d c1; __m256d c2; __m256d c3; // error terms
// KAHAN has a dependency loop of 4 instructions, thus latency of 16 cycles (Skylake). It takes unrolling by 8 to get the latency to match
// the launch rate of 2 cycles per iteration. This requires spilling results to memory, which will add more latency, but it's still better than unrolling only 4.
// Unfortunately, clang goes nuts trying to handle this loop, and ends up doing 14 loads and 14 stores for every iteration. So we have to unroll it
// by hand using an array to hold the state
mvc(sizeof(accc),accc,1,MEMSET00);
UI nlp=(n-1)>>LGNPAR;
for(;nlp;--nlp){KAHANA(_mm256_loadu_pd(wv),nlp&7) wv+=NPAR;}
KAHANA(_mm256_maskload_pd(wv,endmask),7) wv+=((n-1)&(NPAR-1))+1;
__m256d sgnbit=_mm256_broadcast_sd((D*)&Iimin);
// add all the low parts together into c0 - the low bits of the low will not make it through to the result
#if 0 // clang
c1=_mm256_add_pd(c1,c5); c2=_mm256_add_pd(c2,c6); c3=_mm256_add_pd(c3,c7); c0=_mm256_add_pd(c0,c4);
#else
c0=_mm256_add_pd(accc[1][6],accc[1][5]); c1=_mm256_add_pd(accc[1][4],accc[1][3]); c2=_mm256_add_pd(accc[1][2],accc[1][1]); c3=_mm256_add_pd(accc[1][0],accc[1][7]);
#endif
c0=_mm256_add_pd(c0,c1); c2=_mm256_add_pd(c2,c3); c0=_mm256_add_pd(c0,c2);
// TWOSUM the accumulators into acc0, adding all the resulting low parts into c0
#if 0 // clang
TWOSUM(acc0,acc4,acc0,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(acc1,acc5,acc1,c1) c0=_mm256_add_pd(c0,c1);
TWOSUM(acc2,acc6,acc2,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(acc3,acc7,acc3,c1) c0=_mm256_add_pd(c0,c1);
#else
TWOSUM(accc[0][6],accc[0][5],acc0,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(accc[0][4],accc[0][3],acc1,c1) c0=_mm256_add_pd(c0,c1);
TWOSUM(accc[0][2],accc[0][1],acc2,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(accc[0][0],accc[0][7],acc3,c1) c0=_mm256_add_pd(c0,c1);
#endif
TWOSUM(acc0,acc1,acc0,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(acc2,acc3,acc2,c1) c0=_mm256_add_pd(c0,c1); // add 0+1, 2+3; combine all low parts of Kahan corrections into low total
TWOSUM(acc0,acc2,acc0,c1) c0=_mm256_add_pd(c0,c1); // 0+2, bringing along low part
// acc0/c0 survive. Combine horizontally
c0=_mm256_add_pd(c0,_mm256_permute4x64_pd(c0,0b11111110)); acc1=_mm256_permute4x64_pd(acc0,0b11111110); // c0: 01+=23, acc1<-23
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // combine p=01+23
c0=_mm256_add_pd(c0,_mm256_permute_pd(c0,0xf)); acc1=_mm256_permute_pd(acc0,0xf); // combine c0+c1, acc1<-1
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // combine 0123, combine all low parts
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); /* AVX2 *zv=_mm256_cvtsd_f64(acc0);*/ ++zv; // store the single result
}
}else{
// rank>1, going down columns to save bandwidth and add accuracy
endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-d)&(NPAR-1)))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */
DQ(m, D *wv0; I n0;
__m256d y; __m256d t; // new input value, temp to hold high part of sum
__m256d acc0; __m256d acc1; __m256d acc2; __m256d acc3; __m256d c0; __m256d c1; __m256d c2; __m256d c3; // accumulators, error terms
DQ((d-1)>>LGNPAR,
wv0=wv;
#if 0 // clang
n0=n; acc0=acc1=acc2=acc3=c0=c1=c2=c3=_mm256_setzero_pd();
switch(n0&3){
label1:
case 0: KAHAN(_mm256_loadu_pd(wv0),0) wv0+=d; case 3: KAHAN(_mm256_loadu_pd(wv0),1) wv0+=d; case 2: KAHAN(_mm256_loadu_pd(wv0),2) wv0+=d; case 1: KAHAN(_mm256_loadu_pd(wv0),3) wv0+=d;
if((n0-=4)>0)goto label1;
}
#else
mvc(sizeof(accc),accc,1,MEMSET00);
UI nlp=n;
for(;nlp;--nlp){KAHANA(_mm256_loadu_pd(wv0),nlp&7) wv0+=d;}
#endif
__m256d sgnbit=_mm256_broadcast_sd((D*)&Iimin);
// combine accumulators
c0=_mm256_add_pd(accc[1][6],accc[1][5]); c1=_mm256_add_pd(accc[1][4],accc[1][3]); c2=_mm256_add_pd(accc[1][2],accc[1][1]); c3=_mm256_add_pd(accc[1][0],accc[1][7]);
c0=_mm256_add_pd(c0,c1); c2=_mm256_add_pd(c2,c3); c0=_mm256_add_pd(c0,c2); // add all the low parts together - the low bits of the low will not make it through to the result
TWOSUM(accc[0][7],accc[0][6],acc0,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(accc[0][5],accc[0][4],acc1,c1) c0=_mm256_add_pd(c0,c1);
TWOSUM(accc[0][3],accc[0][2],acc2,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(accc[0][1],accc[0][0],acc3,c1) c0=_mm256_add_pd(c0,c1);
TWOSUM(acc0,acc1,acc0,c1) TWOSUM(acc2,acc3,acc2,c2) c2=_mm256_add_pd(c1,c2); c0=_mm256_sub_pd(c2,c0); // add 0+1, 2+3 KAHAN corrections are negative - change that now
TWOSUM(acc0,acc2,acc0,c1) c0=_mm256_add_pd(c1,c0); // 0+2
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
_mm256_storeu_pd(zv,acc0); wv+=NPAR; zv+=NPAR;
)
// repeat for partial column
wv0=wv;
#if 0 // clang
n0=n; acc0=acc1=acc2=acc3=c0=c1=c2=c3=_mm256_setzero_pd();
switch(n0&3){
label2:
case 0: KAHAN(_mm256_maskload_pd(wv0,endmask),0) wv0+=d; case 3: KAHAN(_mm256_maskload_pd(wv0,endmask),1) wv0+=d; case 2: KAHAN(_mm256_maskload_pd(wv0,endmask),2) wv0+=d; case 1: KAHAN(_mm256_maskload_pd(wv0,endmask),3) wv0+=d;
if((n0-=4)>0)goto label2;
}
#else
mvc(sizeof(accc),accc,1,MEMSET00);
UI nlp=n;
for(;nlp;--nlp){KAHANA(_mm256_maskload_pd(wv0,endmask),nlp&7) wv0+=d;}
#endif
__m256d sgnbit=_mm256_broadcast_sd((D*)&Iimin);
c0=_mm256_add_pd(accc[1][6],accc[1][5]); c1=_mm256_add_pd(accc[1][4],accc[1][3]); c2=_mm256_add_pd(accc[1][2],accc[1][1]); c3=_mm256_add_pd(accc[1][0],accc[1][7]);
c0=_mm256_add_pd(c0,c1); c2=_mm256_add_pd(c2,c3); c0=_mm256_add_pd(c0,c2); // add all the low parts together - the low bits of the low will not make it through to the result
TWOSUM(accc[0][7],accc[0][6],acc0,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(accc[0][5],accc[0][4],acc1,c1) c0=_mm256_add_pd(c0,c1);
TWOSUM(accc[0][3],accc[0][2],acc2,c1) c0=_mm256_add_pd(c0,c1); TWOSUM(accc[0][1],accc[0][0],acc3,c1) c0=_mm256_add_pd(c0,c1);
TWOSUM(acc0,acc1,acc0,c1) TWOSUM(acc2,acc3,acc2,c2) c2=_mm256_add_pd(c1,c2); c0=_mm256_sub_pd(c2,c0); // add 0+1, 2+3 KAHAN corrections are negative - change that now
TWOSUM(acc0,acc2,acc0,c1) c0=_mm256_add_pd(c1,c0); // 0+2
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
_mm256_maskstore_pd(zv,endmask,acc0); wv=wv0-((d-1)&-NPAR); zv+=((d-1)&(NPAR-1))+1;
)
}
#else
D t, y;
if(d==1){
DQ(m, D acc=0.0; D c=0.0; DQ(n, y=*wv-c; t=acc+y; acc=t-acc; c=acc-y; acc=t; ++wv;) *zv++=acc;)
}else{
// add down the columns to reduce memory b/w. 4 accumulators
DQ(m, D *wv0; DQ(d, wv0=wv; D acc=0.0; D c=0.0; DQ(n, y=*wv0-c; t=acc+y; acc=t-acc; c=acc-y; acc=t; wv0+=d;) *zv++=acc; ++wv;) wv=wv0-(d-1); )
}
#endif
if(unlikely(NANTEST))R reduce(w,FAV(self)->fgh[0]); // in NaN error, fail over to normal summation. Infinities can cause it. Ranks still set
RETF(z);
}
static DF1(jtred0){DECLF;A x,z;I f,r,wr,*s;
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; RESETRANK; s=AS(w);
if(likely(!ISSPARSE(AT(w)))){GA(x,AT(w),0L,r,f+s);}else{GASPARSE(x,AT(w),1,r,f+s);}
R reitem(vec(INT,f,s),lamin1(df1(z,x,(AT(w)&SBT)?idensb(fs):iden(fs))));
} /* f/"r w identity case */
// general reduce. We inplace the results into the next iteration. This routine cannot inplace its inputs.
static DF1(jtredg){F1PREFIP;PROLOG(0020);DECLF;AD * RESTRICT a;I i,n,r,wr;
ARGCHK1(w);
ASSERT(!ISSPARSE(AT(w)),EVNONCE);
// loop over rank
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; RESETRANK;
if(r<wr)R rank1ex(w,self,r,jtredg);
// From here on we are doing a single reduction
n=AS(w)[0]; // n=#cells
// Allocate virtual block for the running x argument.
fauxblock(virtafaux); fauxvirtual(a,virtafaux,w,r-1,ACUC1); // allocate UNINCORPORABLE block
// wfaux will hold the result from the iterations. Init to value of last cell
// Allocate fauxvirtual arg for the first cell, so it can be inplaceable/pristine if needed (tail returned a virtual block, which messed things up for high rank)
fauxblock(virtwfaux); A wfaux; fauxvirtual(wfaux,virtwfaux,w,r-1,ACUC1); // allocate UNINCORPORABLE block, mark inplaceable - used only once
// finish filling the virt block
A *old=jt->tnextpushp; // save stack mark for subsequent frees. We keep the a argument over the calls, but allow the w to be deleted
// fill in the shape, offset, and item-count of the virtual block
I k; PROD(k,r-1,AS(w)+1); // k=#atoms of cell of w
AN(wfaux)=k; AN(a)=k;
k<<=bplg(AT(w)); // k now=length of input cell in bytes, where it will remain
AK(wfaux)+=(n-1)*k; AK(a)+=(n-2)*k; MCISH(AS(wfaux),AS(w)+1,r-1); MCISH(AS(a),AS(w)+1,r-1); // make the virtual block look like the tail, except for the offset
// Calculate inplaceability. We can inplace the left arg, which is always virtual, if w is inplaceable and (w is direct or fs is &.>)
// and the input jtinplace. We turn off WILLBEOPENED status in jtinplace for the callee.
// We include contextual inplaceability (from jtinplace) here because if the block is returned, its pristinity will be checked if it is inplaceable. Thus
// we do not want to call a faux argument inplaceable if it really isn't. This gives us leeway with jtinplace itself
I aipok = (SGNIF((I)jtinplace&(((AT(w)&TYPEVIPOK)!=0)|f2==jtevery2self),JTINPLACEWX)&AC(w))+ACUC1; // requires JTINPLACEWX==0. This is 1 or 8..1
// We can inplace the right arg the first time if it is direct inplaceable, and always after that (assuming it is an inplaceable result).
ACINIT(wfaux,aipok) // first cell is inplaceable if second is
jtinplace = (J)(intptr_t)(((I)jt) + (JTINPLACEW+JTINPLACEA)*((FAV(fs)->flag>>(VJTFLGOK2X-JTINPLACEWX)) & JTINPLACEW)); // all items are used only once
// We need to free memory in case the called routine leaves it unfreed (that's bad form & we shouldn't expect it), and also to free the result of the
// previous iteration. We don't want to free every time, though, because that does ra() on w which could be a costly traversal if it's a nonrecursive recursible type.
// As a compromise we free every few iterations: at least one per 8 iterations, and at least 8 times through the process
#define LGMINGCS 3 // lg2 of minimum number of times we call gc
#define MINGCINTERVAL 8 // max spacing between frees
I freedist=MIN((n+((1<<LGMINGCS)-1))>>LGMINGCS,MINGCINTERVAL); I freephase=freedist;
i=n-1; while(1){ // for each cell except the last
ACRESET(a,aipok) // in case we created a virtual block from it, restore inplaceability to the UNINCORPABLE block
RZ(wfaux=CALL2IP(f2,a,wfaux,fs));
if(--i==0)break; // stop housekeeping after last iteration
// if w happens to be the same virtual block that we passed in as x, we have to clone it before we change the pointer
if(unlikely(a==wfaux)){RZ(wfaux=virtual(wfaux,0,AR(a))); AN(wfaux)=AN(a); MCISH(AS(wfaux),AS(a),AR(a));}
if(--freephase==0){wfaux=gc(wfaux,old); freephase=freedist;} // free the buffers we allocated, except for the result
// move to next input cell
AK(a) -= k;
}
// At the end of all this, it is possible that the result is the original first or last cell, resting in its original virtual block.
// In that case, we have to realize it, so that we don't let the fauxvirtual block escape
if(unlikely((AFLAG(wfaux)&AFUNINCORPABLE)!=0))wfaux=realize(wfaux);
EPILOG(wfaux); // this frees the virtual block, at the least
} /* f/"r w for general f and 1<(-r){$w */
static const C fca[]={CSTAR, CPLUS, CEQ, CMIN, CMAX, CPLUSDOT, CSTARDOT, CNE, 0}; /* commutative & associative */
static A jtredsp1a(J jt,C id,A z,A e,I n,I r,I*s){A t;B b,p=0;D d=1;
switch(id){
default: ASSERT(0,EVNONCE);
case CPLUSDOT: R n?gcd(z,e):ca(e);
case CSTARDOT: R n?lcm(z,e):ca(e);
case CMIN: R n?minimum(z,e):ca(e);
case CMAX: R n?maximum(z,e):ca(e);
case CPLUS: if(n&&equ(e,num(0)))R z; DO(r, d*=s[i];); t=tymes(e,d>=FLIMAX?scf(d-n):sc((I)d-n)); R n?plus(z,t):t;
case CSTAR: if(n&&equ(e,num(1) ))R z; DO(r, d*=s[i];); t=expn2(e,d>=FLIMAX?scf(d-n):sc((I)d-n)); R n?tymes(z,t):t;
case CEQ: p=1; /* fall thru */
case CNE:
ASSERT(B01&AT(e),EVNONCE);
if(!n)BAV(z)[0]=p;
b=1; DO(r, if(!(s[i]&1)){b=0; break;});
R !p==*BAV(e)&&b!=(n&1)?not(z):z;
}} /* f/w on sparse vector w, post processing */
static A jtredsp1(J jt,A w,A self,C id,VARPSF ado,I cv,I f,I r,I zt){A e,x,z;I m,n;P*wp;
ARGCHK1(w);
wp=PAV(w); e=SPA(wp,e); x=SPA(wp,x); n=AN(x); m=AS(w)[0];
GA00(z,zt,1,0);
if(n){I rc=((AHDRRFN*)ado)(1L,n,1L,AV(x),AV(z),jt); if(255&rc)jsignal(rc); RE(0); if(m==n)R z;}
R redsp1a(id,z,e,n,AR(w),AS(w));
} /* f/"r w for sparse vector w */
DF1(jtredravel){A f,x,z;I n;P*wp;
F1PREFIP;PROLOG(0000);
ARGCHK1(w);
f=FAV(self)->fgh[0]; // f/
if(likely(!ISSPARSE(AT(w)))){RZ(z=reduce(jtravel(jtinplace,w),f));
}else{
// The rest is sparse
wp=PAV(w); x=SPA(wp,x); n=AN(x);
I rc=EVOK;
while(1){ // Loop to handle restart on overflow
VARPS adocv; varps(adocv,f,AT(x),0);
ASSERT(adocv.f,EVNONCE);
GA00(z,rtype(adocv.cv),1,0);
if(n){rc=((AHDRRFN*)adocv.f)((I)1,n,(I)1,AV(x),AV(z),jt);} // mustn't adocv on empty
rc&=255; if(rc)jsignal(rc); if(rc<EWOV){if(rc)R0; RZ(z=redsp1a(FAV(FAV(f)->fgh[0])->id,z,SPA(wp,e),n,AR(w),AS(w))); break;} // since f has an insert fn, its id must be OK
}
}
EPILOG(z);
} /* f/@, w */
static A jtredspd(J jt,A w,A self,C id,VARPSF ado,I cv,I f,I r,I zt){A a,e,x,z,zx;I c,m,n,*s,t,*v,wr,*ws,xf,xr;P*wp,*zp;
ARGCHK1(w);
ASSERT(strchr(fca,id),EVNONCE);
wp=PAV(w); a=SPA(wp,a); e=SPA(wp,e); x=SPA(wp,x); s=AS(x);
xr=r; v=AV(a); DO(AN(a), if(f<v[i])--xr;); xf=AR(x)-xr;
m=prod(xf,s); c=m?AN(x)/m:0; n=s[xf];
GA(zx,zt,AN(x)/n,AR(x)-1,s); MCISH(xf+AS(zx),1+xf+s,xr-1);
I rc=((AHDRRFN*)ado)(c/n,n,m,AV(x),AV(zx),jt); if(255&rc)jsignal(rc); RE(0);
switch(id){
case CPLUS: if(!equ(e,num(0)))RZ(e=tymes(e,sc(n))); break;
case CSTAR: if(!equ(e,num(1) )&&!equ(e,num(0)))RZ(e=expn2(e,sc(n))); break;
case CEQ: ASSERT(B01&AT(x),EVNONCE); if(!BAV(e)[0]&&0==(n&1))e=num(1); break;
case CNE: ASSERT(B01&AT(x),EVNONCE); if( BAV(e)[0]&&1==(n&1))e=num(0);
}
if(TYPESNE(AT(e),AT(zx))){t=maxtypene(AT(e),AT(zx)); if(TYPESNE(t,AT(zx)))RZ(zx=cvt(t,zx));}
wr=AR(w); ws=AS(w);
GASPARSE(z,STYPE(AT(zx)),1,wr-1,ws); if(1<wr)MCISH(f+AS(z),f+1+ws,wr-1);
zp=PAV(z);
RZ(a=ca(a)); v=AV(a); DO(AN(a), if(f<v[i])--v[i];);
SPB(zp,a,a);
SPB(zp,e,cvt(AT(zx),e));
SPB(zp,i,SPA(wp,i));
SPB(zp,x,zx);
R z;
} /* f/"r w for sparse w, rank > 1, dense axis */
static B jtredspsprep(J jt,C id,I f,I zt,A a,A e,A x,A y,I*zm,I**zdv,B**zpv,I**zqv,C**zxxv,A*zsn){
A d,p,q,sn=0,xx;B*pv;C*xxv;I*dv,j,k,m,mm,*qv=0,*u,*v,yc,yr,yr1,*yv;
v=AS(y); yr=v[0]; yc=v[1]; yr1=yr-1;
RZ(d=grade1(eq(a,sc(f)))); dv=AV(d);
DO(AN(a), if(i!=dv[i]){RZ(q=grade1p(d,y)); qv=AV(q); break;});
GATV0(p,B01,yr,1); pv=BAV(p); mvc(yr,pv,1,MEMSET00);
u=yv=AV(y); m=mm=0; j=-1; if(qv)v=yv+yc*qv[0];
for(k=0;k<yr1;++k){
if(qv){u=v; v=yv+yc*qv[1+k];}else v=u+yc;
DO(yc-1, if(u[dv[i]]!=v[dv[i]]){++m; pv[k]=1; mm=MAX(mm,k-j); j=k; break;});
if(!qv)u+=yc;
}
if(yr){++m; pv[yr1]=1; mm=MAX(mm,yr1-j);}
if(qv){j=mm*aii(x); GA10(xx,AT(x),j); xxv=CAV(xx);}
switch(id){
case CPLUS: case CPLUSDOT: j=!equ(e,num(0)); break;
case CSTAR: case CSTARDOT: j=!equ(e,num(1)); break;
case CMIN: j=!equ(e,zt&B01?num(1) :zt&INT?sc(IMAX):ainf ); break;
case CMAX: j=!equ(e,zt&B01?num(0):zt&INT?sc(IMIN):scf(infm)); break;
case CEQ: j=!*BAV(e); break;
case CNE: j= *BAV(e); break;
}
if(j)GATV0(sn,INT,m,1);
*zm=m; *zdv=dv; *zpv=pv; *zqv=qv; *zxxv=xxv; *zsn=sn;
R 1;
}
static B jtredspse(J jt,C id,I wm,I xt,A e,A zx,A sn,A*ze,A*zzx){A b;B nz;I t,zt;
RZ(b=ne(num(0),sn)); nz=!all0(b); zt=AT(zx);
switch(id){
case CPLUS: if(nz)RZ(zx=plus (zx, tymes(e,sn) )); RZ(e= tymes(e,sc(wm)) ); break;
case CSTAR: if(nz)RZ(zx=tymes(zx,bcvt(1,expn2(e,sn)))); RZ(e=bcvt(1,expn2(e,sc(wm)))); break;
case CPLUSDOT: if(nz)RZ(zx=gcd(zx,from(b,over(num(0),e)))); break;
case CSTARDOT: if(nz)RZ(zx=lcm(zx,from(b,over(num(1),e)))); break;
case CMIN: if(nz)RZ(zx=minimum(zx,from(b,over(zt&B01?num(1): zt&INT?sc(IMAX):ainf, e)))); break;
case CMAX: if(nz)RZ(zx=maximum(zx,from(b,over(zt&B01?num(0):zt&INT?sc(IMIN):scf(infm),e)))); break;
case CEQ: ASSERT(B01&xt,EVNONCE); if(nz)RZ(zx=eq(zx,eq(num(0),residue(num(2),sn)))); if(!(wm&1))e=num(1); break;
case CNE: ASSERT(B01&xt,EVNONCE); if(nz)RZ(zx=ne(zx,eq(num(1), residue(num(2),sn)))); if(!(wm&1))e=num(0); break;
}
if(TYPESNE(AT(e),AT(zx))){t=maxtypene(AT(e),AT(zx)); if(TYPESNE(t,AT(zx)))RZ(zx=cvt(t,zx));}
*ze=e; *zzx=zx;
R 1;
}
static A jtredsps(J jt,A w,A self,C id,VARPSF ado,I cv,I f,I r,I zt){A a,a1,e,sn,x,x1=0,y,z,zx,zy;B*pv;
C*xv,*xxv,*zv;I*dv,i,m,n,*qv,*sv,*v,wr,xk,xt,wm,*ws,xc,yc,yr,*yu,*yv,zk;P*wp,*zp;
ARGCHK1(w);
ASSERT(strchr(fca,id),EVNONCE);
wr=AR(w); ws=AS(w); wm=ws[f];
wp=PAV(w); a=SPA(wp,a); e=SPA(wp,e);
y=SPA(wp,i); v=AS(y); yr=v[0]; yc=v[1]; yv=AV(y);
x=SPA(wp,x); xt=AT(x); xc=aii(x);
RZ(redspsprep(id,f,zt,a,e,x,y,&m,&dv,&pv,&qv,&xxv,&sn));
xv=CAV(x); xk=xc<<bplg(xt);
GA(zx,zt,m*xc,AR(x),AS(x)); AS(zx)[0]=m; zv=CAV(zx); zk=xc<<bplg(zt);
GATV0(zy,INT,m*(yc-1),2); v=AS(zy); v[0]=m; v[1]=yc-1; yu=AV(zy);
v=qv; if(sn)sv=AV(sn);
for(i=0;i<m;++i){A y;B*p1;C*u;I*vv;
p1=1+(B*)memchr(pv,C1,yr); n=p1-pv; if(sn)sv[i]=wm-n; pv=p1;
vv=qv?yv+yc**v:yv; DO(yc-1, *yu++=vv[dv[i]];);
if(1<n){if(qv){u=xxv; DO(n, MC(u,xv+xk*v[i],xk); u+=xk;);} I rc=((AHDRRFN*)ado)(xc,n,1L,qv?xxv:xv,zv,jt); if(255&rc)jsignal(rc); RE(0);}
else if(zk==xk)MC(zv,qv?xv+xk**v:xv,xk);
else {if(!x1)GA10(x1,xt,xc); MC(AV(x1),qv?xv+xk**v:xv,xk); RZ(y=cvt(zt,x1)); MC(zv,AV(y),zk);}
zv+=zk; if(qv)v+=n; else{xv+=n*xk; yv+=n*yc;}
}
if(sn)RZ(redspse(id,wm,xt,e,zx,sn,&e,&zx));
RZ(a1=ca(a)); v=AV(a1); n=0; DO(AN(a), if(f!=v[i])v[n++]=v[i]-(I )(f<v[i]););
GASPARSE(z,STYPE(AT(zx)),1,wr-1,ws); if(1<r)MCISH(f+AS(z),f+1+ws,r-1);
zp=PAV(z);
SPB(zp,a,vec(INT,n,v));
SPB(zp,e,cvt(AT(zx),e));
SPB(zp,x,zx);
SPB(zp,i,zy);
R z;
} /* f/"r w for sparse w, rank > 1, sparse axis */
static DF1(jtreducesp){A a,g,z;B b;I f,n,r,*v,wn,wr,*ws,wt,zt;P*wp;
ARGCHK1(w);J jtinplace=jt;
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; // no RESETRANK
wn=AN(w); ws=AS(w); n=r?ws[f]:1;
wt=AT(w); wt=wn?DTYPE(wt):B01;
g=VAV(self)->fgh[0]; // g is the f in f/
if(!n)R red0(w,self); // red0 uses ranks, and resets them
C id; if(AT(g)&VERB){id=FAV(g)->id; id=FAV(g)->flag&VISATOMIC2?id:0;}else id=0;
VARPS adocv; varps(adocv,self,wt,0);
if(2==n&&!(adocv.f&&strchr(fca,id))){
A x; IRS2(num(0),w,0L,0,r,jtfrom,x); A y; IRS2(num(1),w,0L,0,r,jtfrom,y);
R df2(z,x,y,g); // rank has been reset for this call
}
// original rank still set
if(!adocv.f)R redg(w,self);
if(1==n)R tail(w);
zt=rtype(adocv.cv);
RESETRANK;
if(1==wr)z=redsp1(w,self,id,adocv.f,adocv.cv,f,r,zt);
else{
wp=PAV(w); a=SPA(wp,a); v=AV(a);
b=0; DO(AN(a), if(f==v[i]){b=1; break;});
z=b?redsps(w,self,id,adocv.f,adocv.cv,f,r,zt):redspd(w,self,id,adocv.f,adocv.cv,f,r,zt);
}
R jt->jerr>=EWOV?IRS1(w,self,r,jtreducesp,z):z;
} /* f/"r for sparse w */
#define BR2CASE(t,id) ((((id)-CSTARCO)*7)+((0x160008>>(t))&7)) // unique inputs are 0 1 2 3 16 17 18-> 0 4 2 1 6 3 5 10110 .... .... .... 1000
// perform expression op on x and y, which have booleans in alternate bytes
// m is the number of bytes
#if SY_64
#define BOOLPAIRS(op) \
{I *u=(I*)wv,x,y; UI4 *zvui=(UI4*)zv; /* running pointer, operand areas */ \
DQ((m+SZI-1)>>(LGSZI-1), x=*u++; y=(x>>8); x=op; x&=ALTBYTES; x|=x>>8; x&=ALTSHORTS; x|=x>>16; x&=VALIDBOOLEAN; *zvui++=(UI4)x;)}
#define LITEQ(op) \
{I *u=(I*)wv,x,y; UI4 *zvui=(UI4*)zv; /* running pointer, operand areas */ \
DQ((m+SZI-1)>>(LGSZI-1), x=*u++; y=(x>>8); x&=ALTBYTES; y&=ALTBYTES; x=(op((x^y)+ALTBYTES)>>8)&(ALTBYTES&VALIDBOOLEAN); x|=x>>8; x&=ALTSHORTS; x|=x>>16; *zvui++=(UI4)x;)}
#else
#define BOOLPAIRS(op) \
{I *u=(I*)wv,x,y; US *zvus=(US*)zv; /* running pointer, operand areas */ \
DQ((m+SZI-1)>>(LGSZI-1), x=*u++; y=(x>>8); x=op; x&=ALTBYTES; x|=x>>8; x&=VALIDBOOLEAN; *zvus++=(US)x;)}
#define LITEQ(op) \
{I *u=(I*)wv,x,y; US *zvus=(US*)zv; /* running pointer, operand areas */ \
DQ((m+SZI-1)>>(LGSZI-1), x=*u++; y=(x>>8); x&=ALTBYTES; y&=ALTBYTES; x=(op((x^y)+ALTBYTES)>>8)&(ALTBYTES&VALIDBOOLEAN); x|=x>>8; *zvus++=(US)x;)}
#endif
#define TYPEDPAIRS(T,op) {T*u=(T*)wv,x,y; DQ(m, x=*u++; y=*u++; *zv++=op;)}
// compare literals for equality/inequality. op is ~ for ==
// handle reduction along final axis of length 2. cv is case# to run. Return allocated result
static A jtreduce2(J jt,A w,I cv,I f){A z=(A)1;B *zv;I m,*ws;
I wn=AN(w); I wr=AR(w); ws=AS(w); void *wv=voidAV(w);
PROD(m,f,ws); // number of cells, each 2 atoms
GATV(z,B01,wn>>1,wr-1,ws); zv=BAV(z);
switch(cv){
case BR2CASE(B01X,CEQ ): BOOLPAIRS(~x^y) break;
case BR2CASE(B01X,CNE ): BOOLPAIRS(x^y) break;
case BR2CASE(B01X,CLT ): BOOLPAIRS(~x&y) break;
case BR2CASE(B01X,CLE ): BOOLPAIRS(~x|y) break;
case BR2CASE(B01X,CGT ): BOOLPAIRS(x&~y) break;
case BR2CASE(B01X,CGE ): BOOLPAIRS(x|~y) break;
case BR2CASE(B01X,CMAX ):
case BR2CASE(B01X,CPLUSDOT): BOOLPAIRS(x|y) break;
case BR2CASE(B01X,CPLUSCO ): BOOLPAIRS(~(x|y)) break;
case BR2CASE(B01X,CMIN ):
case BR2CASE(B01X,CSTAR ):
case BR2CASE(B01X,CSTARDOT): BOOLPAIRS(x&y) break;
case BR2CASE(B01X,CSTARCO ): BOOLPAIRS(~(x&y)) break;
case BR2CASE(LITX,CEQ ): LITEQ(~); break;
case BR2CASE(LITX,CNE ): LITEQ(); break;
case BR2CASE(C2TX,CEQ ): TYPEDPAIRS(US,x==y); break;
case BR2CASE(C2TX,CNE ): TYPEDPAIRS(US,x!=y); break;
case BR2CASE(C4TX,CEQ ): TYPEDPAIRS(C4,x==y); break;
case BR2CASE(C4TX,CNE ): TYPEDPAIRS(C4,x!=y); break;
case BR2CASE(SBTX,CEQ ): TYPEDPAIRS(SB,x==y); break;
case BR2CASE(SBTX,CLT ): TYPEDPAIRS(SB,SBLT(x,y)); break;
case BR2CASE(SBTX,CLE ): TYPEDPAIRS(SB,SBLE(x,y)); break;
case BR2CASE(SBTX,CGT ): TYPEDPAIRS(SB,SBGT(x,y)); break;
case BR2CASE(SBTX,CGE ): TYPEDPAIRS(SB,SBGE(x,y)); break;
case BR2CASE(SBTX,CNE ): TYPEDPAIRS(SB,x!=y); break;
case BR2CASE(INTX,CEQ ): TYPEDPAIRS(I,x==y); break;
case BR2CASE(INTX,CLT ): TYPEDPAIRS(I,x<y); break;
case BR2CASE(INTX,CLE ): TYPEDPAIRS(I,x<=y); break;
case BR2CASE(INTX,CGT ): TYPEDPAIRS(I,x>y); break;
case BR2CASE(INTX,CGE ): TYPEDPAIRS(I,x>=y); break;
case BR2CASE(INTX,CNE ): TYPEDPAIRS(I,x!=y); break;
case BR2CASE(FLX, CEQ ): TYPEDPAIRS(D,TEQ(x,y)); break;
case BR2CASE(FLX, CLT ): TYPEDPAIRS(D,TLT(x,y)); break;
case BR2CASE(FLX, CLE ): TYPEDPAIRS(D,TLE(x,y)); break;
case BR2CASE(FLX, CGT ): TYPEDPAIRS(D,TGT(x,y)); break;
case BR2CASE(FLX, CGE ): TYPEDPAIRS(D,TGE(x,y)); break;
case BR2CASE(FLX, CNE ): TYPEDPAIRS(D,TNE(x,y)); break;
default: ASSERTSYS(0,"reduce2");
}
R z;
} /* f/"r for dense w over an axis of length 2; boolean results only */
#define TW0(x,y) ((BR2CASE(x,y)>>LGBW)==0?1LL<<(BR2CASE(x,y)&(BW-1)):0)
#define TW1(x,y) ((BR2CASE(x,y)>>LGBW)==1?1LL<<(BR2CASE(x,y)&(BW-1)):0)
#define TW2(x,y) ((BR2CASE(x,y)>>LGBW)==2?1LL<<(BR2CASE(x,y)&(BW-1)):0)
#define TW3(x,y) ((BR2CASE(x,y)>>LGBW)==3?1LL<<(BR2CASE(x,y)&(BW-1)):0)
#define TWV0 TW0(B01X,CEQ)+TW0(B01X,CNE)+TW0(B01X,CLT)+TW0(B01X,CLE)+TW0(B01X,CGT)+TW0(B01X,CGE)+TW0(B01X,CMAX)+TW0(B01X,CPLUSDOT)+TW0(B01X,CPLUSCO)+TW0(B01X,CMIN)+TW0(B01X,CSTAR)+TW0(B01X,CSTARDOT)+ \
TW0(B01X,CSTARCO)+TW0(LITX,CEQ)+TW0(LITX,CNE)+TW0(C2TX,CEQ)+TW0(C2TX,CNE)+TW0(C4TX,CEQ)+TW0(C4TX,CNE)+TW0(SBTX,CEQ)+TW0(SBTX,CLT)+TW0(SBTX,CLE)+TW0(SBTX,CGT)+TW0(SBTX,CGE)+TW0(SBTX,CNE)+ \
TW0(INTX,CEQ)+TW0(INTX,CLT)+TW0(INTX,CLE)+TW0(INTX,CGT)+TW0(INTX,CGE)+TW0(INTX,CNE)+TW0(FLX, CEQ)+TW0(FLX, CLT)+TW0(FLX, CLE)+TW0(FLX, CGT)+TW0(FLX, CGE)+TW0(FLX, CNE)
#define TWV1 TW1(B01X,CEQ)+TW1(B01X,CNE)+TW1(B01X,CLT)+TW1(B01X,CLE)+TW1(B01X,CGT)+TW1(B01X,CGE)+TW1(B01X,CMAX)+TW1(B01X,CPLUSDOT)+TW1(B01X,CPLUSCO)+TW1(B01X,CMIN)+TW1(B01X,CSTAR)+TW1(B01X,CSTARDOT)+ \
TW1(B01X,CSTARCO)+TW1(LITX,CEQ)+TW1(LITX,CNE)+TW1(C2TX,CEQ)+TW1(C2TX,CNE)+TW1(C4TX,CEQ)+TW1(C4TX,CNE)+TW1(SBTX,CEQ)+TW1(SBTX,CLT)+TW1(SBTX,CLE)+TW1(SBTX,CGT)+TW1(SBTX,CGE)+TW1(SBTX,CNE)+ \
TW1(INTX,CEQ)+TW1(INTX,CLT)+TW1(INTX,CLE)+TW1(INTX,CGT)+TW1(INTX,CGE)+TW1(INTX,CNE)+TW1(FLX, CEQ)+TW1(FLX, CLT)+TW1(FLX, CLE)+TW1(FLX, CGT)+TW1(FLX, CGE)+TW1(FLX, CNE)
#if !SY_64
#define TWV2 TW2(B01X,CEQ)+TW2(B01X,CNE)+TW2(B01X,CLT)+TW2(B01X,CLE)+TW2(B01X,CGT)+TW2(B01X,CGE)+TW2(B01X,CMAX)+TW2(B01X,CPLUSDOT)+TW2(B01X,CPLUSCO)+TW2(B01X,CMIN)+TW2(B01X,CSTAR)+TW2(B01X,CSTARDOT)+ \
TW2(B01X,CSTARCO)+TW2(LITX,CEQ)+TW2(LITX,CNE)+TW2(C2TX,CEQ)+TW2(C2TX,CNE)+TW2(C4TX,CEQ)+TW2(C4TX,CNE)+TW2(SBTX,CEQ)+TW2(SBTX,CLT)+TW2(SBTX,CLE)+TW2(SBTX,CGT)+TW2(SBTX,CGE)+TW2(SBTX,CNE)+ \
TW2(INTX,CEQ)+TW2(INTX,CLT)+TW2(INTX,CLE)+TW2(INTX,CGT)+TW2(INTX,CGE)+TW2(INTX,CNE)+TW2(FLX, CEQ)+TW2(FLX, CLT)+TW2(FLX, CLE)+TW2(FLX, CGT)+TW2(FLX, CGE)+TW2(FLX, CNE)
#define TWV3 TW3(B01X,CEQ)+TW3(B01X,CNE)+TW3(B01X,CLT)+TW3(B01X,CLE)+TW3(B01X,CGT)+TW3(B01X,CGE)+TW3(B01X,CMAX)+TW3(B01X,CPLUSDOT)+TW3(B01X,CPLUSCO)+TW3(B01X,CMIN)+TW3(B01X,CSTAR)+TW3(B01X,CSTARDOT)+ \
TW3(B01X,CSTARCO)+TW3(LITX,CEQ)+TW3(LITX,CNE)+TW3(C2TX,CEQ)+TW3(C2TX,CNE)+TW3(C4TX,CEQ)+TW3(C4TX,CNE)+TW3(SBTX,CEQ)+TW3(SBTX,CLT)+TW3(SBTX,CLE)+TW3(SBTX,CGT)+TW3(SBTX,CGE)+TW3(SBTX,CNE)+ \
TW3(INTX,CEQ)+TW3(INTX,CLT)+TW3(INTX,CLE)+TW3(INTX,CGT)+TW3(INTX,CGE)+TW3(INTX,CNE)+TW3(FLX, CEQ)+TW3(FLX, CLT)+TW3(FLX, CLE)+TW3(FLX, CGT)+TW3(FLX, CGE)+TW3(FLX, CNE)
#endif
static DF1(jtreduce){A z;I d,f,m,n,r,t,wr,*ws,zt;
F1PREFIP;ARGCHK1(w);
if(unlikely(ISSPARSE(AT(w))))R reducesp(w,self); // If sparse, go handle it
wr=AR(w); ws=AS(w);
// Create r: the effective rank; f: length of frame; n: # items in a CELL of w
r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; SETICFR(w,f,r,n); // no RESETRANK
// Handle the special cases: neutrals, single items, lists of length 2
I wt=AT(w); wt=AN(w)?wt:B01; // Treat empty as Boolean type
if(unlikely(n<=2)){
if(unlikely(n==1))R head(w); // 1 item: the result is the item. Rank is still set
if(unlikely(n==0))R red0(w,self); // 0 item: return a neutral
if(unlikely(r==1))if(likely(wt&B01+LIT+INT+FL+SBT+C2T+C4T)){ // 2 items: special processing only if the operation is on rank 1: then we avoid loop overheads
C id=FAV(FAV(self)->fgh[0])->id;
if(unlikely(BETWEENC(id,CSTARCO,CMAX))){ // only boolean results are supported
I cv=BR2CASE(CTTZ(wt),id); UI cwd=TWV0; cwd=(cv>>LGBW)==1?TWV1:cwd; // figure the case, and see if it is one of those in the big macros above
#if !SY_64
cwd=(cv>>LGBW)==2?TWV2:cwd; cwd=(cv>>LGBW)==3?TWV3:cwd;
#endif
if(likely(((1LL<<(cv&(BW-1)))&cwd)!=0))R jtreduce2(jt,w,cv,f);
}
} // fall through for 2 items that can't be handled specially
}
// Normal case, >1 item.
// The case of empty w is interesting, because the #cells, and the #atoms in an item of a cell, may both overflow if
// n=0. But we have handled that case above. If n is not 0, there may be other zeros in the shape that allow
// an overflow when an infix of the shape is multiplied; but that won't matter because the other 0 will guarantee that there
// are no atoms written
// Normal processing for multiple items. Get the routine & flags to process it
VARPS adocv; varps(adocv,self,wt,0);
// If there is no special routine, go perform general reduce
if(!adocv.f)R redg(w,self); // jt->ranks is still set. redg will clear the ranks
// Here for primitive reduce handled by special code.
// Calculate m: #cells of w to operate on; d: #atoms in an item of a cell of w (a cell to which u is applied);
// zn: #atoms in result
PROD(d,r-1,f+ws+1); // */ }. $ cell
// m=*/ frame (i. e. #cells to operate on)
// r cannot be 0 (would be handled above). Calculate low part of zn first
PROD(m,f,ws);
RESETRANK; // clear rank now that we've used it - not really required here?
// Allocate the result area
zt=rtype(adocv.cv); zt=(adocv.cv&VRESMSK)==0?wt:zt; // Use specified type if given, otherwise use type of argument
GA(z,zt,m*d,MAX(0,wr-1),ws); if(1<r)MCISH(f+AS(z),f+1+ws,r-1); // allocate, and install shape
if(m*d==0){RETF(z);} // mustn't call the function on an empty argument!
// Convert inputs if needed
if((t=atype(adocv.cv))&&TYPESNE(t,wt))RZ(w=cvt(t,w));
// call the selected reduce routine.
I rc=((AHDRRFN*)adocv.f)(d,n,m,AV(w),AV(z),jt);
// if return is EWOV, it's an integer overflow and we must restart, after restoring the ranks
// EWOV1 means that there was an overflow on a single result, which was calculated accurately and stored as a D. So in that case all we
// have to do is change the type of the result.
if(unlikely((255&~EVNOCONV)&rc)){if(unlikely(rc==EVNOCONV))R z; if(jt->jerr==EWOV1){AT(z)=FL;RETF(z);}else {jsignal(rc); RETF(rc>=EWOV?IRS1(w,self,r,jtreduce,z):0);}} else {RETF((adocv.cv&VRI+VRD)&&rc!=EVNOCONV?cvz(adocv.cv,z):z);}
} /* f/"r w main control */
static A jtredcatsp(J jt,A w,A z,I r){A a,q,x,y;B*b;I c,d,e,f,j,k,m,n,n1,p,*u,*v,wr,*ws,xr;P*wp,*zp;
ws=AS(w); wr=AR(w); f=wr-r; p=ws[1+f];
wp=PAV(w); x=SPA(wp,x); y=SPA(wp,i); a=SPA(wp,a); v=AV(a);
m=AS(y)[0]; n=AN(a); n1=n-1; xr=AR(x);
RZ(b=bfi(wr,a,1));
c=b[f]; d=b[1+f]; if(c&&d)b[f]=0; e=f+!c;
j=0; DO(n, if(e==v[i]){j=i; break;});
k=1; DO(f, if(!b[i])++k;);
zp=PAV(z); SPB(zp,e,ca(SPA(wp,e)));
GATV0(q,INT,n-(I )(c&&d),1); v=AV(q); DO(wr, if(b[i])*v++=i-(I )(i>f);); SPB(zp,a,q);
if(c&&d){ /* sparse sparse */
SPB(zp,x,ca(x));
SPB(zp,i,repeatr(ne(a,sc(f)),y)); q=SPA(zp,i); // allow for virtualization of SPB
v=j+AV(q); u=j+AV(y);
DQ(m, *v=p*u[0]+u[1]; v+=n1; u+=n;);
}else if(!c&&!d){ /* dense dense */
u=AS(x); GA(q,AT(x),AN(x),xr-1,u); v=k+AS(q); *v=u[k]*u[1+k]; MCISH(1+v,2+k+u,xr-k-2);
MC(AV(q),AV(x),AN(x)<<bplg(AT(x)));
SPB(zp,x,q); SPB(zp,i,ca(y));
}else{ /* other */
GATV0(q,INT,xr,1); v=AV(q);
if(1!=k){*v++=0; *v++=k; e=0; DQ(xr-1, ++e; if(e!=k)*v++=e;); RZ(x=cant2(q,x));}
v=AV(q); u=AS(x); *v=u[0]*u[1]; MCISH(1+v,2+u,xr-1); RZ(x=reshape(vec(INT,xr-1,v),x));
e=ws[f+c]; RZ(y=repeat(sc(e),y)); RZ(y=mkwris(y)); v=j+AV(y);
if(c)DO(m, k=p**v; DO(e, *v=k+ i; v+=n;);)
else DO(m, k= *v; DO(e, *v=k+p*i; v+=n;););
RZ(q=grade1(y)); RZ(y=from(q,y)); RZ(x=from(q,x));
SPB(zp,x,x); SPB(zp,i,y);
}
R z;
} /* ,/"r w for sparse w, 2<r */
// ,&.:(<"r) run together all axes above the last r. r must not exceed AR(w)-1
// w must not be sparse or empty
A jtredcatcell(J jt,A w,I r){A z;
F1PREFIP;ARGCHK1(w);
I wr=AR(w); // get original rank, which may change if we inplace into the same block
if(r>=wr-1)R RETARG(w); // if only 1 axis left to run together, return the input
if((ASGNINPLACESGN(SGNIF(jtinplace,JTINPLACEWX)&(-r),w) && !(AFLAG(w)&AFUNINCORPABLE))){ // inplace allowed, usecount is right
// operation is loosely inplaceable. Just shorten the shape to frame,(#atoms in cell). We do this here rather than relying on
// the self-virtual-block code in virtual() because we can do it for indirect types also, since we know we are not changing
// the number of atoms
z=w; AR(z)=(RANKT)(r+1); // inplace it, install new rank
}else{
RZ(z=virtual(w,0,r+1)); AN(z)=AN(w); // create virtual block
}
I m; PROD(m,wr-r,AS(w)); // # cells being strung together
AS(z)[0]=m; MCISH(AS(z)+1,AS(w)+wr-r,r); // install # cells and then shift the last r axes back
R z;
}
DF1(jtredcat){A z;B b;I f,r,*s,*v,wr;
F1PREFIP;ARGCHK1(w);
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; s=AS(w); RESETRANK;
b=1==r&&1==s[f]; // special case: ,/ on last axis which has length 1: in that case, the rules say the axis disappears (because of the way ,/ works on length-1 lists)
if(2>r&&!b)RCA(w); // in all OTHER cases, result=input for ranks<2
// use virtual block (possibly self-virtual) for all cases except sparse
if(likely(!ISSPARSE(AT(w)))){
RZ(z=jtvirtual(jtinplace,w,0,wr-1)); AN(z)=AN(w); // Allocate the block. Then move in AN and shape
I *zs=AS(z); MCISH(zs,s,f); if(!b){DPMULDE(s[f],s[f+1],zs[f]); MCISH(zs+f+1,s+f+2,r-2);}
R z;
}else{
GASPARSE(z,AT(w),AN(w),wr-1,s);
if(!b){v=f+AS(z); DPMULDE(s[f],s[1+f],*v); MCISH(1+v,2+f+s,r-2);}
R redcatsp(w,z,r);
}
} /* ,/"r w */
static DF1(jtredsemi){I f,n,r,wr;
ARGCHK1(w);
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; SETICFR(w,f,r,n); // let the rank run into tail n=#items in a cell of w
if(2>n){ASSERT(n!=0,EVDOMAIN); R tail(w);} // rank still set
if(BOX&AT(w))R jtredg(jt,w,self); // the old way failed because it did not mimic scalar replication; revert to the long way. ranks are still set
else{A z; R IRS1(w,0L,r-1,jtbox,z);} // unboxed, just box the cells
} /* ;/"r w */
static DF1(jtredstitch){A c,y;I f,n,r,*s,*v,wr;
ARGCHK1(w);
wr=AR(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; RESETRANK;
s=AS(w); SETICFR(w,f,r,n);
ASSERT(n!=0,EVDOMAIN);
if(1==n)R IRS1(w,0L,r,jthead,y);
if(1==r){if(2==n)R RETARG(w); A z1,z2,z3; RZ(IRS2(num(-2),w,0L,0L,1L,jtdrop,z1)); RZ(IRS2(num(-2),w,0L,0L,1L,jttake,z2)); R IRS2(z1,z2,self,1L,0L,jtover,z3);}
if(2==r)R IRS1(w,0L,2L,jtcant1,y);
RZ(c=apvwr(wr,0L,1L)); v=AV(c); v[f]=f+1; v[f+1]=f; RZ(y=cant2(c,w)); // transpose last 2 axes
if(unlikely(ISSPARSE(AT(w)))){A x;
GATV0(x,INT,f+r-1,1); v=AV(x); MCISH(v,AS(y),f+1);
DPMULDE(s[f],s[f+2],v[f+1]); MCISH(v+f+2,s+3+f,r-3);
RETF(reshape(x,y));
}else{
v=AS(y);
DPMULDE(s[f],s[f+2],v[f+1]); MCISH(v+f+2,s+3+f,r-3);
--AR(y);
RETF(y);
}} /* ,./"r w */
static DF1(jtredstiteach){A*wv,y;I n,p,r,t;
ARGCHK1(w);
n=AN(w);
if(!(2<n&&1==AR(w)&&BOX&AT(w)))R reduce(w,self);
wv=AAV(w); y=C(wv[0]); SETIC(y,p); t=AT(y);
DO(n, y=C(wv[i]); r=AR(y); if(!((((r-1)&-2)==0)&&p==SETIC(y,n)&&TYPESEQ(t,AT(y))))R reduce(w,self);); // rank 1 or 2, rows match, equal types
R box(razeh(w));
} /* ,.&.>/ w */
static DF1(jtredcateach){A*u,*v,*wv,x,*xv,z,*zv;I f,m,mn,n,r,wr,*ws,zm,zn;I n1=0,n2=0;
ARGCHK1(w);
wr=AR(w); ws=AS(w); r=(RANKT)jt->ranks; r=wr<r?wr:r; f=wr-r; RESETRANK;
SETICFR(w,f,r,n);
if(!r||1>=n)R reshape(repeat(ne(sc(f),IX(wr)),shape(w)),n?w:ds(CACE));
if(!(BOX&AT(w))){A t; RZ(t=cant2(sc(f),w)) R IRS1(t,0,1,jtbox,z);} // handle unboxed args by transposing on the given axis and then boxing lists
// bug: ,&.>/ y does scalar replication wrong
// wv=AN(w)+AAV(w); DQ(AN(w), if(AN(*--wv)&&AR(*wv)&&n1&&n2) ASSERT(0,EVNONCE); if((!AR(*wv))&&n1)n2=1; if(AN(*wv)&&1<AR(*wv))n1=1;);
zn=AN(w)/n; PROD(zm,f,ws); PROD(m,r-1,ws+f+1); mn=m*n;
GATV(z,BOX,zn,wr-1,ws); MCISH(AS(z)+f,ws+f+1,r-1);
GATV0(x,BOX,n,1); xv=AAV(x);
zv=AAV(z); wv=AAV(w);
DO(zm, u=wv; DO(m, v=u++; DO(n, xv[i]=*v; v+=m;); A Zz; RZ(Zz=raze(x)); INCORP(Zz); *zv++ = Zz;); wv+=mn;); // no need to incorp *v since it's already boxed