-
Notifications
You must be signed in to change notification settings - Fork 90
/
j.h
2706 lines (2498 loc) · 153 KB
/
j.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright (c) 1990-2024, Jsoftware Inc. All rights reserved. */
/* Licensed use only. Any other use is in violation of copyright. */
/* */
/* Global Definitions */
#if defined(__clang_major__) && !defined(__clang__)
#error need workaround by define __clang__ in preprocessor macro
#endif
/* clang-cl */
#if defined(__clang__) && !defined(__GNUC__)
#define __GNUC__ 4
#undef __GNUC_MINOR__
#define __GNUC_MINOR__ 2
#undef __GNUC_PATCHLEVEL__
#define __GNUC_PATCHLEVEL__ 1
#endif
// ms vc++ defined _MSC_VER but clang-cl also defined _MSC_VER
// clang-cl doesn't emulate ms vc++ well enough
// and it breaks program logic previously guarded by _MSC_VER
// MMSC_VER means the real ms vc++ excluding clang-cl
// use MMSC_VER instead of _MSC_VER throughout JE source
#if defined(_MSC_VER) && !defined(__clang__)
#undef MMSC_VER
#define MMSC_VER
#else
#undef MMSC_VER
#endif
#if !defined(MMSC_VER)
#include <stddef.h> // offsetof
#endif
/* msvc does not define __SSE2__ */
#if !defined(__SSE2__)
#if defined(MMSC_VER)
#if (defined(_M_AMD64) || defined(_M_X64))
#define __SSE2__ 1
#elif _M_IX86_FP==2
#define __SSE2__ 1
#endif
#endif
#endif
#if defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#endif
// for debugging
#define NANTEST0 (fetestexcept(FE_INVALID)) // test but does not clear
#define dump_m128i(a,x) {__m128i _b=x;fprintf(stderr,"%s %x %x %x %x \n", a, ((unsigned int*)(&_b))[0], ((unsigned int*)(&_b))[1], ((unsigned int*)(&_b))[2], ((unsigned int*)(&_b))[3]);}
#define dump_m128i64(a,x) {__m128i _b=x;fprintf(stderr,"%s %lli %lli \n", a, ((long long*)(&_b))[0], ((long long*)(&_b))[1]);}
#define dump_m256i(a,x) {__m256i _b=x;fprintf(stderr,"%s %lli %lli %lli %lli \n", a, ((long long*)(&_b))[0], ((long long*)(&_b))[1], ((long long*)(&_b))[2], ((long long*)(&_b))[3]);}
#define dump_m256i16(a,x) {__m256i _b=x;fprintf(stderr,"%s %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x \n", a, ((unsigned short*)(&_b))[0], ((unsigned short*)(&_b))[1], ((unsigned short*)(&_b))[2], ((unsigned short*)(&_b))[3], ((unsigned short*)(&_b))[4], ((unsigned short*)(&_b))[5], ((unsigned short*)(&_b))[6], ((unsigned short*)(&_b))[7],((unsigned short*)(&_b))[8], ((unsigned short*)(&_b))[9], ((unsigned short*)(&_b))[10], ((unsigned short*)(&_b))[11], ((unsigned short*)(&_b))[12], ((unsigned short*)(&_b))[13], ((unsigned short*)(&_b))[14], ((unsigned short*)(&_b))[15]);}
#define dump_m256i32(a,x) {__m256i _b=x;fprintf(stderr,"%s %x %x %x %x %x %x %x %x \n", a, ((unsigned int*)(&_b))[0], ((unsigned int*)(&_b))[1], ((unsigned int*)(&_b))[2], ((unsigned int*)(&_b))[3], ((unsigned int*)(&_b))[4], ((unsigned int*)(&_b))[5], ((unsigned int*)(&_b))[6], ((unsigned int*)(&_b))[7]);}
#define dump_m256d(a,x) {__m256d _b=x;fprintf(stderr,"%s %f %f %f %f \n", a, ((double*)(&_b))[0], ((double*)(&_b))[1], ((double*)(&_b))[2], ((double*)(&_b))[3]);}
#define dump_m128d(a,x) {__m128d _b=x;fprintf(stderr,"%s %f %f \n", a, ((double*)(&_b))[0], ((double*)(&_b))[1]);}
#ifdef MMSC_VER
#define NOINLINE __declspec(noinline)
#define INLINE __forceinline
#else
#define NOINLINE __attribute__((noinline))
#define INLINE inline __attribute__((__always_inline__))
#endif
#ifdef __MINGW32__
// original definition
// #define INLINE extern __inline__ __attribute__((__always_inline__,__gnu_inline__))
#define INLINE __inline__ __attribute__((__always_inline__,__gnu_inline__))
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
#ifndef C_AVX512
#define C_AVX512 0
#endif
#ifndef C_AVX2
#define C_AVX2 0
#endif
#if C_AVX512
#undef C_AVX2
#define C_AVX2 1
#endif
#else
#undef C_AVX512
#define C_AVX512 0
#undef C_AVX2
#define C_AVX2 0
#endif
#ifdef _WIN32
#if EMU_AVX2 || C_AVX2
#ifndef _WIN64
#error not 64-bit compiler
#endif
#endif
#endif
#if C_AVX2
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
#include <immintrin.h>
#endif
#endif
#if !defined(EMU_AVX2) && ((defined(__SSE2__) && defined(__x86_64__)) || defined(__aarch64__) || defined(_M_ARM64))
#undef EMU_AVX2
#define EMU_AVX2 1
#endif
// no EMU_AVX512; avx512 is not widespread yet, and older chips still downclock (so not worth it for small arrays), so still maintain avx2-specific paths
#if C_AVX2
#undef EMU_AVX2
#define EMU_AVX2 0
#elif defined(__SSE2__) && defined(__x86_64__)
#if EMU_AVX2
#include <stdint.h>
#include <string.h>
#include "avxintrin-emu.h"
//#include "avx2intrin-emu.h"
#else
#include <emmintrin.h>
#endif
#define _CMP_EQ 0
#define _CMP_LT 1
#define _CMP_LE 2
#define _CMP_UNORD 3
#define _CMP_NEQ 4
#define _CMP_NLT 5
#define _CMP_NLE 6
#define _CMP_ORD 7
#undef _CMP_EQ_OQ
#undef _CMP_GE_OQ
#undef _CMP_GT_OQ
#undef _CMP_LE_OQ
#undef _CMP_LT_OQ
#undef _CMP_NEQ_OQ
#define _CMP_EQ_OQ _CMP_EQ
#define _CMP_GE_OQ _CMP_NLT
#define _CMP_GT_OQ _CMP_NLE
#define _CMP_LE_OQ _CMP_LE
#define _CMP_LT_OQ _CMP_LT
#define _CMP_NEQ_OQ _CMP_NEQ
#endif //__SSE2__
#if defined(__aarch64__)||defined(_M_ARM64)
#if EMU_AVX2
#include <stdint.h>
#include <string.h>
#include "sse2neon.h"
#include "sse2neon2.h"
#include "avxintrin-neon.h"
#else
#include <arm_neon.h>
#endif
#endif
#if SLEEF && !defined(_CMP_EQ)
#define _CMP_EQ 0
#define _CMP_LT 1
#define _CMP_LE 2
#define _CMP_UNORD 3
#define _CMP_NEQ 4
#define _CMP_NLT 5
#define _CMP_NLE 6
#define _CMP_ORD 7
#undef _CMP_EQ_OQ
#undef _CMP_GE_OQ
#undef _CMP_GT_OQ
#undef _CMP_LE_OQ
#undef _CMP_LT_OQ
#undef _CMP_NEQ_OQ
#define _CMP_EQ_OQ _CMP_EQ
#define _CMP_GE_OQ _CMP_NLT
#define _CMP_GT_OQ _CMP_NLE
#define _CMP_LE_OQ _CMP_LE
#define _CMP_LT_OQ _CMP_LT
#define _CMP_NEQ_OQ _CMP_NEQ
#endif
#if defined(__arm__)
#if defined(__ARM_NEON)
#include <arm_neon.h>
typedef double float64x2_t __attribute__ ((vector_size (16)));
#else
#include <stdint.h>
typedef int64_t int64x2_t __attribute__ ((vector_size (16)));
typedef double float64x2_t __attribute__ ((vector_size (16)));
#endif
#endif
#undef VOIDARG
#define VOIDARG
#if C_AVX512
#if (!defined(__clang__)) && defined(__GNUC__) && __GNUC__ < 10
// Backfill for _mm512_loadu_epi64: per the guard above, GCC versions before 10
// do not ship this intrinsic, so we define it the same way the newer GCC
// avx512f headers do. Performs an unaligned 64-byte load (eight I64 lanes) from __P.
static __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_epi64 (void const *__P)
{
// __packed__ drops the 64-byte alignment requirement of __m512i;
// __may_alias__ exempts this access from strict-aliasing rules so that
// __P may legally point to data of any type.
struct __loadu_epi64 {
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
return ((const struct __loadu_epi64*)__P)->__v;
}
// Backfill for _mm512_storeu_epi64 (missing from GCC before 10, see guard above).
// Performs an unaligned 64-byte store (eight I64 lanes) of __A to __P.
static __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_epi64 (void *__P, __m512i __A)
{
// __packed__ drops the alignment requirement; __may_alias__ exempts the
// store from strict-aliasing analysis (same technique as the load above).
struct __storeu_epi64 {
__m512i_u __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_epi64*)__P)->__v = __A;
}
#endif
#endif
#if defined(__AVX2__) || defined(__aarch64__) // note can't do #if x defined(y)
#define HASFMA 1 // true if architecture has hardware FMA capacity with AVX2 instructions
#else
#define HASFMA 0
#endif
#if SLEEF
#include "../sleef/include/sleef.h"
#undef SLEEFQUAD
#define SLEEFQUAD 1
#include "../sleef/include/sleefquad.h"
#elif SLEEFQUAD
#include "../sleef/include/sleefquad.h"
#endif
#if defined(_OPENMP)
#include <omp.h>
#else
// Single-threaded stand-ins for the OpenMP runtime, used when the compiler
// was built without OpenMP support (_OPENMP undefined, so <omp.h> is absent).
// They report what a one-thread OpenMP runtime would: thread 0 of a team of 1.
// Note: (void) prototypes — a bare () in C declares *unspecified* parameters,
// not an empty parameter list.
typedef int omp_int_t;
static inline omp_int_t omp_get_thread_num(void) { return 0;}  // calling thread is always the master, id 0
static inline omp_int_t omp_get_max_threads(void) { return 1;} // at most one thread available
static inline omp_int_t omp_get_num_threads(void) { return 1;} // current team size is always 1
#endif
#ifndef SYS // include js.h only once - dtoa.c
#include "js.h"
#endif
// todo look into whether windows supports this or not; I have heard support is spotty?
#if C_AVX512 && (SY_FREEBSD || SY_LINUX)
#define C_FSGSBASE 1
#else
#define C_FSGSBASE 0
#endif
// If you are porting to a new compiler or architecture, see the bottom of this file
// for instructions on defining the CTTZ macros
#if SY_WINCE
#include "..\cesrc\cecompat.h"
#endif
#if (SYS & SYS_PCWIN)
#define HEAPCHECK heapcheck()
#else
#define HEAPCHECK
#endif
#if (SYS & SYS_ATARIST)
#define __NO_INLINE__ 1
#endif
#if (SYS & SYS_UNIX - SYS_SGI)
#include <unistd.h>
#include <memory.h>
#include <sys/types.h>
#endif
// likely/unlikely support
#if defined(__clang__) || defined(__GNUC__)
#ifndef likely
#define likely(x) __builtin_expect(!!(x),1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x),0)
#endif
#if defined(_WIN32) || defined(__clang__) || __GNUC__ > 9
#if (defined(__has_builtin) && __has_builtin(__builtin_expect_with_probability)) || (!defined(__clang__) && __GNUC__ >= 9)
#define common(x) __builtin_expect_with_probability(!!(x),1,0.6)
#define uncommon(x) __builtin_expect_with_probability(!!(x),1,0.4)
#define withprob(x,p) __builtin_expect_with_probability(!!(x),1,(p))
#else
#define common(x) likely(x)
#define uncommon(x) unlikely(x)
#define withprob(x,p) (x)
#endif
#else
#define common(x) likely(x)
#define uncommon(x) unlikely(x)
#define withprob(x,p) (x)
#endif
#else
#define likely(x) (!!(x))
#define unlikely(x) (!!(x))
#define common(x) (!!(x))
#define uncommon(x) (!!(x))
#define withprob(x,p) (x)
#endif
#include <stdint.h>
#include <float.h>
#include <limits.h>
#define link unused_syscall_link
#define qdiv unused_netbsd_qdiv
#ifndef __USE_XOPEN2K
#define __USE_XOPEN2K // for posix_memalign
#endif
#include <stdlib.h>
#undef link
#undef qdiv
#if ! SY_WINCE
#include <errno.h>
#include <stdio.h>
#endif
#include <math.h>
#include <string.h>
#ifdef ANDROID
#include <android/log.h>
#define logcat_d(msg) __android_log_write(ANDROID_LOG_DEBUG,(const char*)"libj",msg)
#endif
#if defined(__APPLE__)
#include <TargetConditionals.h>
#if TARGET_OS_IPHONE||TARGET_OS_IOS||TARGET_OS_TV||TARGET_OS_WATCH||TARGET_OS_SIMULATOR||TARGET_OS_EMBEDDED||TARGET_IPHONE_SIMULATOR
#define TARGET_IOS 1
#endif
#endif
#if defined(__aarch32__)||defined(__arm__)||defined(_M_ARM)||defined(__aarch64__)||defined(_M_ARM64)
#ifndef __ARM_FEATURE_UNALIGNED
#define ALIGNEDMEM
#endif
#endif
#if SY_WIN32
#if defined(_WIN32) && !defined(OLECOM)
#define OLECOM
#endif
#endif
/* IEEE 754 constants that are not defined in float.h */
// D_MANT_BITS_N is number of bits of mantissa in bit representation.
// Its value is DBL_MANT_DIG - 1, because the first digit does not occur in bit representation (always 1 for normalized numbers).
#define D_MANT_BITS_N 52
// D_EXP_BITS_N is number of bits of exponent in bit representation.
#define D_EXP_BITS_N 11
// 1 - D_EXP_MAX <= exponent <= D_EXP_MAX. In bit representation sum of exponent and D_EXP_MAX is stored which is positive. D_EXP_MAX = DBL_MAX_EXP - 1.
#define D_EXP_MAX 1023
// D_EXP_MIN = 1 - D_EXP_MAX
#define D_EXP_MIN (-1022)
// Bit mask of exponent is (UI8)((1 << D_EXP_BITS_N) - 1) << D_MANT_BITS_N. This is also bit representation of +Inf.
#define D_EXP_MSK 0x7ff0000000000000LL
#define D_MANT_MSK 0x000fffffffffffffLL
// Bit mask of double 1 (mantissa is 0 and exponent is D_EXP_MAX).
#define D_ONE_MSK 0x3ff0000000000000LL
#if SY_64
#define IMAX 9223372036854775807LL
#define IMAXPRIME 9223372036854775783LL
#define IMIN (~9223372036854775807LL) /* ANSI C LONG_MIN is -LONG_MAX */
#define FLIMAX 9223372036854775296. // largest FL value that can be converted to I
#define FLIMIN ((D)IMIN) // smallest FL value that can be converted to I
#define FMTI "%lli"
#define FMTI02 "%02lli"
#define FMTI04 "%04lli"
#define FMTI05 "%05lli"
#if defined(MMSC_VER) // SY_WIN32
#define strtoI _strtoi64
#else
#define strtoI strtoll
#endif
#else
#define IMAX 2147483647L
#define IMAXPRIME IMAX
#define IMIN (~2147483647L) /* ANSI C LONG_MIN is -LONG_MAX */
#define FLIMAX ((D)IMAX+0.4) // largest FL value that can be converted to I
#define FLIMIN ((D)IMIN) // smallest FL value that can be converted to I
#define FMTI "%d"
#define FMTI02 "%02d"
#define FMTI04 "%04d"
#define FMTI05 "%05d"
#define strtoI strtol
#endif
#define NEGATIVE0 (UIL)0x8000000000000000LL // IEEE -0 (double precision)
#define C4MAX 0xffffffffUL
#define C4MIN 0L
#if (SYS & SYS_AMIGA)
#define XINF "\177\377\000\000\000\000\000\000"
#define XNAN "\177\361\000\000\000\000\000\000"
#endif
#if (SYS & SYS_ARCHIMEDES)
#define XINF "\000\000\360\177\000\000\000\000"
#define XNAN "\000\000\370\377\000\000\000\000"
#endif
#if (SYS & SYS_DEC5500) || SY_WINCE_SH
#define XINF "\000\000\000\000\000\000\360\177"
#define XNAN "\000\000\000\000\000\000\370\377"
#endif
#if (SYS & SYS_MACINTOSH)
/* for old versions of ThinkC */
/* #define XINF "\177\377\000\000\000\000\000\000\000\000\000\000" */
/* #define XNAN "\377\377\100\000\100\000\000\000\000\000\000\000" */
/* for ThinkC 7.0 or later */
#define XINF "\177\377\177\377\000\000\000\000\000\000\000\000"
#define XNAN "\377\377\377\377\100\000\000\000\000\000\000\000"
#endif
#if (SYS & SYS_SUN4+SYS_SUNSOL2)
#define XINF "\177\360\000\000\000\000\000\000"
#define XNAN "\177\377\377\377\377\377\377\377"
#endif
#if (SYS & SYS_VAX)
#define XINF "\377\177\377\377\377\377\377\377"
#define XNAN "\377\177\377\377\377\377\377\376" /* not right */
#endif
#if (SY_WINCE_MIPS || SY_WINCE_SH)
#if WIN32_PLATFORM_PSPC
#define XINF "\000\000\000\000\000\000\360\177"
#define XNAN "\377\377\377\377\377\377\367\177"
#else
#define XINF "\000\000\000\000\000\000\360\177"
#define XNAN "\001\000\000\000\000\000\360\177"
#endif
#endif
#if SY_WINCE_ARM
#define XINF "\000\000\000\000\000\000\360\177"
#define XNAN "\000\000\000\000\000\000\370\177"
#endif
#if C_LE
#ifndef XINF
#define XINF "\000\000\000\000\000\000\360\177"
#define XNAN "\000\000\000\000\000\000\370\377"
#endif
#endif
#ifndef XINF
#define XINF "\177\360\000\000\000\000\000\000"
#define XNAN "\177\370\000\000\000\000\000\000"
#endif
#ifndef PI
#define PI ((D)3.14159265358979323846)
#endif
#define P2 ((D)6.28318530717958647693)
#ifndef OVERFLOW
#define OVERFLOW ((D)8.988465674311578e307)
#endif
#ifndef UNDERFLOW
#define UNDERFLOW ((D)4.450147717014403e-308)
#endif
// RESTRICT causes the compiler to generate better code by assuming no overlap of regions pointed to by pointers
// We use RESTRICT for routines that operate in-place on an argument. This is strictly speaking a violation of the rule,
// but normally something like *z = *x + *y will not cause trouble because there is no reason to refetch an input after
// the result has been written. On 32-bit machines, registers are so short that sometimes the compilers refetch an input
// after writing to *z, so we don't turn RESTRICT on for 32-bit
#if defined(MMSC_VER)
// RESTRICT is an attribute of a pointer, and indicates that no other pointer points to the same area
#define RESTRICT __restrict
// RESTRICTF is an attribute of a function, and indicates that the object returned by the function is not aliased with any other object
#define RESTRICTF __declspec(restrict)
#define PREFETCH(x) _mm_prefetch((x),_MM_HINT_T0)
#define PREFETCH2(x) _mm_prefetch((x),_MM_HINT_T1) // prefetch into L2 cache but not L1
#elif defined(__GNUC__)
#define RESTRICT __restrict
#define RESTRICTF __attribute__((malloc))
#define PREFETCH(x) __builtin_prefetch(x)
#define PREFETCH2(x) __builtin_prefetch((x),0,2) // prefetch into L2 cache but not L1
#else
#define RESTRICT
#define RESTRICTF
#define PREFETCH(x)
#define PREFETCH2(x)
#endif
#ifdef __MINGW32__
#ifndef _SW_INVALID
#define _SW_INVALID 0x00000010 /* invalid */
#endif
#ifndef _EM_ZERODIVIDE
#define _EM_ZERODIVIDE 0x00000008
#endif
#define EM_INVALID _SW_INVALID
#define EM_ZERODIVIDE _EM_ZERODIVIDE
#if defined(__STRICT_ANSI__)
extern int __cdecl _isnan (double);
extern unsigned int __cdecl _clearfp (void);
#endif
#ifndef _MAX_PATH
#ifdef PATH_MAX
#define _MAX_PATH PATH_MAX
#else
#define _MAX_PATH (260)
#endif
#endif
#endif
#if SY_WIN32
// Windows has no <sys/time.h>; declare j-prefixed equivalents of the POSIX
// time structures here (on other platforms, below, the j names simply alias
// the system types).
struct jtimespec { long long tv_sec, tv_nsec; };      // seconds + nanoseconds (timespec equivalent)
struct jtimeval { long long tv_sec, tv_usec; };       // seconds + microseconds (timeval equivalent)
struct jtimezone { int tz_minuteswest, tz_dsttime; }; // minutes west of Greenwich, DST flag (timezone equivalent)
int jgettimeofday(struct jtimeval*, struct jtimezone*);  // gettimeofday replacement, implemented elsewhere in JE
#else
#include <sys/time.h>
#include <time.h>
#define jtimespec timespec
#define jtimeval timeval
#define jtimezone timezone
#define jgettimeofday gettimeofday
#endif
struct jtimespec jmtclk(void); //monotonic clock. Intended rel->abs conversions when sleeping; has poor granularity and slow on windows
struct jtimespec jmtfclk(void); //'fast clock'; maybe less inaccurate; intended for timed busywaiting
#if SY_64
#if defined(MMSC_VER) // SY_WIN32
// RESTRICTI (for in-place) is used for things like *z++=*x++ - *y++; Normally you wouldn't store to a z unless you were done reading
// the x and y, so it would be safe to get the faster loop that RESTRICT generates, even though strictly speaking if x or y is the
// same address as z the terms of the RESTRICT are violated. But on 32-bit machines, registers are so tight that sometimes *z is used
// as a temp, which means we can't take the liberties there
#define RESTRICTI // __restrict don't take chances
#endif
#ifdef __GNUC__
#define RESTRICTI // __restrict don't take chances
#endif
#endif // SY_64
#ifndef RESTRICT
#define RESTRICT
#endif
#ifndef RESTRICTF
#define RESTRICTF
#endif
#ifndef RESTRICTI
#define RESTRICTI
#endif
// If PREFETCH is not defined, we won't generate prefetch instrs
// If the user switch C_NOMULTINTRINSIC is defined, suppress using it
#ifdef C_NOMULTINTRINSIC
#define C_USEMULTINTRINSIC 0
#else
#define C_USEMULTINTRINSIC 1
#endif
// disable C_USEMULTINTRINSIC if un-available
#if C_USEMULTINTRINSIC
#if !defined(MMSC_VER)
#if defined(__clang__)
#if !__has_builtin(__builtin_smul_overflow)
#undef C_USEMULTINTRINSIC
#define C_USEMULTINTRINSIC 0
#endif
#elif __GNUC__ < 5
#undef C_USEMULTINTRINSIC
#define C_USEMULTINTRINSIC 0
#endif
#endif
#endif
#if !SY_64 && defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ < 5
#define __builtin_add_overflow(a,b,c) ({int64_t s=(int64_t)(a)+(int64_t)(b); *(c)=(long)s; (s<INT_MIN||s>INT_MAX);})
#define __builtin_sub_overflow(a,b,c) ({int64_t s=(int64_t)(a)-(int64_t)(b); *(c)=(long)s; (s<INT_MIN||s>INT_MAX);})
#define __builtin_mul_overflow(a,b,c) ({int64_t s=(int64_t)(a)*(int64_t)(b); *(c)=(long)s; (s<INT_MIN||s>INT_MAX);})
#endif
#endif
#if defined(__clang__) && ( (__clang_major__ > 3) || ((__clang_major__ == 3) && (__clang_minor__ > 5)))
/* needed by clang newer versions, no matter double_trick is inline asm or not */
#define NOOPTIMIZE __attribute__((optnone))
#elif __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ > 3))
#define NOOPTIMIZE __attribute__((optimize("O0")))
#else
#define NOOPTIMIZE
#endif
#define NALP 256 /* size of alphabet */
#define NETX 2000 /* size of error display buffer */
#define NPP 40 /* max value for quad pp */
#define NPATH 1024 /* max length for path names, */
/* including trailing 0 byte */
// Now we are trying to watch the C stack directly
// The named-call stack is used only when there is a locative, EXCEPT that after a call to 18!:4 it is used until the function calling 18!:4 returns.
// Since startup calls 18!:4 without a name, we have to allow for the possibility of deep recursion in the name stack. Normally only a little of the stack is used
#if defined(CSTACKSIZE)
#if !defined(CSTACKRESERVE)
#error CSTACKSIZE and CSTACKRESERVE must be defined together
#endif
#else
#if defined(_WIN32)
#define CSTACKSIZE (SY_64?12009472:1015808) // size we allocate in the calling function, aligned to 16k system page size 9961472 for 10MB
#else
#if (defined(ANDROID) && !defined(__LP64__)) || (defined(__OpenBSD__) && defined(__aarch64__))
#define CSTACKSIZE (SY_64?4194304:1015808) // OS default stack size 4MB, aligned to 16k system page size
#else
#define CSTACKSIZE (SY_64?7946240:1015808) // OS default stack size 8MB, aligned to 16k system page size
#endif
#endif
#define CSTACKRESERVE 100000 // amount we allow for slop before we sample the stackpointer, and after the last check
#endif
//The named-function stack is intelligent
// and stacks only when there is a locale change or deletion; it almost never limits unless locatives are used to an extreme degree.
// The depth of the C stack will normally limit stack use.
#define NFCALL (1000L) // call depth for named calls, not important. Must fit into an S
// start and length for the stored vector of ascending integers
#define IOTAVECBEGIN (-20)
#define IOTAVECLEN 400 // must be >= 256 so all memsets can be sourced from here
// modes for indexofsub()
#define IIOPMSKX 5 // # bits of flags
#define IIOPMSK (((I)1<<IIOPMSKX)-1) // operation bits. INTER also uses bit 3, which is included as a modifier in the switches
#define IIOPMSKINIT 0xf //
#define IIDOT 0 // IIDOT and IICO must be 0-1
#define IICO 1
#define INUBSV 2 // BIT arrays INUBSV-INUBI init to 1 so that out-of-bounds in LESS keeps the value
#define INUB 3
#define ILESS 4
#define INUBI 5
#define IEPS 6 // BIT arrays IEPS and above init to 0 so out-of-bounds means not included
// the I...EPS values below are wired into the function table at the end of vcompsc.c, where they are combined with a comparison
#define II0EPS 7 // i.&0@:e. this must come first; others base on it
#define II1EPS 8 // i.&1@:e.
#define IJ0EPS 9 // i:&0@:e.
#define IJ1EPS 10 // i:&1@:e.
#define ISUMEPS 11 // +/@:e.
#define IANYEPS 12 // +./@:e.
#define IALLEPS 13 // *./@:e.
#define IIFBEPS 14 // I.@e.
#define IFORKEY 15 // special key support: like i.~, but add # values mapped to the index, and return #unique values in AM
#define IINTER 16 // ([ -. -.)
#define IIMODFIELD ((I)7<<IIOPMSKX) // bits used to indicate processing options
#define IIMODPACKX 5
#define IIMODPACK (((I)1)<<IIMODPACKX) // modifier for type. (small-range search except i./i:) In IIDOT/IICO, indicates reflexive application. In others, indicates that the
// bitmask should be stored as packed bits rather than bytes
#define IIMODREFLEXX 5 // overlaps IIMODPACK; OK because reflexive i./i: needs to know where the match was & can't use bitmask
#define IIMODREFLEX (((I)1)<<IIMODREFLEXX) // (small-range i. and i:) this is i.~/i:~ (hashing) this is i.~/i:~/~./~:/I.@:~.
#define IIMODFULLX 6
#define IIMODFULL (((I)1)<<IIMODFULLX) // (small-range search) indicates that the min/max values cover the entire range of possible inputs, so no range checking is required. Always set for hashing
#define IIMODBASE0X 7
#define IIMODBASE0 (((I)1)<<IIMODBASE0X) // set in small-range i./i: (which never use BITS) to indicate that the hashtable starts at index 0 and has m in the place of unused indexes. Set in hashing always, with same meaning
#define IIMODBITSX 8
#define IIMODBITS (((I)1)<<IIMODBITSX) // set if the hash field stores bits rather than indexes. Used only for small-range and not i./i:. IIMODPACK qualifies this, indicating that the bits are packed
#define IIMODFORCE0X 9
#define IIMODFORCE0 (((I)1)<<IIMODFORCE0X) // set to REQUIRE a (non-bit) allocation to reset to offset 0 and clear
#define IPHCALCX 10
#define IPHCALC (((I)1)<<IPHCALCX) // set when we are calculating a prehashed table
#define IINOTALLOCATEDX 11
#define IINOTALLOCATED (((I)1)<<IINOTALLOCATEDX) // internal flag, set when the block has not been allocated
#define IIOREPSX 12
#define IIOREPS (((I)1)<<IIOREPSX) // internal flag, set if mode is i./i:/e./key, but not if prehashing
#define IREVERSEDX 13
#define IREVERSED (((I)1)<<IREVERSEDX) // set if we have decided to reverse the hash in a small-range situation
#define IPHOFFSETX 14
#define IPHOFFSET (((I)1)<<IPHOFFSETX) /* offset for prehashed versions - set when we are using a prehashed table */
#define IPHIDOT (IPHOFFSET+IIDOT)
#define IPHICO (IPHOFFSET+IICO)
#define IPHEPS (IPHOFFSET+IEPS)
#define IPHI0EPS (IPHOFFSET+II0EPS)
#define IPHI1EPS (IPHOFFSET+II1EPS)
#define IPHJ0EPS (IPHOFFSET+IJ0EPS)
#define IPHJ1EPS (IPHOFFSET+IJ1EPS)
#define IPHSUMEPS (IPHOFFSET+ISUMEPS)
#define IPHANYEPS (IPHOFFSET+IANYEPS)
#define IPHALLEPS (IPHOFFSET+IALLEPS)
#define IPHIFBEPS (IPHOFFSET+IIFBEPS)
#define IPHINTER (IPHOFFSET+IINTER)
#define ISFUX 15
#define ISFU (((I)1)<<ISFUX) // i.!.1 - sequential file update
#if C_AVX2 // _mm_round_pd requires sse4.1, mm256 needs avx
#define jceil(x) _mm256_cvtsd_f64(_mm256_round_pd(_mm256_set1_pd(x),(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC))) // ugly but clang understands
#define jfloor(x) _mm256_cvtsd_f64(_mm256_round_pd(_mm256_set1_pd(x),(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)))
#define jround(x) _mm256_cvtsd_f64(_mm256_round_pd(_mm256_set1_pd(x),(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)))
#else
#define jceil(x) ceil(x)
#define jfloor(x) floor(x)
#define jround(x) floor(0.5+(x)) // for paranoid compatibility with earlier versions
#endif
#define BB 8 /* # bits in a byte */
#define LGBB 3 // lg(BB)
#if SY_64
#define BW 64 /* # bits in a word */
#define LGSZI 3 // lg(#bytes in an I)
#else
#define BW 32
#define LGSZI 2
#endif
#define LGBW (LGSZI+LGBB) // lg (# bits in a word)
// nominal cache sizes for current processors
#define L1CACHESIZE (((I)1)<<15) // 32k
#define L2CACHESIZE (((I)1)<<20) // 1m
#define L3CACHESIZE (((I)1)<<22) // 4m
#define TOOMANYATOMSX 47 // more atoms than this is considered overflow (64-bit). i.-family can't handle more than 2G cells in array.
#define MINVIRTSIZE 32 // must have this many atoms to be virtual. This is just a suggestion, and not honored everywhere
// Whether we should do so is a tricky question. Surely, if the argument is big, since we may save a large indexed copy.
// If the argument is small, the virtual is still better if it doesn't have to be realized; but it might be
// realized in effect if it is unavailable for inplacing. OTOH, if the argument is indirect the virtual does
// not require individual usecounting of the atoms.
//
// It would be good if we could know if the result is going to be assigned, perhaps jt->zombieval=1. We could
// suppress the virtual then.
// Debugging options
// Use MEMAUDIT to sniff out errant memory alloc/free
#ifndef MEMAUDIT
#define MEMAUDIT 0x00 // Bitmask for memory audits:
// 1: make sure chains are valid (check headers)
// 2: full audit of tpush/tpop
// detect double-frees before they happen,
// at the time of the erroneous tpush
// 4: write garbage to memory before we free it (except reserved area)
// 8: fill block with other garbage after we allocate it
// 0x10: (or 16) audit freelist at every alloc/free
// (starting after you have run 6!:5 (1) to turn it on)
// 0x20: audit freelist at end of every sentence regardless of 6!:5
// 0x40: enable guard blocks (libgmp mallocs only)
//
// Thus 1+4+8 (or 13 or 0xD) will verify that there are no blocks
// being used after they are freed, or freed prematurely. If you
// get a wild free, turn on bit 0x2. 2 will detect double-frees
// before they happen, at the time of the erroneous tpush
#endif
#define MEMAUDITPCALLENABLE 1 // expression for enabling stack auditing - enable auditing when true and enabled by MEMAUDIT&0x20 || jt->peekdata
#ifndef AUDITEXECRESULTS
#define AUDITEXECRESULTS 0 // When set, we go through all execution results to verify recursive and virtual bits are OK, and m nonzero if AC<0
#endif
#ifndef FORCEVIRTUALINPUTS
#define FORCEVIRTUALINPUTS 0 // When 1 set, we make all non-inplaceable noun inputs to executions VIRTUAL. Tests should still run
#endif
// When 2 set, make all outputs from RETF() virtual. Tests for inplacing will fail; that's OK if nothing crashes
#ifndef NAMETRACK
#define NAMETRACK 0 // turn on to define trackinfo in unquote, xdefn, line
#endif
// set FINDNULLRET to trap when a routine returns 0 without having set an error message
#ifndef FINDNULLRET
#define FINDNULLRET 0
#endif
#ifndef CRASHLOG
#define CRASHLOG 0 // set to allow writing to crashlog
#endif
#ifndef MEMHISTO
#define MEMHISTO 0 // set to create a histogram of memory requests, interrogated by 9!:54/9!:55
#endif
#define ANASARGEEMENT 0 // set to check whether or not AN() is equal to */AS()
#define MAXTHREADS 63 // maximum number of tasks running at once, including the master thread. System lock polls every thread, allocated or not, which is the only real limit on size. Unactivated
// threads will be paged out.
#define MAXTHREADSRND 64 // MAXTHREADS+1, rounded up to power-of-2 bdy to get the JST block aligned on a multiple of its size. The JTT blocks come after the JST block, which has the same size
#if MAXTHREADS>255
#define WLOCKBIT 0x8000 // the LSB of the part of a 16-bit lock used for write locks.
#else
#define WLOCKBIT 0x100 // With <256 threads, we split the lock into 2 8-bit sections so we can use LOCK XADD instructions
#endif
#define MAXTHREADPOOLS 8 // max # thread pools supported
#define MAXTHREADSINPOOL 63 // Max threads in a single pool. The low bits of the task pointer are used as a lock, so that code will have to be rewritten if MAXTHREADSINPOOL>63 (which is the allocation boundary for the
// task block). As of now, this limit is immaterial, because every thread in the system might choose to start a job on a single threadpool, which limits the total number of threads to 63. But
// we could force job/task/thread creators to serialize on a lock, which would limit the number of waits from outside the pool to 1, and then we could have more threads total as long as
// the number in a single pool is limited
// tpop stack is allocated in units of NTSTACK, but processed in units of NTSTACKBLOCK on an NTSTACKBLOCK boundary to reduce waste in each allocation.
// If we audit execution results, we use a huge allocation so that tpop pointers can be guaranteed never to need a second one, & will thus be ordered
#define NTSTACK (1LL<<(AUDITEXECRESULTS?24:14)) // number of BYTES in an allocated block of tstack - pointers to allocated blocks - allocation is bigger to leave this many bytes on boundary
#define NTSTACKBLOCK 2048 // boundary for beginning of stack block
#define CWMAX 32766 // max # control words in an explicit defn. Must fit in signed 15-bit value because we complement it in storage
#define SWMAX 32767 // max # words in a sentence
#define EXPWMAX 16777215 // max # words in an explicit defn
// flags for jteformat -- low byte is the error code, higher bits modify how the error is formatted/reported
#define EMSGE 0xff // the error-code part
#define EMSGNOEVM 0x200 // set to suppress moving the terse message
#define EMSGLINEISA 0x400 // line contains A block for message (otherwise it points to string if any and info has the length of the string)
#define EMSGCXINFO 0x800 // info contains line#/col# of error
#define EMSGSPACEAFTEREVM 0x1000 // set if terse message should be followed by a space
#define EMSGLINEISTERSE 0x2000 // set if line has the text for the terse message (13!:8)
#define EMSGLINEISNAME 0x4000 // set if line has the name to use in place of jt->curname
#define EMSGFROMPYX 0x8000 // set if this error is being copied from a pyx (it can't be analyzed, and it should be marked specially)
#define EMSGNOEFORMAT 0x10000 // set if this error should not be passed to eformat for processing
#define EMSGINVCHAR 0x20000 // set to append 'invalid char' to msg
#define EMSGINVINFL 0x40000 // set to append 'invalid inflection' to msg
#define EMSGNOMSGLINE 0x80000 // set to suppress the message line -- NOTE(review): original comment was copy-pasted from EMSGINVINFL; meaning inferred from the name, confirm against jteformat
// PYXES (pyx/multithreading support) defaults on, but requires a 64-bit build
#ifndef PYXES
#define PYXES 1
#endif
#if !SY_64
#undef PYXES
#define PYXES 0
#endif
// if we are not multithreading, report the master thread only
#if !PYXES
#undef MAXTHREADS
#define MAXTHREADS 1 // override to no tasks if no pyxes
#endif
#if defined(ANDROID) && defined(__x86_64__)
#undef MAXTHREADS
#define MAXTHREADS 1 // workaround for android x86_64
#endif
#if PYXES
#define REPATGCLIM 0x100000 // When this many bytes have been repatriated to a thread, call a GC in that thread
#define REPATOLIM (REPATGCLIM/32) // When an outgoing repatriation queue contains this many bytes, flush it
#else
// if we are not multithreading, we replace the atomic operations with non-atomic versions
// Arguments and expansions are fully parenthesized so the macros bind correctly in any expression
// context (the previous unparenthesized *aptr expansion of __atomic_load_n could bind wrongly,
// e.g. __atomic_load_n(p,0)++ would have incremented the pointer, not the value)
#define __atomic_store_n(aptr,val, memorder) (*(aptr)=(val))
#define __atomic_load_n(aptr, memorder) (*(aptr))
#if defined(__clang__) || __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ > 8))
// single-threaded CAS is assumed to always succeed (callers re-read *aptr into expected before each attempt)
#define __atomic_compare_exchange_n(aptr, aexpected, desired, weak, success_memorder, failure_memorder) (*(aptr)=(desired),1)
#define __atomic_exchange_n(aptr, val, memorder) ({__auto_type rrres=*(aptr); *(aptr)=(val); rrres;})
#define __atomic_fetch_or(aptr, val, memorder) ({__auto_type rrres=*(aptr); *(aptr)|=(val); rrres;})
#define __atomic_fetch_sub(aptr, val, memorder) ({__auto_type rrres=*(aptr); *(aptr)-=(val); rrres;})
#define __atomic_fetch_add(aptr, val, memorder) ({__auto_type rrres=*(aptr); *(aptr)+=(val); rrres;})
#define __atomic_fetch_and(aptr, val, memorder) ({__auto_type rrres=*(aptr); *(aptr)&=(val); rrres;})
#else
// older compilers lack __auto_type: the fetched old value is coerced through I (via intptr_t), wide enough for all uses here
#define __atomic_compare_exchange_n(aptr, aexpected, desired, weak, success_memorder, failure_memorder) (*(aptr)=(desired),1)
#define __atomic_exchange_n(aptr, val, memorder) ({I rrres=(intptr_t)*(aptr); *(aptr)=(val); rrres;})
#define __atomic_fetch_or(aptr, val, memorder) ({I rrres=(intptr_t)*(aptr); *(aptr)|=(val); rrres;})
#define __atomic_fetch_sub(aptr, val, memorder) ({I rrres=(intptr_t)*(aptr); *(aptr)-=(val); rrres;})
#define __atomic_fetch_add(aptr, val, memorder) ({I rrres=(intptr_t)*(aptr); *(aptr)+=(val); rrres;})
#define __atomic_fetch_and(aptr, val, memorder) ({I rrres=(intptr_t)*(aptr); *(aptr)&=(val); rrres;})
#endif
// op-then-fetch forms return the NEW value (fetch_* forms above return the OLD value)
#define __atomic_add_fetch(aptr, val, memorder) (*(aptr)+=(val))
#define __atomic_sub_fetch(aptr, val, memorder) (*(aptr)-=(val))
#define __atomic_and_fetch(aptr, val, memorder) (*(aptr)&=(val))
#define REPATGCLIM 0 // no repat
#endif
//convenient abbreviations for atomics: trailing 'a' = acquire/release ordering, trailing 's' = sequentially consistent
#define casa(p,e,d) __atomic_compare_exchange_n(p,e,d,0,__ATOMIC_ACQ_REL,__ATOMIC_RELAXED) // compare-and-swap, acq/rel
#define cass(p,e,d) __atomic_compare_exchange_n(p,e,d,0,__ATOMIC_SEQ_CST,__ATOMIC_SEQ_CST) // compare-and-swap, seq-cst
#define aadd(p,v) __atomic_fetch_add(p,v,__ATOMIC_ACQ_REL) // fetch-then-add: returns the OLD value
#define adda(p,v) __atomic_add_fetch(p,v,__ATOMIC_ACQ_REL) // add-then-fetch: returns the NEW value
#define lda(p) __atomic_load_n(p,__ATOMIC_ACQUIRE) // acquire load
#define lds(p) __atomic_load_n(p,__ATOMIC_SEQ_CST) // seq-cst load
#define sta(p,v) __atomic_store_n(p,v,__ATOMIC_RELEASE) //technically not 'a' (release, not acq/rel)
#define sts(p,v) __atomic_store_n(p,v,__ATOMIC_SEQ_CST) // seq-cst store
#define xchga(p,n) __atomic_exchange_n(p,n,__ATOMIC_ACQ_REL) // exchange, returns old value
// Tuning options for cip.c: per-platform crossover points (in units of m*n*p) between the cached
// in-house matrix product and BLAS.  0 means 'always BLAS', -1 (J '_1') means 'never BLAS'
#if defined(__aarch64__)
#define IGEMM_THRES (0) // when m*n*p less than this use cached; when higher, use BLAS
#define DGEMM_THRES (0) // when m*n*p less than this use cached; when higher, use BLAS 0 means 'always'
#define ZGEMM_THRES (0) // when m*n*p less than this use cached; when higher, use BLAS 0 means 'always'
#elif ((C_AVX2 || EMU_AVX2) && PYXES) || !defined(_OPENMP)
#define IGEMM_THRES (-1) // when m*n*p less than this use cached; when higher, use BLAS
#define DGEMM_THRES (-1) // when m*n*p less than this use cached; when higher, use BLAS _1 means 'never'
#define ZGEMM_THRES (-1) // when m*n*p less than this use cached; when higher, use BLAS _1 means 'never'
#elif defined(_WIN32)
// tuned for windows
#define IGEMM_THRES (400*400*400) // when m*n*p less than this use cached; when higher, use BLAS
#define DGEMM_THRES (300*300*300) // when m*n*p less than this use cached; when higher, use BLAS _1 means 'never'
#define ZGEMM_THRES (400*400*400) // when m*n*p less than this use cached; when higher, use BLAS
#else
// tuned for linux
#define IGEMM_THRES (200*200*200) // when m*n*p less than this use cached; when higher, use BLAS
#define DGEMM_THRES (200*200*200) // when m*n*p less than this use cached; when higher, use BLAS _1 means 'never'
#define ZGEMM_THRES (60*60*60) // when m*n*p less than this use cached; when higher, use BLAS
#endif
#define DCACHED_THRES (64*64*64) // when m*n*p less than this in a single thread use blocked; when higher, use cached
#define DCACHED_THRESn (24*24*24) // when m*n*p less than this, don't even look for multithreads; use blocked
#ifdef __x86_64__
#define FAST_AADD 1 // atomic add assumed cheap on x86_64 -- NOTE(review): inferred from name, confirm against users
#else
#define FAST_AADD 0
#endif
// byte-lane summing: fold per-byte-lane totals held in one word into a single scalar count
#define ADDBYTESINI1(t) (t=(t&ALTBYTES)+((t>>8)&ALTBYTES)) // sig in 01ff01ff01ff01ff, then xxxxxxxx03ff03ff, then xxxxxxxxxxxx07ff, then 00000000000007ff
#if BW==64
#define ALTBYTES 0x00ff00ff00ff00ffLL // mask selecting alternate bytes
#define ALTSHORTS 0x0000ffff0000ffffLL // mask selecting alternate 16-bit lanes
// t has totals per byte-lane, result combines them into single total. t must be an lvalue
#define ADDBYTESINIn(t) (t = (t>>32) + t, t = (t>>16) + t, t&=0xffff) // sig in 01ff01ff01ff01ff, then xxxxxxxx03ff03ff, then xxxxxxxxxxxx07ff, then 00000000000007ff
#define VALIDBOOLEAN 0x0101010101010101LL // valid bits in a Boolean
#else
#define ALTBYTES 0x00ff00ffLL
#define ALTSHORTS 0x0000ffffLL
#define ADDBYTESINIn(t) (t = (t>>16) + t, t&=0xffff) // sig in 01ff01ff, then xxxx03ff, then 000003ff
#define VALIDBOOLEAN 0x01010101 // valid bits in a Boolean
#endif
#define ADDBYTESINI(t) (ADDBYTESINI1(t) , ADDBYTESINIn(t)) // sig in 01ff01ff, then xxxx03ff, then 000003ff
#define BOOLEANSIGN (VALIDBOOLEAN<<(BB-1)) // sign (top) bit of each byte lane
// macros for bit testing.  These rely on two's-complement wraparound on signed left shift and on
// arithmetic (sign-propagating) right shift of signed values; both are guaranteed by the compilers
// this codebase supports, though the signed-overflow left shift is formally UB in the C standard
#define SGNIF(v,bitno) ((I)(v)<<(BW-1-(bitno))) // Sets sign bit if the numbered bit is set
#define SGNIF4(v,bitno) ((I4)(v)<<(32-1-(bitno))) // Sets sign bit if the numbered bit is set, in an I4
#define SGNONLYIF(v,bitno) (((v)>>(bitno))<<(BW-1)) // Sets sign bit if the numbered bit is set, clears all other bits
#define SGNIFNOT(v,bitno) (~SGNIF((v),(bitno))) // Clears sign bit if the numbered bit is set
#define REPSGN(x) ((I)(x)>>(BW-1)) // replicate sign bit of x to entire word (result is 0 or ~0)
#define REPSGN4(x) ((I4)(x)>>(32-1)) // replicate sign bit of x to entire I4 - x is forced to I4
#define SGNTO0(x) ((UI)(x)>>(BW-1)) // move sign bit to bit 0, clear other bits
#define SGNTO0US(x) ((US)(x)>>(16-1)) // move sign bit to bit 0, clear other bits
#define A0 0 // a nonexistent A-block
#define ABS(a) (0<=(a)?(a):-(a)) // NOTE: evaluates its argument twice - do not pass expressions with side effects
#include "jr0.h" // #define ASSERT(b,e) {if(unlikely(!(b))){jsignal(e); R 0;}}
// ASSERT family: if condition b fails, signal error e and exit the current routine (returning 0 unless noted)
#define ASSERTF(b,e,s...) {if(unlikely(!(b))){jsignal(e); R 0;}} // the s... debug args are ignored in this build
#define ASSERTSUFF(b,e,suff) {if(unlikely(!(b))){jsignal(e); {suff}}} // when the cleanup is more than a goto
#define ASSERTGOTO(b,e,lbl) ASSERTSUFF(b,e,goto lbl;) // signal then jump to cleanup label instead of returning
#define ASSERTTHREAD(b,e) {if(unlikely(!(b))){jtjsignal(jm,e); R 0;}} // used in io.c to signal in master thread
#define ASSERTD(b,s) {if(unlikely(!(b))){jsigd((s)); R 0;}} // signal with message text s via jsigd
#define ASSERTMTV(w) {ARGCHK1(w); ASSERT(1==AR(w),EVRANK); ASSERT(!AN(w),EVLENGTH);} // verify w is an empty list (rank 1, 0 atoms)
#define ASSERTN(b,e,nm) {if(unlikely(!(b))){jtjsignale(jt,(e)|EMSGLINEISNAME|EMSGNOMSGLINE,(nm),0); R 0;}} // signal error, overriding the running name with a different one
#define ASSERTNGOTO(b,e,nm,lbl) {if(unlikely(!(b))){jtjsignale(jt,(e)|EMSGLINEISNAME|EMSGNOMSGLINE,(nm),0); goto lbl;}} // same, but without the exit
#define ASSERTPYX(e) {jsignal((e)|(EMSGFROMPYX|EMSGNOEFORMAT)); R 0;} // unconditional: signal an error propagated from a pyx
#define ASSERTSYS(b,s) {if(unlikely(!(b))){fprintf(stderr,"system error: %s : file %s line %d\n",s,__FILE__,__LINE__); jsignal(EVSYSTEM); jtwri(JJTOJ(jt),MTYOSYS,"",(I)strlen(s),s); R 0;}}
#define ASSERTSYSV(b,s) {if(unlikely(!(b))){fprintf(stderr,"system error: %s : file %s line %d\n",s,__FILE__,__LINE__); jsignal(EVSYSTEM); jtwri(JJTOJ(jt),MTYOSYS,"",(I)strlen(s),s);}} // like ASSERTSYS but does not exit the routine (for void/continue contexts)
#define ASSERTW(b,e) {if(unlikely(!(b))){if((e)<=NEVM)jsignal(e); else jt->jerr=(e); R;}} // put error code into jerr, but signal only if nonretryable
#define ASSERTWR(c,e) {if(unlikely(!(c))){R e;}} // exit primitive with error code in return
// obsolete #if C_AVX512
// obsolete // We would like to use these AVX versions because they generate fewest instructions.
// Avoid call to memcmp to save registers
// obsolete #define ASSERTAGREECOMMON(x,y,l,ASTYPE) \
// obsolete {I *aaa=(x), *aab=(y); I aai=(l); \
// obsolete if(likely(aai<=4)){__mmask8 endmask=_bzhi_u32(0xf,aai); \
// obsolete endmask=_mm256_cmpneq_epi64_mask(_mm256_maskz_loadu_epi64(endmask,aaa),_mm256_maskz_loadu_epi64(endmask,aab)); \
// obsolete ASTYPE(!endmask,EVLENGTH); \
// obsolete }else{NOUNROLL do{--aai; ASTYPE(aaa[aai]==aab[aai],EVLENGTH)}while(aai);} \
// obsolete }
// obsolete #define TESTDISAGREE(r,x,y,l) \
// obsolete {I *aaa=(x), *aab=(y); I aai=(l); \
// obsolete if(likely(aai<=8)){__mmask8 endmask=_bzhi_u32(0xf,aai); \
// obsolete r=!!_mm256_cmpneq_epi64_mask(_mm256_maskz_loadu_epi64(endmask,aaa),_mm256_maskz_loadu_epi64(endmask,aab)); /* result is nonzero if any mismatch */ \
// obsolete }else{NOUNROLL do{--aai; r=0; if(aaa[aai]!=aab[aai]){r=1; break;}}while(aai);} \
// obsolete }
// obsolete #define TESTXITEMSMALL(r,x,y,l) \
// obsolete {I *aaa=(x), *aab=(y); I aai=(l); \
// obsolete if(likely(aai<=8)){__mmask8 endmask=_bzhi_u32(0xf,aai); \
// obsolete r=!!_mm256_cmpgt_epi64_mask(_mm256_maskz_loadu_epi64(endmask,aaa),_mm256_maskz_loadu_epi64(endmask,aab)); /* result is nonzero if any mismatch */ \
// obsolete }else{NOUNROLL do{--aai; r=0; if(unlikely(aaa[aai]>aab[aai])){r=1; break;}}while(aai);} \
// obsolete }
#if C_AVX2 || EMU_AVX2
// verify that shapes *x and *y match for l axes using AVX for rank<=vector size, loop otherwise
// validitymask+NPAR-aai appears to address a vector with all-ones in the first aai lanes, masking off
// lanes read past the end of the shape -- see definition of validitymask
#define ASSERTAGREECOMMON(x,y,l,ASTYPE) \
{I *aaa=(I*)(x), *aab=(I*)(y); I aai=(l); \
if(likely(aai<=NPAR)){ \
ASTYPE(_mm256_testz_si256(_mm256_xor_si256(_mm256_loadu_si256((__m256i *)aaa),_mm256_loadu_si256((__m256i *)aab)),_mm256_loadu_si256((__m256i*)(validitymask+NPAR-aai))),EVLENGTH); /* result is 1 if all match */ \
}else{NOUNROLL do{--aai; ASTYPE(((I*)aaa)[aai]==((I*)aab)[aai],EVLENGTH)}while(aai);} \
}
// set r nonzero if shapes disagree
#define TESTDISAGREE(r,x,y,l) \
{I *aaa=(x), *aab=(y); I aai=(l); \
if(likely(aai<=NPAR)){ \
r=!(_mm256_testz_si256(_mm256_xor_si256(_mm256_loadu_si256((__m256i *)aaa),_mm256_loadu_si256((__m256i *)aab)),_mm256_loadu_si256((__m256i*)(validitymask+NPAR-aai)))); /* test result is 1 if all match */ \
}else{NOUNROLL do{--aai; r=0; if(unlikely(aaa[aai]!=aab[aai])){r=1; break;}}while(aai);} \
}
// set r nonzero if a value in x shape is bigger than corresponding one in y shape
#define TESTXITEMSMALL(r,x,y,l) \
{I *aaa=(x), *aab=(y); I aai=(l); \
if(likely(aai<=NPAR)){ \
r=!(_mm256_testz_si256(_mm256_cmpgt_epi64(_mm256_loadu_si256((__m256i *)aaa),_mm256_loadu_si256((__m256i *)aab)),_mm256_loadu_si256((__m256i*)(validitymask+NPAR-aai)))); /* test result is 1 if all match */ \
}else{NOUNROLL do{--aai; r=0; if(unlikely(aaa[aai]>aab[aai])){r=1; break;}}while(aai);} \
}
#else
// non-AVX fallback: for l<=2 compare up to 2 shape items branchlessly (for l==0 both pointers are
// redirected at the same constant, validitymask[1], so the compare trivially succeeds; for l==1,
// aai becomes 0 and item 0 is harmlessly compared twice); for longer shapes fall back to memcmp
#define ASSERTAGREECOMMON(x,y,l,ASTYPE) \
{I *aaa=(x), *aab=(y); I aai=(l); \
if(likely(aai<=2)){ \
aai-=1; aaa=(aai<0)?(I*)&validitymask[1]:aaa; aab=(aai<0)?(I*)&validitymask[1]:aab; \
ASTYPE(((aaa[0]^aab[0])+(aaa[aai]^aab[aai]))==0,EVLENGTH); \
}else{ASTYPE(!memcmp(aaa,aab,aai<<LGSZI),EVLENGTH)} \
}