Skip to content

Commit

Permalink
AArch64 host fixes and optimizations (#145)
Browse files Browse the repository at this point in the history
* Tweak to work on environments with sub-47-bit virtual addresses

* On AArch64, allow JITter to copy some micro-ops into JIT stream

* Optimize `adcx` & `adox` instruction emulation for AArch64 host

---------

Co-authored-by: tkchia <tkchia-cosmo@gmx.com>
  • Loading branch information
tkchia and tkchia committed Jun 21, 2023
1 parent b5fe131 commit 671aa0b
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 30 deletions.
5 changes: 5 additions & 0 deletions blink/intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,10 @@ typedef char char_xmma_t
#else
#define X86_INTRINSICS 0
#endif
// ARM_INTRINSICS is nonzero when the compiler is GCC-compatible and the
// host is AArch64, i.e. when GNU extended inline assembly using A64
// instructions may be used to accelerate emulation (see blink/uop.c).
#if defined(__aarch64__) && defined(__GNUC__)
#define ARM_INTRINSICS 1
#else
#define ARM_INTRINSICS 0
#endif

#endif /* BLINK_INTRIN_H_ */
43 changes: 27 additions & 16 deletions blink/jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,22 +84,33 @@
#endif

#ifdef __aarch64__
#define kArmJmp 0x14000000u // B
#define kArmCall 0x94000000u // BL
#define kArmRet 0xd65f03c0u // RET
#define kArmMovNex 0xf2800000u // sets sub-word of register to immediate
#define kArmMovZex 0xd2800000u // load immediate into reg w/ zero-extend
#define kArmMovSex 0x92800000u // load 1's complement imm w/ sign-extend
#define kArmDispMin -33554432 // can jump -2**25 ints backward
#define kArmDispMax +33554431 // can jump +2**25-1 ints forward
#define kArmDispMask 0x03ffffffu // mask of branch displacement
#define kArmRegOff 0 // bit offset of destination register
#define kArmRegMask 0x0000001fu // mask of destination register
#define kArmImmOff 5 // bit offset of mov immediate value
#define kArmImmMask 0x001fffe0u // bit offset of mov immediate value
#define kArmImmMax 0xffffu // maximum immediate value per instruction
#define kArmIdxOff 21 // bit offset of u16[4] sub-word index
#define kArmIdxMask 0x00600000u // mask of u16[4] sub-word index
// A64 opcode templates and field masks used by the JIT when emitting and
// inspecting AArch64 instructions. Each kArmFoo constant pairs with a
// kArmFooMask below that selects the opcode bits to compare against.
#define kArmJmp 0x14000000u // B
#define kArmCall 0x94000000u // BL
#define kArmRet 0xd65f03c0u // RET
#define kArmMovNex 0xf2800000u // sets sub-word of register to immediate
#define kArmMovZex 0xd2800000u // load immediate into reg w/ zero-extend
#define kArmMovSex 0x92800000u // load 1's complement imm w/ sign-extend
#define kArmAdr 0x10000000u // form PC-relative byte address
#define kArmAdrp 0x90000000u // form PC-relative page address
#define kArmLdrPc 0x18000000u // load PC-relative memory into register
// (general or SIMD)
#define kArmLdrswPc 0x98000000u // load PC-relative short w/ sign-extend
#define kArmPrfmPc 0xd8000000u // prefetch PC-relative memory
#define kArmDispMin -33554432 // can jump -2**25 ints backward
#define kArmDispMax +33554431 // can jump +2**25-1 ints forward
#define kArmDispMask 0x03ffffffu // mask of branch displacement
#define kArmRegOff 0 // bit offset of destination register
#define kArmRegMask 0x0000001fu // mask of destination register
#define kArmImmOff 5 // bit offset of mov immediate value
#define kArmImmMask 0x001fffe0u // mask of mov immediate value
#define kArmImmMax 0xffffu // maximum immediate value per instruction
#define kArmIdxOff 21 // bit offset of u16[4] sub-word index
#define kArmIdxMask 0x00600000u // mask of u16[4] sub-word index
#define kArmAdrMask 0x9f000000u // mask of ADR opcode
#define kArmAdrpMask 0x9f000000u // mask of ADRP opcode
#define kArmLdrPcMask 0xbb000000u // mask of PC-relative LDR opcodes
#define kArmLdrswPcMask 0xff000000u // mask of PC-relative LDRSW opcode
#define kArmPrfmPcMask 0xff000000u // mask of PC-relative PRFM opcode
#endif

#define JITJUMP_CONTAINER(e) DLL_CONTAINER(struct JitJump, elem, e)
Expand Down
3 changes: 3 additions & 0 deletions blink/loader.c
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,9 @@ error: unsupported executable; we need:\n\
} else if (READ64(map) == READ64("MZqFpD='") ||
READ64(map) == READ64("jartsr='")) {
m->system->iscosmo = true;
// Cosmopolitan programs pretty much require at least 47-bit virtual
// addresses; if the host lacks these, then emulate them w/ software
if (FLAG_vabits < 47) FLAG_nolinear = true;
if (GetElfHeader(tmp, prog, (const char *)map) == -1) exit(127);
memcpy(map, tmp, 64);
execstack = LoadElf(m, elf, (Elf64_Ehdr_ *)map, mapsize, fd);
Expand Down
4 changes: 4 additions & 0 deletions blink/map.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ static int GetBitsInAddressSpace(void) {
for (i = 16; i < 40; ++i) {
want = UINT64_C(0x8123000000000000) >> i;
if (want > UINTPTR_MAX) continue;
if (Msync((void *)(uintptr_t)want, 1, MS_ASYNC, "vabits") == 0 ||
errno == EBUSY) {
return 64 - i;
}
ptr = PortableMmap((void *)(uintptr_t)want, 1, PROT_READ,
MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS_, -1, 0);
if (ptr != MAP_FAILED) {
Expand Down
83 changes: 70 additions & 13 deletions blink/uop.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,56 @@ MICRO_OP i64 Adox64(u64 x, u64 y, struct Machine *m) {
: "cc");
return z;
}
#else /* !X86_INTRINSICS */
#elif ARM_INTRINSICS /* !X86_INTRINSICS */
// Emulates the x86 ADCX instruction (32-bit form): returns x + y + guest CF
// (zero-extended to 64 bits), updating only the CF bit in m->flags while
// leaving every other guest flag bit untouched.
MICRO_OP i64 Adcx32(u64 x, u64 y, struct Machine *m) {
  u64 f = m->flags, z;
  // the rotate trick below requires the guest carry flag to live in bit 0
  _Static_assert(CF == 1, "");
  asm("ror\t%1,%1,#1\n\t"      // rotate flags so CF (bit 0) sits in bit 63
      "adds\t%1,%1,%1\n\t"     // shift back; host C := guest CF, bit 0 := 0
      "adcs\t%w0,%w2,%w3\n\t"  // 32-bit z = x + y + C; host C := carry out
      "adc\t%1,%1,xzr"         // deposit the new carry into bit 0 (CF)
      : "=&r" (z), "+&r" (f) : "%0" (x), "r" (y) : "cc");
  m->flags = f;
  return z;
}
// Emulates the x86 ADCX instruction (64-bit form): returns x + y + guest CF,
// updating only the CF bit in m->flags while leaving every other guest flag
// bit untouched.
MICRO_OP i64 Adcx64(u64 x, u64 y, struct Machine *m) {
  u64 f = m->flags, z;
  // the rotate trick below requires the guest carry flag to live in bit 0
  _Static_assert(CF == 1, "");
  asm("ror\t%1,%1,#1\n\t"   // rotate flags so CF (bit 0) sits in bit 63
      "adds\t%1,%1,%1\n\t"  // shift back; host C := guest CF, bit 0 := 0
      "adcs\t%0,%2,%3\n\t"  // 64-bit z = x + y + C; host C := carry out
      "adc\t%1,%1,xzr"      // deposit the new carry into bit 0 (CF)
      : "=&r" (z), "+&r" (f) : "%0" (x), "r" (y) : "cc");
  m->flags = f;
  return z;
}
// Emulates the x86 ADOX instruction (32-bit form): returns x + y + guest OF
// (zero-extended to 64 bits), updating only the OF bit in m->flags while
// leaving every other guest flag bit untouched. Same scheme as Adcx32,
// except the flags word is first rotated so the OF bit (bit FLAGS_OF) plays
// the role of the carry, then rotated back afterwards.
MICRO_OP i64 Adox32(u64 x, u64 y, struct Machine *m) {
  u64 f = m->flags, z;
  asm("ror\t%1,%1,%4\n\t"      // rotate flags so OF sits in bit 63
      "adds\t%1,%1,%1\n\t"     // shift back; host C := guest OF, bit 0 := 0
      "adcs\t%w0,%w2,%w3\n\t"  // 32-bit z = x + y + C; host C := carry out
      "adc\t%1,%1,xzr\n\t"     // deposit the new overflow into bit 0
      "ror\t%1,%1,%5"          // rotate bit 0 back to bit FLAGS_OF
      : "=&r" (z), "+&r" (f)
      : "%0" (x), "r" (y), "i" (FLAGS_OF + 1), "i" (64 - FLAGS_OF)
      : "cc");
  m->flags = f;
  return z;
}
// Emulates the x86 ADOX instruction (64-bit form): returns x + y + guest OF,
// updating only the OF bit in m->flags while leaving every other guest flag
// bit untouched. Same scheme as Adcx64, except the flags word is first
// rotated so the OF bit (bit FLAGS_OF) plays the role of the carry, then
// rotated back afterwards.
MICRO_OP i64 Adox64(u64 x, u64 y, struct Machine *m) {
  u64 f = m->flags, z;
  asm("ror\t%1,%1,%4\n\t"   // rotate flags so OF sits in bit 63
      "adds\t%1,%1,%1\n\t"  // shift back; host C := guest OF, bit 0 := 0
      "adcs\t%0,%2,%3\n\t"  // 64-bit z = x + y + C; host C := carry out
      "adc\t%1,%1,xzr\n\t"  // deposit the new overflow into bit 0
      "ror\t%1,%1,%5"       // rotate bit 0 back to bit FLAGS_OF
      : "=&r" (z), "+&r" (f)
      : "%0" (x), "r" (y), "i" (FLAGS_OF + 1), "i" (64 - FLAGS_OF)
      : "cc");
  m->flags = f;
  return z;
}
#else /* !ARM_INTRINSICS */
MICRO_OP i64 Adcx32(u64 x, u64 y, struct Machine *m) {
u32 t = x + !!(m->flags & CF);
u32 z = t + y;
Expand Down Expand Up @@ -161,7 +210,7 @@ MICRO_OP i64 Adox64(u64 x, u64 y, struct Machine *m) {
m->flags = (m->flags & ~OF) | c << FLAGS_OF;
return z;
}
#endif /* !X86_INTRINSICS */
#endif /* !ARM_INTRINSICS */

#endif /* !DISABLE_BMI2 */

Expand Down Expand Up @@ -743,7 +792,8 @@ MICRO_OP void MovsdWpsVpsOp(u8 *p, struct Machine *m, long reg) {
Write64(p, Read64(m->xmm[reg]));
}

#if defined(__x86_64__) && defined(TRIVIALLY_RELOCATABLE)
#if (defined(__x86_64__) || defined(__aarch64__)) && \
defined(TRIVIALLY_RELOCATABLE)
#define LOADSTORE "m"

MICRO_OP static i64 NativeLoad8(const u8 *p) {
Expand Down Expand Up @@ -1492,25 +1542,32 @@ static bool IsRet(u8 *p) {

static long GetInstructionLength(u8 *p) {
#if defined(__aarch64__)
#ifndef NDEBUG
if ((Get32(p) & ~kArmDispMask) == kArmJmp) return -1;
if ((Get32(p) & ~kArmDispMask) == kArmCall) return -1;
#endif
// on AArch64, do not recognize instructions which are known to be not
// trivially relocatable i.e. opcodes which do PC-relative addressing;
// but as exceptions, allow CBZ, CBNZ, TBZ, TBNZ, & B.cond
u32 ins = Get32(p);
if ((ins & ~kArmDispMask) == kArmJmp) return -1;
if ((ins & ~kArmDispMask) == kArmCall) return -1;
if ((ins & kArmAdrMask) == kArmAdr) return -1;
if ((ins & kArmAdrpMask) == kArmAdrp) return -1;
if ((ins & kArmLdrPcMask) == kArmLdrPc) return -1;
if ((ins & kArmLdrswPcMask) == kArmLdrswPc) return -1;
if ((ins & kArmPrfmPcMask) == kArmPrfmPc) return -1;
return 4;
#elif defined(__x86_64__)
#elif defined(__x86_64__) /* !__aarch64__ */
struct XedDecodedInst x;
unassert(!DecodeInstruction(&x, p, 15, XED_MODE_LONG));
#ifndef NDEBUG
if (ClassifyOp(x.op.rde) == kOpBranching) return -1;
if (UsesStaticMemory(x.op.rde)) return -1;
#endif
#endif /* NDEBUG */
return x.length;
#else
#else /* !__x86_64__ */
__builtin_unreachable();
#endif
#endif /* !__x86_64__ */
}

long GetMicroOpLengthImpl(void *uop) {
static long GetMicroOpLengthImpl(void *uop) {
long k, n = 0;
for (;;) {
if (IsRet((u8 *)uop + n)) return n;
Expand All @@ -1520,7 +1577,7 @@ long GetMicroOpLengthImpl(void *uop) {
}
}

long GetMicroOpLength(void *uop) {
static long GetMicroOpLength(void *uop) {
_Static_assert(IS2POW(kMaxOps), "");
static unsigned count;
static void *ops[kMaxOps * 2];
Expand Down
10 changes: 9 additions & 1 deletion test/func/mmap_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>

Expand Down Expand Up @@ -100,10 +101,17 @@ TEST(mmap, suggestedAddressWithoutMapFixed_isUsedIfAvailable) {
want = (void *)(intptr_t)0x300000000000;
got1 = mmap(want, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_EQ((intptr_t)want, (intptr_t)got1);
while (got1 == MAP_FAILED && errno == ENOMEM) {
want = (void *)((intptr_t)want >> 1);
ASSERT_EQ((intptr_t)want % pagesize, 0);
got1 = mmap(want, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}
ASSERT_NE((intptr_t)MAP_FAILED, (intptr_t)got1);
got2 = mmap(want, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE((intptr_t)MAP_FAILED, (intptr_t)got2);
ASSERT_NE((intptr_t)got1, (intptr_t)got2);
ASSERT_NE((intptr_t)want, (intptr_t)got2);
ASSERT_EQ(0, munmap(got1, pagesize));
ASSERT_EQ(0, munmap(got2, pagesize));
Expand Down

0 comments on commit 671aa0b

Please sign in to comment.