diff --git a/src/lib_jit.c b/src/lib_jit.c
index 8768a20cc..b6efa4147 100644
--- a/src/lib_jit.c
+++ b/src/lib_jit.c
@@ -659,114 +659,15 @@ JIT_PARAMDEF(JIT_PARAMINIT)
 };
 #endif
 
-#if LJ_TARGET_ARM && LJ_TARGET_LINUX
-#include <sys/utsname.h>
-#endif
-
-/* Arch-dependent CPU detection. */
-static uint32_t jit_cpudetect(lua_State *L)
-{
-  uint32_t flags = 0;
-#if LJ_TARGET_X86ORX64
-  uint32_t vendor[4];
-  uint32_t features[4];
-  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
-#if !LJ_HASJIT
-#define JIT_F_SSE2 2
-#endif
-    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
-#if LJ_HASJIT
-    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
-    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
-    if (vendor[2] == 0x6c65746e) { /* Intel. */
-      if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
-	flags |= JIT_F_LEA_AGU;
-    } else if (vendor[2] == 0x444d4163) { /* AMD. */
-      uint32_t fam = (features[0] & 0x0ff00f00);
-      if (fam >= 0x00000f00) /* K8, K10. */
-	flags |= JIT_F_PREFER_IMUL;
-    }
-    if (vendor[0] >= 7) {
-      uint32_t xfeatures[4];
-      lj_vm_cpuid(7, xfeatures);
-      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
-    }
-#endif
-  }
-  /* Check for required instruction set support on x86 (unnecessary on x64). */
-#if LJ_TARGET_X86
-  if (!(flags & JIT_F_SSE2))
-    luaL_error(L, "CPU with SSE2 required");
-#endif
-#elif LJ_TARGET_ARM
-#if LJ_HASJIT
-  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
-#if LJ_TARGET_LINUX
-  if (ver < 70) {  /* Runtime ARM CPU detection. */
-    struct utsname ut;
-    uname(&ut);
-    if (strncmp(ut.machine, "armv", 4) == 0) {
-      if (ut.machine[4] >= '7')
-	ver = 70;
-      else if (ut.machine[4] == '6')
-	ver = 60;
-    }
-  }
-#endif
-  flags |= ver >= 70 ? JIT_F_ARMV7 :
-	   ver >= 61 ? JIT_F_ARMV6T2_ :
-	   ver >= 60 ? JIT_F_ARMV6_ : 0;
-  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
-#endif
-#elif LJ_TARGET_ARM64
-  /* No optional CPU features to detect (for now). */
-#elif LJ_TARGET_PPC
-#if LJ_HASJIT
-#if LJ_ARCH_SQRT
-  flags |= JIT_F_SQRT;
-#endif
-#if LJ_ARCH_ROUND
-  flags |= JIT_F_ROUND;
-#endif
-#endif
-#elif LJ_TARGET_MIPS
-#if LJ_HASJIT
-  /* Compile-time MIPS CPU detection. */
-#if LJ_ARCH_VERSION >= 20
-  flags |= JIT_F_MIPSXXR2;
-#endif
-  /* Runtime MIPS CPU detection. */
-#if defined(__GNUC__)
-  if (!(flags & JIT_F_MIPSXXR2)) {
-    int x;
-#ifdef __mips16
-    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
-#else
-    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
-    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
-#endif
-    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
-  }
-#endif
-#endif
-#else
-#error "Missing CPU detection for this architecture"
-#endif
-  UNUSED(L);
-  return flags;
-}
 
 /* Initialize JIT compiler. */
 static void jit_init(lua_State *L)
 {
-  uint32_t flags = jit_cpudetect(L);
 #if LJ_HASJIT
   jit_State *J = L2J(L);
-  J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
+  J->flags = J->flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
   memcpy(J->param, jit_param_default, sizeof(J->param));
   lj_dispatch_update(G(L));
-#else
-  UNUSED(flags);
 #endif
 }
 
diff --git a/src/lj_jit.h b/src/lj_jit.h
index 2fea3859b..ef28dacc3 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -20,6 +20,7 @@
 #define JIT_F_PREFER_IMUL	0x00000080
 #define JIT_F_LEA_AGU		0x00000100
 #define JIT_F_BMI2		0x00000200
+#define JIT_F_SSE4_2		0x00000400
 
 /* Names for the CPU-specific flags. Must match the order above. */
 #define JIT_F_CPU_FIRST		JIT_F_SSE2
diff --git a/src/lj_state.c b/src/lj_state.c
index 632dd07e5..40f139dc4 100644
--- a/src/lj_state.c
+++ b/src/lj_state.c
@@ -180,6 +180,106 @@ static void close_state(lua_State *L)
   g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0);
 }
 
+#if LJ_TARGET_ARM && LJ_TARGET_LINUX
+#include <sys/utsname.h>
+#endif
+
+/* Arch-dependent CPU detection. */
+static uint32_t _cpudetect(lua_State *L)
+{
+  uint32_t flags = 0;
+#if LJ_TARGET_X86ORX64
+  uint32_t vendor[4];
+  uint32_t features[4];
+  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
+#if !LJ_HASJIT
+#define JIT_F_SSE2 2
+#endif
+    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
+#if LJ_HASJIT
+    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
+    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
+    flags |= ((features[2] >> 20)&1) * JIT_F_SSE4_2;
+    if (vendor[2] == 0x6c65746e) { /* Intel. */
+      if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
+	flags |= JIT_F_LEA_AGU;
+    } else if (vendor[2] == 0x444d4163) { /* AMD. */
+      uint32_t fam = (features[0] & 0x0ff00f00);
+      if (fam >= 0x00000f00) /* K8, K10. */
+	flags |= JIT_F_PREFER_IMUL;
+    }
+    if (vendor[0] >= 7) {
+      uint32_t xfeatures[4];
+      lj_vm_cpuid(7, xfeatures);
+      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
+    }
+#endif
+  }
+  /* Check for required instruction set support on x86 (unnecessary on x64). */
+#if LJ_TARGET_X86
+  if (!(flags & JIT_F_SSE2))
+    luaL_error(L, "CPU with SSE2 required");
+#endif
+#elif LJ_TARGET_ARM
+#if LJ_HASJIT
+  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
+#if LJ_TARGET_LINUX
+  if (ver < 70) {  /* Runtime ARM CPU detection. */
+    struct utsname ut;
+    uname(&ut);
+    if (strncmp(ut.machine, "armv", 4) == 0) {
+      if (ut.machine[4] >= '7')
+	ver = 70;
+      else if (ut.machine[4] == '6')
+	ver = 60;
+    }
+  }
+#endif
+  flags |= ver >= 70 ? JIT_F_ARMV7 :
+	   ver >= 61 ? JIT_F_ARMV6T2_ :
+	   ver >= 60 ? JIT_F_ARMV6_ : 0;
+  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
+#endif
+#elif LJ_TARGET_ARM64
+  /* No optional CPU features to detect (for now). */
+#elif LJ_TARGET_PPC
+#if LJ_HASJIT
+#if LJ_ARCH_SQRT
+  flags |= JIT_F_SQRT;
+#endif
+#if LJ_ARCH_ROUND
+  flags |= JIT_F_ROUND;
+#endif
+#endif
+#elif LJ_TARGET_MIPS
+#if LJ_HASJIT
+  /* Compile-time MIPS CPU detection. */
+#if LJ_ARCH_VERSION >= 20
+  flags |= JIT_F_MIPSXXR2;
+#endif
+  /* Runtime MIPS CPU detection. */
+#if defined(__GNUC__)
+  if (!(flags & JIT_F_MIPSXXR2)) {
+    int x;
+#ifdef __mips16
+    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
+#else
+    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
+    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
+#endif
+    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
+  }
+#endif
+#endif
+#else
+#error "Missing CPU detection for this architecture"
+#endif
+  UNUSED(L);
+  return flags;
+}
+
+extern void x64_init_random();
+
 #if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
 lua_State *lj_state_newstate(lua_Alloc f, void *ud)
 #else
@@ -188,7 +288,18 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
 {
   GG_State *GG = (GG_State *)f(ud, NULL, 0, sizeof(GG_State));
   lua_State *L = &GG->L;
+
+  /* detect cpu features as early as possible */
+  /* and init random table if we have SSE4.2 support */
+  uint32_t flags = _cpudetect(L);
+
+  if (flags & JIT_F_SSE4_2)
+  {
+    x64_init_random();
+  }
+
   global_State *g = &GG->g;
+
   if (GG == NULL || !checkptrGC(GG)) return NULL;
   memset(GG, 0, sizeof(GG_State));
   L->gct = ~LJ_TTHREAD;
@@ -219,6 +330,9 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
   g->gc.stepmul = LUAI_GCMUL;
   lj_dispatch_init((GG_State *)L);
   L->status = LUA_ERRERR+1;  /* Avoid touching the stack upon memory error. */
+
+  G2J(g)->flags = flags; /* copy detected flags to jit state */
+
   if (lj_vm_cpcall(L, NULL, NULL, cpluaopen) != 0) {
     /* Memory allocation error: free partial state. */
     close_state(L);
diff --git a/src/lj_str.c b/src/lj_str.c
index b9469ca00..3fc04fefa 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -11,6 +11,7 @@
 #include "lj_err.h"
 #include "lj_str.h"
 #include "lj_char.h"
+#include "lj_dispatch.h" /* for G2J */
 
 /* -- String helpers ------------------------------------------------------ */
 
@@ -165,12 +166,6 @@ lj_str_indep_hash(GCstr *str)
 {
 #include "x64/src/lj_str_hash_x64.h"
 
-#if defined(LJ_ARCH_STR_HASH)
-#define LJ_STR_HASH LJ_ARCH_STR_HASH
-#else
-#define LJ_STR_HASH lj_str_original_hash
-#endif
-
 /* Intern a string and return string object. */
 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
 {
@@ -187,7 +182,15 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
     return &g->strempty;
   }
 
-  h = LJ_STR_HASH(str, lenx);
+  /* switch between sse and non-sse hash branches */
+  if ((G2J(g)->flags & JIT_F_SSE4_2))
+  {
+    h = lj_str_sse_hash(str, lenx);
+  }
+  else
+  {
+    h = lj_str_original_hash(str, lenx);
+  }
 
   /* Check if the string has already been interned. */
   o = gcref(g->strhash[h & g->strmask]);
diff --git a/src/x64/src/lj_str_hash_x64.h b/src/x64/src/lj_str_hash_x64.h
index 063f631c7..ca1ac1919 100644
--- a/src/x64/src/lj_str_hash_x64.h
+++ b/src/x64/src/lj_str_hash_x64.h
@@ -8,13 +8,12 @@
 #ifndef _LJ_STR_HASH_X64_H_
 #define _LJ_STR_HASH_X64_H_
 
-#if defined(__SSE4_2__) && defined(__x86_64) && defined(__GNUC__)
+#if defined(__x86_64) && defined(__GNUC__)
 
 #include <stdint.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <time.h>
-#include <smmintrin.h>
 
 #include "../../lj_def.h"
 
@@ -26,6 +25,24 @@
 #define srandom(seed) srand(seed)
 #endif
 
+inline __attribute__((always_inline)) uint64_t asm_crc32_u64(uint64_t crc, uint64_t value) {
+//  printf("CRC!");
+  asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+inline __attribute__((always_inline)) uint32_t asm_crc32_u32(uint32_t crc, uint32_t value) {
+//  printf("CRC!");
+  asm("crc32l %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
+inline __attribute__((always_inline)) uint32_t asm_crc32_u8(uint32_t crc, uint8_t value) {
+//  printf("CRC!");
+  asm("crc32b %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value));
+  return crc;
+}
+
 static const uint64_t* cast_uint64p(const char* str)
 {
   return (const uint64_t*)(void*)str;
@@ -48,7 +65,7 @@ static LJ_AINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
   v = (v << 8) | str[len >> 1];
   v = (v << 8) | str[len - 1];
   v = (v << 8) | len;
-  return _mm_crc32_u32(0, v);
+  return asm_crc32_u32(0, v);
 #else
 
   uint32_t a, b, h = len;
@@ -78,9 +95,9 @@ static LJ_AINLINE uint32_t lj_str_hash_4_16(const char* str, uint32_t len)
     v2 = *cast_uint32p(str + len - 4);
   }
 
-  h = _mm_crc32_u32(0, len);
-  h = _mm_crc32_u64(h, v1);
-  h = _mm_crc32_u64(h, v2);
+  h = asm_crc32_u32(0, len);
+  h = asm_crc32_u64(h, v1);
+  h = asm_crc32_u64(h, v2);
   return h;
 }
 
@@ -90,18 +107,18 @@ static uint32_t lj_str_hash_16_128(const char* str, uint32_t len)
   uint64_t h1, h2;
   uint32_t i;
 
-  h1 = _mm_crc32_u32(0, len);
+  h1 = asm_crc32_u32(0, len);
   h2 = 0;
 
   for (i = 0; i < len - 16; i += 16) {
-    h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i));
-    h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8));
+    h1 += asm_crc32_u64(h1, *cast_uint64p(str + i));
+    h2 += asm_crc32_u64(h2, *cast_uint64p(str + i + 8));
   };
 
-  h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16));
-  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+  h1 = asm_crc32_u64(h1, *cast_uint64p(str + len - 16));
+  h2 = asm_crc32_u64(h2, *cast_uint64p(str + len - 8));
 
-  return _mm_crc32_u32(h1, h2);
+  return asm_crc32_u32(h1, h2);
 }
 
 /* **************************************************************************
@@ -144,7 +161,7 @@ static LJ_AINLINE uint32_t log2_floor(uint32_t n)
 /* This function is to populate `random_pos` such that random_pos[i][*]
  * contains random value in the range of [2**i, 2**(i+1)). */
-static void x64_init_random(void)
+void x64_init_random(void)
 {
   int i, seed, rml;
 
@@ -155,8 +172,8 @@
   }
 
   /* Init seed */
-  seed = _mm_crc32_u32(0, getpid());
-  seed = _mm_crc32_u32(seed, time(NULL));
+  seed = asm_crc32_u32(0, getpid());
+  seed = asm_crc32_u32(seed, time(NULL));
   srandom(seed);
 
   /* Now start to populate the random_pos[][]. */
@@ -185,11 +202,6 @@
 }
 #undef POW2_MASK
 
-void __attribute__((constructor)) x64_init_random_constructor()
-{
-  x64_init_random();
-}
-
 /* Return a pre-computed random number in the range of [1**chunk_sz_order,
  * 1**(chunk_sz_order+1)). It is "unsafe" in the sense that the return value
  * may be greater than chunk-size; it is up to the caller to make sure
@@ -216,7 +228,7 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
   pos1 = get_random_pos_unsafe(chunk_sz_log2, 0);
   pos2 = get_random_pos_unsafe(chunk_sz_log2, 1);
 
-  h1 = _mm_crc32_u32(0, len);
+  h1 = asm_crc32_u32(0, len);
   h2 = 0;
 
   /* loop over 14 chunks, 2 chunks at a time */
@@ -224,29 +236,29 @@ for (i = 0, chunk_ptr = str; i < (N_CHUNK - 2);
       chunk_ptr += chunk_sz, i++)
   {
     v = *cast_uint64p(chunk_ptr + pos1);
-    h1 = _mm_crc32_u64(h1, v);
+    h1 = asm_crc32_u64(h1, v);
 
     v = *cast_uint64p(chunk_ptr + chunk_sz + pos2);
-    h2 = _mm_crc32_u64(h2, v);
+    h2 = asm_crc32_u64(h2, v);
   }
 
   /* the last two chunks */
   v = *cast_uint64p(chunk_ptr + pos1);
-  h1 = _mm_crc32_u64(h1, v);
+  h1 = asm_crc32_u64(h1, v);
 
   v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2);
-  h2 = _mm_crc32_u64(h2, v);
+  h2 = asm_crc32_u64(h2, v);
 
   /* process the trailing part */
-  h1 = _mm_crc32_u64(h1, *cast_uint64p(str));
-  h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
+  h1 = asm_crc32_u64(h1, *cast_uint64p(str));
+  h2 = asm_crc32_u64(h2, *cast_uint64p(str + len - 8));
 
-  h1 = _mm_crc32_u32(h1, h2);
+  h1 = asm_crc32_u32(h1, h2);
   return h1;
 }
 
 /* NOTE: the "len" should not be zero */
-static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
+static LJ_AINLINE uint32_t lj_str_sse_hash(const char* str, size_t len)
 {
   if (len < 128) {
     if (len >= 16) { /* [16, 128) */
@@ -264,8 +276,5 @@
   return lj_str_hash_128_above(str, len);
 }
 
-#define LJ_ARCH_STR_HASH lj_str_hash
-#else
-#undef LJ_ARCH_STR_HASH
 #endif
 #endif /*_LJ_STR_HASH_X64_H_*/
diff --git a/src/x64/test/unit_test.sh b/src/x64/test/unit_test.sh
old mode 100644
new mode 100755