Skip to content

Commit

Permalink
Runtime SSE4.2 detection:
Browse files Browse the repository at this point in the history
Detect presence of SSE4.2 as early as possible
and choose between SSE and non-SSE string hashing functions at runtime.

This allows LuaJIT to be built without -msse4.2 while still getting a performance gain
on supported hardware.
  • Loading branch information
isage committed Apr 26, 2018
1 parent 10aeff6 commit 0fcdf12
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 139 deletions.
101 changes: 1 addition & 100 deletions src/lib_jit.c
Original file line number Diff line number Diff line change
Expand Up @@ -659,114 +659,15 @@ JIT_PARAMDEF(JIT_PARAMINIT)
};
#endif

#if LJ_TARGET_ARM && LJ_TARGET_LINUX
#include <sys/utsname.h>
#endif

/*
** Arch-dependent CPU detection.
**
** Builds and returns a bitmask of JIT_F_* CPU feature flags for the target
** selected at compile time by the LJ_TARGET_* macros. Exactly one
** architecture branch is compiled in; an unknown target is a build error.
**
** L is only used to raise a Lua error on 32-bit x86 when the SSE2 flag
** could not be set. On all other paths it is unused (see UNUSED(L)).
*/
static uint32_t jit_cpudetect(lua_State *L)
{
uint32_t flags = 0;
#if LJ_TARGET_X86ORX64
/* Result buffers for the cpuid queries: leaf 0 -> vendor, leaf 1 -> features. */
uint32_t vendor[4];
uint32_t features[4];
/* If either query reports failure (returns 0), no x86 flags are set at all. */
if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
#if !LJ_HASJIT
/* Local fallback value so the SSE2 check below compiles without the JIT. */
#define JIT_F_SSE2 2
#endif
/* Each line multiplies a single extracted feature bit (0 or 1) by its flag. */
flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
#if LJ_HASJIT
flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
/* Vendor-specific tuning flags. The constants are 4-byte pieces of the
** vendor string read as little-endian words: "ntel" and "cAMD". */
if (vendor[2] == 0x6c65746e) { /* Intel. */
if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
flags |= JIT_F_LEA_AGU;
} else if (vendor[2] == 0x444d4163) { /* AMD. */
uint32_t fam = (features[0] & 0x0ff00f00);
if (fam >= 0x00000f00) /* K8, K10. */
flags |= JIT_F_PREFER_IMUL;
}
/* Query leaf 7 only when vendor[0] from the leaf-0 query is at least 7
** (presumably the highest supported cpuid leaf -- confirm). The return
** value of this call is not checked. */
if (vendor[0] >= 7) {
uint32_t xfeatures[4];
lj_vm_cpuid(7, xfeatures);
flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
}
#endif
}
/* Check for required instruction set support on x86 (unnecessary on x64). */
#if LJ_TARGET_X86
if (!(flags & JIT_F_SSE2))
luaL_error(L, "CPU with SSE2 required");
#endif
#elif LJ_TARGET_ARM
#if LJ_HASJIT
int ver = LJ_ARCH_VERSION; /* Compile-time ARM CPU detection. */
#if LJ_TARGET_LINUX
if (ver < 70) { /* Runtime ARM CPU detection. */
/* Upgrade the version from the uname machine string ("armv<N>..."). */
/* NOTE(review): the return value of uname() is not checked. */
struct utsname ut;
uname(&ut);
if (strncmp(ut.machine, "armv", 4) == 0) {
if (ut.machine[4] >= '7')
ver = 70;
else if (ut.machine[4] == '6')
ver = 60;
}
}
#endif
/* Pick exactly one architecture-level flag from the version number. */
flags |= ver >= 70 ? JIT_F_ARMV7 :
ver >= 61 ? JIT_F_ARMV6T2_ :
ver >= 60 ? JIT_F_ARMV6_ : 0;
/* No FPU flag if LJ_ARCH_HASFPU is 0; otherwise VFPv3 for ver >= 70, else VFPv2. */
flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
#endif
#elif LJ_TARGET_ARM64
/* No optional CPU features to detect (for now). */
#elif LJ_TARGET_PPC
#if LJ_HASJIT
/* PPC flags are decided purely at compile time. */
#if LJ_ARCH_SQRT
flags |= JIT_F_SQRT;
#endif
#if LJ_ARCH_ROUND
flags |= JIT_F_ROUND;
#endif
#endif
#elif LJ_TARGET_MIPS
#if LJ_HASJIT
/* Compile-time MIPS CPU detection. */
#if LJ_ARCH_VERSION >= 20
flags |= JIT_F_MIPSXXR2;
#endif
/* Runtime MIPS CPU detection. */
#if defined(__GNUC__)
/* Only probe at runtime if the compile-time check did not already set R2. */
if (!(flags & JIT_F_MIPSXXR2)) {
int x;
#ifdef __mips16
x = 0; /* Runtime detection is difficult. Ensure optimal -march flags. */
#else
/* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
/* Loads 1 into $2, executes the raw instruction word, copies $2 to x. */
__asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
#endif
if (x) flags |= JIT_F_MIPSXXR2; /* Either 0x80000000 (R2) or 0 (R1). */
}
#endif
#endif
#else
#error "Missing CPU detection for this architecture"
#endif
UNUSED(L);
return flags;
}

/*
** Initialize JIT compiler.
**
** Runs CPU detection. When built with LJ_HASJIT, the detected flags are
** stored in the JIT state together with JIT_F_ON and JIT_F_OPT_DEFAULT,
** the default JIT parameters are copied into J->param and
** lj_dispatch_update() is called. Without LJ_HASJIT the detected flags
** are discarded.
**
** NOTE(review): the two consecutive J->flags assignments below look like
** the old and the new line of a diff shown side by side. As written, the
** second one only ORs in bits the first one already set. Confirm which
** single line is intended.
*/
static void jit_init(lua_State *L)
{
uint32_t flags = jit_cpudetect(L);
#if LJ_HASJIT
jit_State *J = L2J(L);
J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
J->flags = J->flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
/* Start from the default values for all JIT parameters. */
memcpy(J->param, jit_param_default, sizeof(J->param));
lj_dispatch_update(G(L));
#else
UNUSED(flags);
#endif
}

Expand Down
1 change: 1 addition & 0 deletions src/lj_jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define JIT_F_PREFER_IMUL 0x00000080
#define JIT_F_LEA_AGU 0x00000100
#define JIT_F_BMI2 0x00000200
#define JIT_F_SSE4_2 0x00000400

/* Names for the CPU-specific flags. Must match the order above. */
#define JIT_F_CPU_FIRST JIT_F_SSE2
Expand Down
114 changes: 114 additions & 0 deletions src/lj_state.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,106 @@ static void close_state(lua_State *L)
g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0);
}

#if LJ_TARGET_ARM && LJ_TARGET_LINUX
#include <sys/utsname.h>
#endif

/*
** Arch-dependent CPU detection.
**
** Builds and returns a bitmask of JIT_F_* CPU feature flags for the target
** selected at compile time by the LJ_TARGET_* macros. Exactly one
** architecture branch is compiled in; an unknown target is a build error.
**
** L is only used to raise a Lua error on 32-bit x86 when the SSE2 flag
** could not be set. On all other paths it is unused (see UNUSED(L)).
**
** NOTE(review): file-scope identifiers beginning with an underscore are
** reserved by the C standard; consider a different name.
** NOTE(review): the SSE4.2 bit is only probed inside #if LJ_HASJIT, so a
** build without the JIT never reports JIT_F_SSE4_2 -- confirm intended.
*/
static uint32_t _cpudetect(lua_State *L)
{
uint32_t flags = 0;
#if LJ_TARGET_X86ORX64
/* Result buffers for the cpuid queries: leaf 0 -> vendor, leaf 1 -> features. */
uint32_t vendor[4];
uint32_t features[4];
/* If either query reports failure (returns 0), no x86 flags are set at all. */
if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
#if !LJ_HASJIT
/* Local fallback value so the SSE2 check below compiles without the JIT. */
#define JIT_F_SSE2 2
#endif
/* Each line multiplies a single extracted feature bit (0 or 1) by its flag. */
flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
#if LJ_HASJIT
flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
flags |= ((features[2] >> 20)&1) * JIT_F_SSE4_2;
/* Vendor-specific tuning flags. The constants are 4-byte pieces of the
** vendor string read as little-endian words: "ntel" and "cAMD". */
if (vendor[2] == 0x6c65746e) { /* Intel. */
if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
flags |= JIT_F_LEA_AGU;
} else if (vendor[2] == 0x444d4163) { /* AMD. */
uint32_t fam = (features[0] & 0x0ff00f00);
if (fam >= 0x00000f00) /* K8, K10. */
flags |= JIT_F_PREFER_IMUL;
}
/* Query leaf 7 only when vendor[0] from the leaf-0 query is at least 7
** (presumably the highest supported cpuid leaf -- confirm). The return
** value of this call is not checked. */
if (vendor[0] >= 7) {
uint32_t xfeatures[4];
lj_vm_cpuid(7, xfeatures);
flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
}
#endif
}
/* Check for required instruction set support on x86 (unnecessary on x64). */
#if LJ_TARGET_X86
if (!(flags & JIT_F_SSE2))
luaL_error(L, "CPU with SSE2 required");
#endif
#elif LJ_TARGET_ARM
#if LJ_HASJIT
int ver = LJ_ARCH_VERSION; /* Compile-time ARM CPU detection. */
#if LJ_TARGET_LINUX
if (ver < 70) { /* Runtime ARM CPU detection. */
/* Upgrade the version from the uname machine string ("armv<N>..."). */
/* NOTE(review): the return value of uname() is not checked. */
struct utsname ut;
uname(&ut);
if (strncmp(ut.machine, "armv", 4) == 0) {
if (ut.machine[4] >= '7')
ver = 70;
else if (ut.machine[4] == '6')
ver = 60;
}
}
#endif
/* Pick exactly one architecture-level flag from the version number. */
flags |= ver >= 70 ? JIT_F_ARMV7 :
ver >= 61 ? JIT_F_ARMV6T2_ :
ver >= 60 ? JIT_F_ARMV6_ : 0;
/* No FPU flag if LJ_ARCH_HASFPU is 0; otherwise VFPv3 for ver >= 70, else VFPv2. */
flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
#endif
#elif LJ_TARGET_ARM64
/* No optional CPU features to detect (for now). */
#elif LJ_TARGET_PPC
#if LJ_HASJIT
/* PPC flags are decided purely at compile time. */
#if LJ_ARCH_SQRT
flags |= JIT_F_SQRT;
#endif
#if LJ_ARCH_ROUND
flags |= JIT_F_ROUND;
#endif
#endif
#elif LJ_TARGET_MIPS
#if LJ_HASJIT
/* Compile-time MIPS CPU detection. */
#if LJ_ARCH_VERSION >= 20
flags |= JIT_F_MIPSXXR2;
#endif
/* Runtime MIPS CPU detection. */
#if defined(__GNUC__)
/* Only probe at runtime if the compile-time check did not already set R2. */
if (!(flags & JIT_F_MIPSXXR2)) {
int x;
#ifdef __mips16
x = 0; /* Runtime detection is difficult. Ensure optimal -march flags. */
#else
/* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
/* Loads 1 into $2, executes the raw instruction word, copies $2 to x. */
__asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
#endif
if (x) flags |= JIT_F_MIPSXXR2; /* Either 0x80000000 (R2) or 0 (R1). */
}
#endif
#endif
#else
#error "Missing CPU detection for this architecture"
#endif
UNUSED(L);
return flags;
}

extern void x64_init_random();

#if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
lua_State *lj_state_newstate(lua_Alloc f, void *ud)
#else
Expand All @@ -188,7 +288,18 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
{
GG_State *GG = (GG_State *)f(ud, NULL, 0, sizeof(GG_State));
lua_State *L = &GG->L;

/* detect cpu features as early as possible */
/* and init random table if we have SSE4.2 support */
uint32_t flags = _cpudetect(L);

if (flags & JIT_F_SSE4_2)
{
x64_init_random();
}

global_State *g = &GG->g;

if (GG == NULL || !checkptrGC(GG)) return NULL;
memset(GG, 0, sizeof(GG_State));
L->gct = ~LJ_TTHREAD;
Expand Down Expand Up @@ -219,6 +330,9 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
g->gc.stepmul = LUAI_GCMUL;
lj_dispatch_init((GG_State *)L);
L->status = LUA_ERRERR+1; /* Avoid touching the stack upon memory error. */

G2J(g)->flags = flags; /* copy detected flags to jit state */

if (lj_vm_cpcall(L, NULL, NULL, cpluaopen) != 0) {
/* Memory allocation error: free partial state. */
close_state(L);
Expand Down
17 changes: 10 additions & 7 deletions src/lj_str.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "lj_err.h"
#include "lj_str.h"
#include "lj_char.h"
#include "lj_dispatch.h" /* for G2J */

/* -- String helpers ------------------------------------------------------ */

Expand Down Expand Up @@ -165,12 +166,6 @@ lj_str_indep_hash(GCstr *str) {

#include "x64/src/lj_str_hash_x64.h"

#if defined(LJ_ARCH_STR_HASH)
#define LJ_STR_HASH LJ_ARCH_STR_HASH
#else
#define LJ_STR_HASH lj_str_original_hash
#endif

/* Intern a string and return string object. */
GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
{
Expand All @@ -187,7 +182,15 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
return &g->strempty;
}

h = LJ_STR_HASH(str, lenx);
/* switch between sse and non-sse hash branches */
if ((G2J(g)->flags & JIT_F_SSE4_2))
{
h = lj_str_sse_hash(str, lenx);
}
else
{
h = lj_str_original_hash(str, lenx);
}

/* Check if the string has already been interned. */
o = gcref(g->strhash[h & g->strmask]);
Expand Down
Loading

0 comments on commit 0fcdf12

Please sign in to comment.