Add a crapton of SSE instructions

mfence, ucomisd, comisd, movmskpd, andpd, punpckldq, psubq, comiss, andps, andnps, maxsd, cmpsd. Enough for a node repl. #90
ish-app · Jun 9, 2020 · 55816c8 · 55816c8
1 parent 8d84030
commit 55816c8
Show file tree

Hide file tree

Showing 5 changed files with 255 additions and 122 deletions.
diff --git a/emu/decode.h b/emu/decode.h
@@ -64,10 +64,6 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                 case 0x29: TRACEI("movaps xmm, xmm:modrm");
                            READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;
 
-                case 0x2e: TRACEI("ucomiss xmm, xmm:modrm");
-                           READMODRM; VCOMPARE(xmm_modrm_val, xmm_modrm_reg,32);
-                           break;
-
                 case 0x31: TRACEI("rdtsc");
                            RDTSC; break;
 
@@ -190,6 +186,8 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                 case 0xad: TRACEI("shrd cl, reg, modrm");
                            READMODRM; SHRD(reg_c, modrm_reg, modrm_val,oz); break;
 
+                case 0xae: TRACEI("fence"); READMODRM; break;
+
                 case 0xaf: TRACEI("imul modrm, reg");
                            READMODRM; IMUL2(modrm_val, modrm_reg,oz); break;
 
@@ -264,10 +262,22 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
 #endif
 
 #if OP_SIZE == 16
+                case 0x2e: TRACEI("ucomisd xmm, xmm:modrm");
+                           READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,64); break;
+                case 0x2f: TRACEI("comisd xmm, xmm:modrm");
+                           READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,64); break;
+
+                case 0x50: TRACEI("movmskpd xmm:modrm, reg");
+                           READMODRM; V_OP(fmovmask_d, xmm_modrm_val, modrm_reg,128); break;
+
+                case 0x54: TRACEI("andpd xmm:modrm, xmm");
+                           READMODRM; V_OP(and, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0x56: TRACEI("orpd xmm:modrm, xmm");
                            READMODRM; V_OP(or, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0x60: TRACEI("punpcklbw xmm:modrm, xmm");
                            READMODRM; V_OP(unpack_bw, xmm_modrm_val, xmm_modrm_reg,128); break;
+                case 0x62: TRACEI("punpckldq xmm:modrm, xmm");
+                           READMODRM; V_OP(unpack_dq, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0x6c: TRACEI("punpcklqdq xmm:modrm, xmm");
                            READMODRM; V_OP(unpack_qdq, xmm_modrm_val, xmm_modrm_reg,128); break;
 
@@ -326,7 +336,9 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                 case 0xef: TRACEI("pxor xmm:modrm, xmm");
                            READMODRM; V_OP(xor, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0xf3: TRACEI("psllq xmm:modrm, xmm");
-                           READMODRM; V_OP(shiftl_q, xmm_modrm_val, xmm_modrm_reg, 128); break;
+                           READMODRM; V_OP(shiftl_q, xmm_modrm_val, xmm_modrm_reg,128); break;
+                case 0xfb: TRACEI("psubq xmm:modrm, xmm");
+                           READMODRM; V_OP(sub_q, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0xfc: TRACEI("paddb xmm:modrm, xmm");
                            READMODRM; V_OP(add_b, xmm_modrm_val, xmm_modrm_reg,128); break;
 #else
@@ -335,6 +347,15 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                 case 0x11: TRACEI("movups xmm, xmm:modrm");
                            READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break;
 
+                case 0x2e: TRACEI("ucomiss xmm, xmm:modrm");
+                           READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,32); break;
+                case 0x2f: TRACEI("comiss xmm, xmm:modrm");
+                           READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,32); break;
+
+                case 0x54: TRACEI("andps xmm:modrm, xmm");
+                           READMODRM; V_OP(and, xmm_modrm_val, xmm_modrm_reg,128); break;
+                case 0x55: TRACEI("andnps xmm:modrm, xmm");
+                           READMODRM; V_OP(andn, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0x56: TRACEI("orps xmm:modrm, xmm");
                            READMODRM; V_OP(or, xmm_modrm_val, xmm_modrm_reg,128); break;
                 case 0x57: TRACEI("xorps xmm:modrm, xmm");
@@ -940,10 +961,15 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) {
                                    READMODRM; V_OP(single_fsub, xmm_modrm_val, xmm_modrm_reg,64); break;
                         case 0x5e: TRACEI("divsd xmm:modrm, xmm");
                                    READMODRM; V_OP(single_fdiv, xmm_modrm_val, xmm_modrm_reg,64); break;
+                        case 0x5f: TRACEI("maxsd xmm:modrm, xmm");
+                                   READMODRM; V_OP(single_fmax, xmm_modrm_val, xmm_modrm_reg,64); break;
 
                         case 0x70: TRACEI("pshuflw xmm:modrm, xmm, imm8");
                                    READMODRM; READIMM8; V_OP_IMM(shuffle_lw, xmm_modrm_val, xmm_modrm_reg,128); break;
 
+                        case 0xc2: TRACEI("cmpsd xmm:modrm, xmm, imm8");
+                                   READMODRM; READIMM8; V_OP_IMM(single_fcmp, xmm_modrm_val, xmm_modrm_reg,64); break;
+
                         case 0x18 ... 0x1f: TRACEI("rep nop modrm\t"); READMODRM; break;
                         default: TRACE("undefined"); UNDEFINED;
                     }

diff --git a/emu/vec.c b/emu/vec.c
@@ -4,34 +4,6 @@
 #include "emu/vec.h"
 #include "emu/cpu.h"
 
-void vec_compare32(struct cpu_state *cpu, float *f2, float *f1) {
-    if (isnan(*f1) || isnan(*f2)) {
-        cpu->zf = 1;
-        cpu->pf = 1;
-        cpu->cf = 1;
-    }
-    else if (*f1 > *f2) {
-        cpu->zf = 0;
-        cpu->pf = 0;
-        cpu->cf = 0;
-    }
-    else if (*f1 < *f2) {
-        cpu->zf = 0;
-        cpu->pf = 0;
-        cpu->cf = 1;
-    }
-    else if (*f1 == *f2) {
-        cpu->zf = 1;
-        cpu->pf = 0;
-        cpu->cf = 0;
-    }
-    else {
-        printf("something's horribly wrong. err 1093281094");
-    }
-    cpu->zf_res = 0;
-    cpu->pf_res = 0;
-}
-
 static inline void zero_xmm(union xmm_reg *xmm) {
     xmm->qw[0] = 0;
     xmm->qw[1] = 0;
@@ -98,6 +70,19 @@ void vec_shiftr_q128(NO_CPU, union xmm_reg *amount, union xmm_reg *dst) {
     }
 }
 
+// Adds every u8 lane of src into the matching u8 lane of dst, in place.
+// Referenced by the decoder's paddb case through V_OP(add_b, ..., 128).
+void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
+    const unsigned lanes = array_size(src->u8);
+    for (unsigned lane = 0; lane < lanes; lane++) {
+        dst->u8[lane] += src->u8[lane];
+    }
+}
+// Adds the two qw lanes of src into the matching qw lanes of dst, in place.
+void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
+    for (unsigned lane = 0; lane < 2; lane++)
+        dst->qw[lane] += src->qw[lane];
+}
+// Subtracts each qw lane of src from the matching qw lane of dst, in place.
+// Referenced by the decoder's psubq case through V_OP(sub_q, ..., 128).
+void vec_sub_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
+    for (unsigned lane = 0; lane < 2; lane++)
+        dst->qw[lane] = dst->qw[lane] - src->qw[lane];
+}
+
 void vec_and128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
     dst->qw[0] &= src->qw[0];
     dst->qw[1] &= src->qw[1];
@@ -110,14 +95,9 @@ void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
     dst->qw[0] ^= src->qw[0];
     dst->qw[1] ^= src->qw[1];
 }
-
-void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
-    for (unsigned i = 0; i < array_size(src->u8); i++)
-        dst->u8[i] += src->u8[i];
-}
-void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
-    dst->qw[0] += src->qw[0];
-    dst->qw[1] += src->qw[1];
+// Bitwise and-not: each qw lane of dst becomes (~dst & src).
+// Referenced by the decoder's andnps case through V_OP(andn, ..., 128).
+void vec_andn128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
+    for (unsigned lane = 0; lane < 2; lane++)
+        dst->qw[lane] = src->qw[lane] & ~dst->qw[lane];
 }
 
 void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
@@ -126,6 +106,22 @@ void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) {
             dst->u8[i] = src->u8[i];
 }
 
+// Evaluates one SSE compare predicate on a pair of doubles.
+// Only the low three bits of `type` are used: bits 0-1 select the base test
+// (0 equal, 1 less-than, 2 less-or-equal, 3 unordered, i.e. either value is
+// NaN) and bit 2 negates it. Higher bits are ignored so an immediate with
+// stray upper bits still selects one of the eight predicates.
+// Returns true when the selected predicate holds for (a, b).
+static bool cmpd(double a, double b, int type) {
+    bool res = false;
+    switch (type & 3) {
+        case 0: res = a == b; break;
+        case 1: res = a < b; break;
+        case 2: res = a <= b; break;
+        case 3: res = isnan(a) || isnan(b); break;
+    }
+    // Test bit 2 rather than `type >= 4`, so values above 7 cannot flip the result.
+    if (type & 4) res = !res;
+    return res;
+}
+
+// Scalar double compare producing a mask: evaluates predicate `type` on
+// (dst->f64[0], *src) via cmpd() and overwrites dst->qw[0] with -1 when it
+// holds, 0 otherwise. dst->qw[1] is not touched.
+// Referenced by the decoder's cmpsd case through V_OP_IMM(single_fcmp, ..., 64).
+void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type) {
+    const bool matched = cmpd(dst->f64[0], *src, type);
+    if (matched)
+        dst->qw[0] = -1;
+    else
+        dst->qw[0] = 0;
+}
+
 void vec_single_fadd64(NO_CPU, const double *src, double *dst) { *dst += *src; }
 void vec_single_fmul64(NO_CPU, const double *src, double *dst) { *dst *= *src; }
 void vec_single_fsub64(NO_CPU, const double *src, double *dst) { *dst -= *src; }
@@ -135,15 +131,42 @@ void vec_single_fmul32(NO_CPU, const float *src, float *dst) { *dst *= *src; }
 void vec_single_fsub32(NO_CPU, const float *src, float *dst) { *dst -= *src; }
 void vec_single_fdiv32(NO_CPU, const float *src, float *dst) { *dst /= *src; }
 
+// Scalar double maximum, referenced by the decoder's maxsd case through
+// V_OP(single_fmax, ..., 64). dst is the first operand, src the second.
+// *dst is kept only when it compares strictly greater than *src. Every other
+// case stores *src: src larger, equal values, either operand NaN (the
+// comparison is then false), and two zeros of differing sign (they compare
+// equal). That matches maxsd returning its second source operand in those cases.
+void vec_single_fmax64(NO_CPU, const double *src, double *dst) {
+    if (!(*dst > *src)) *dst = *src;
+}
+
+// Scalar float compare that reports its outcome through the CPU flags; the
+// decoder uses it for both the ucomiss and comiss cases.
+// The decoder passes the modrm operand as src and the register operand as dst.
+// ZF is set when the values are equal, CF when dst < src, and ZF, CF and PF
+// are all set when either value is NaN. OF, SF and AF are always cleared.
+void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst) {
+    const float rhs = *src;
+    const float lhs = *dst;
+
+    // NOTE(review): clearing the *_res fields presumably makes the explicit
+    // zf/pf/sf bits authoritative -- confirm against emu/cpu.h.
+    cpu->zf_res = 0;
+    cpu->pf_res = 0;
+    cpu->sf_res = 0;
+
+    cpu->of = 0;
+    cpu->sf = 0;
+    cpu->af = 0;
+
+    if (isnan(rhs) || isnan(lhs)) {
+        // unordered result
+        cpu->zf = 1;
+        cpu->cf = 1;
+        cpu->pf = 1;
+    } else {
+        cpu->zf = rhs == lhs;
+        cpu->cf = rhs > lhs;
+        cpu->pf = 0;
+    }
+}
+
+// Scalar double compare that reports its outcome through the CPU flags; the
+// decoder uses it for both the ucomisd and comisd cases.
+// The decoder passes the modrm operand as src and the register operand as dst.
+// ZF is set when the values are equal, CF when dst < src, and ZF, CF and PF
+// are all set when either value is NaN. OF, SF and AF are always cleared.
+void vec_single_ucomi64(struct cpu_state *cpu, const double *src, const double *dst) {
+    const double rhs = *src;
+    const double lhs = *dst;
+
+    // NOTE(review): clearing the *_res fields presumably makes the explicit
+    // zf/pf/sf bits authoritative -- confirm against emu/cpu.h.
+    cpu->zf_res = 0;
+    cpu->pf_res = 0;
+    cpu->sf_res = 0;
+
+    cpu->of = 0;
+    cpu->sf = 0;
+    cpu->af = 0;
+
+    if (isnan(rhs) || isnan(lhs)) {
+        // unordered result
+        cpu->zf = 1;
+        cpu->cf = 1;
+        cpu->pf = 1;
+    } else {
+        cpu->zf = rhs == lhs;
+        cpu->cf = rhs > lhs;
+        cpu->pf = 0;
+    }
+}
+
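+// TODO float edge cases, e.g. NaN or out-of-range inputs: the plain C conversion
+// below is undefined behavior for those when the destination is an integer type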
 #define VEC_CVT(name, src_t, dst_t) \
     void vec_cvt##name(NO_CPU, const src_t *src, dst_t *dst) { \
         *dst = *src; \
     }
-VEC_CVT(si2sd32, uint32_t, double)
-VEC_CVT(tsd2si64, double, uint32_t)
+VEC_CVT(si2sd32, int32_t, double)
+VEC_CVT(tsd2si64, double, int32_t)
 VEC_CVT(sd2ss64, double, float)
-VEC_CVT(si2ss32, uint32_t, float)
-VEC_CVT(tss2si32, float, uint32_t)
+VEC_CVT(si2ss32, int32_t, float)
+VEC_CVT(tss2si32, float, int32_t)
 VEC_CVT(ss2sd32, float, double)
 
 void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
@@ -152,6 +175,11 @@ void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
         dst->u8[i*2] = dst->u8[i];
     }
 }
+// Interleaves the low two u32 lanes of dst and src, leaving dst as
+// { dst[0], src[0], dst[1], src[1] } in terms of the values on entry.
+// Referenced by the decoder's punpckldq case through V_OP(unpack_dq, ..., 128).
+void vec_unpack_dq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
+    // Read the source lanes before writing so src == dst still works.
+    // NOTE(review): assumes the u32 lanes are uint32_t -- confirm in the xmm_reg definition.
+    const uint32_t src_lo = src->u32[0];
+    const uint32_t src_hi = src->u32[1];
+    // dst[1] has to move up to lane 2 before lane 1 is overwritten.
+    dst->u32[2] = dst->u32[1];
+    dst->u32[1] = src_lo;
+    dst->u32[3] = src_hi;
+}
 void vec_unpack_qdq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) {
     dst->qw[1] = src->qw[0];
 }
@@ -185,6 +213,14 @@ void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst) {
     }
 }
 
+// Collects the sign of each f64 lane of src into *dst: bit i is set when
+// lane i has its sign bit set, and all other bits are cleared.
+// Referenced by the decoder's movmskpd case through V_OP(fmovmask_d, ..., 128).
+void vec_fmovmask_d128(NO_CPU, const union xmm_reg *src, uint32_t *dst) {
+    *dst = 0;
+    for (unsigned lane = 0; lane < array_size(src->f64); lane++)
+        *dst |= (signbit(src->f64[lane]) != 0) << lane;
+}
+
 void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index) {
     *dst = src->u16[index % 8];
 }
diff --git a/emu/vec.h b/emu/vec.h
@@ -4,7 +4,6 @@
 #include "emu/cpu.h"
 
 #define NO_CPU struct cpu_state *UNUSED(cpu)
-void vec_compare32(NO_CPU, float *f2, float *f1);
 
 // arguments are in src, dst order
 
@@ -22,14 +21,17 @@ void vec_imm_shiftl_q128(NO_CPU, const uint8_t amount, union xmm_reg *dst);
 void vec_imm_shiftr_q128(NO_CPU, const uint8_t amount, union xmm_reg *dst);
 void vec_shiftl_q128(NO_CPU, union xmm_reg *amount, union xmm_reg *dst);
 void vec_shiftr_q128(NO_CPU, union xmm_reg *amount, union xmm_reg *dst);
+void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
+void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
+void vec_sub_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
 void vec_and128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
+void vec_andn128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
 void vec_or128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
 void vec_xor128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
-void vec_add_b128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
-void vec_add_q128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
 
 void vec_min_ub128(NO_CPU, union xmm_reg *src, union xmm_reg *dst);
 
+
 void vec_single_fadd64(NO_CPU, const double *src, double *dst);
 void vec_single_fmul64(NO_CPU, const double *src, double *dst);
 void vec_single_fsub64(NO_CPU, const double *src, double *dst);
@@ -39,21 +41,28 @@ void vec_single_fmul32(NO_CPU, const float *src, float *dst);
 void vec_single_fsub32(NO_CPU, const float *src, float *dst);
 void vec_single_fdiv32(NO_CPU, const float *src, float *dst);
 
-void vec_cvtsi2sd32(NO_CPU, const uint32_t *src, double *dst);
-void vec_cvttsd2si64(NO_CPU, const double *src, uint32_t *dst);
+void vec_single_fmax64(NO_CPU, const double *src, double *dst);
+void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst);
+void vec_single_ucomi64(struct cpu_state *cpu, const double *src, const double *dst);
+void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type);
+
+void vec_cvtsi2sd32(NO_CPU, const int32_t *src, double *dst);
+void vec_cvttsd2si64(NO_CPU, const double *src, int32_t *dst);
 void vec_cvtsd2ss64(NO_CPU, const double *src, float *dst);
-void vec_cvtsi2ss32(NO_CPU, const uint32_t *src, float *dst);
-void vec_cvttss2si32(NO_CPU, const float *src, uint32_t *dst);
+void vec_cvtsi2ss32(NO_CPU, const int32_t *src, float *dst);
+void vec_cvttss2si32(NO_CPU, const float *src, int32_t *dst);
 void vec_cvtss2sd32(NO_CPU, const float *src, double *dst);
 
 // TODO organize
 void vec_unpack_bw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
+void vec_unpack_dq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
 void vec_unpack_qdq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
 void vec_shuffle_lw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding);
 void vec_shuffle_d128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding);
 void vec_compare_eqb128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
 void vec_compare_eqd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst);
 void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst);
+void vec_fmovmask_d128(NO_CPU, const union xmm_reg *src, uint32_t *dst);
 void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index);
 
 #endif