diff --git a/src/Native/src/kernels/stackvm/optimized/x86_64/unary.cpp b/src/Native/src/kernels/stackvm/optimized/x86_64/unary.cpp
index 78bedf4457..01d5d1af4e 100644
--- a/src/Native/src/kernels/stackvm/optimized/x86_64/unary.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/x86_64/unary.cpp
@@ -104,13 +104,30 @@ struct unary_op_neg {
     }
 };
 
+// Rounds to the nearest integer with exact ties going to the even neighbour
+// (the rule ONNX Round specifies); plain roundf sends ties away from zero.
+// NaN and infinities are returned unchanged.
+static float round_onnx(float v) {
+    // Stay in floating point: casting to int32_t is undefined for NaN,
+    // infinity and magnitudes beyond the int32_t range, whereas truncf and
+    // roundf accept every float.
+    // Only an exact .5 fraction is a tie; for NaN or infinity the difference
+    // is NaN, so the comparison is false and roundf passes them through.
+    if (fabsf(v - truncf(v)) == 0.5f) {
+        // v / 2 then has a .25 or .75 fraction, so roundf is unambiguous and
+        // doubling gives the even neighbour: 2.5 -> 2, 3.5 -> 4, -1.5 -> -2.
+        return 2.0f * roundf(v * 0.5f);
+    }
+    return roundf(v);
+}
+
 struct unary_op_round {
-    float operator()(float x) const { return roundf(x); }
+    float operator()(float x) const { return round_onnx(x); }
 
     void pack(const float *a, float *b) {
         __m256 vector_a = _mm256_loadu_ps(a);
         __m256 dst_a = _mm256_round_ps(
-            vector_a, (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC));
+            vector_a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
         _mm256_storeu_ps(b, dst_a);
     }
 };
diff --git a/src/Native/src/kernels/stackvm/reference/unary.cpp b/src/Native/src/kernels/stackvm/reference/unary.cpp
index 9d8e0dba39..f9da11ba02 100644
--- a/src/Native/src/kernels/stackvm/reference/unary.cpp
+++ b/src/Native/src/kernels/stackvm/reference/unary.cpp
@@ -45,6 +45,23 @@ result<void> unary_impl(TOp &&op, const T *input, T *output,
         return unary_impl(funct, input, output, input_shape, input_strides,    \
                           out_shape, out_strides, context)
 
+// Rounds to the nearest integer with exact ties going to the even neighbour
+// (the rule ONNX Round specifies); plain roundf sends ties away from zero.
+// NaN and infinities are returned unchanged.
+static float round_onnx(float v) {
+    // Stay in floating point: casting to int32_t is undefined for NaN,
+    // infinity and magnitudes beyond the int32_t range, whereas truncf and
+    // roundf accept every float.
+    // Only an exact .5 fraction is a tie; for NaN or infinity the difference
+    // is NaN, so the comparison is false and roundf passes them through.
+    if (fabsf(v - truncf(v)) == 0.5f) {
+        // v / 2 then has a .25 or .75 fraction, so roundf is unambiguous and
+        // doubling gives the even neighbour: 2.5 -> 2, 3.5 -> 4, -1.5 -> -2.
+        return 2.0f * roundf(v * 0.5f);
+    }
+    return roundf(v);
+}
+
 template <class T>
 result<void> unary_impl(unary_op_t op, const T *input, T *output,
                         gsl::span<const size_t> input_shape,
@@ -66,7 +83,7 @@ result<void> unary_impl(unary_op_t op, const T *input, T *output,
         UNARY_IMPL_OP(log, logf);
         UNARY_IMPL_OP(logical_not, [](float v) { return !v; });
         UNARY_IMPL_OP(neg, std::negate<float>());
-        UNARY_IMPL_OP(round, roundf);
+        UNARY_IMPL_OP(round, [](float v) { return round_onnx(v); });
         UNARY_IMPL_OP(rsqrt, [](float v) { return 1.f / sqrtf(v); });
         UNARY_IMPL_OP(sign, [](float v) { return (0.f < v) - (v < 0.f); });
         UNARY_IMPL_OP(sin, sinf);
diff --git a/tests/kernels/test_gather_elements.cpp b/tests/kernels/test_gather_elements.cpp
new file mode 100644
index 0000000000..1a17edaecf
--- /dev/null
+++ b/tests/kernels/test_gather_elements.cpp
@@ -0,0 +1,119 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "kernel_test.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <nncase/kernels/stackvm/tensor_ops.h>
+#include <nncase/runtime/datatypes.h>
+#include <nncase/runtime/runtime_tensor.h>
+#include <nncase/runtime/simple_types.h>
+#include <nncase/runtime/stackvm/opcode.h>
+#include <ortki/operators.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace ortki;
+
+class GatherElementsTest
+    : public KernelTest,
+      public ::testing::TestWithParam<
+          std::tuple<nncase::typecode_t, dims_t, int64_t>> {
+  public:
+    void SetUp() override {
+        auto &&[dtype, dims, scalar] = GetParam();
+        batchDims_value = scalar;
+        // Data tensor of the parameterized type/shape, passed to init_tensor.
+        input = hrt::create(dtype, dims, host_runtime_tensor::pool_cpu_only)
+                    .expect("create tensor failed");
+        init_tensor(input);
+        // Fixed 2x2 int64 index tensor used for every parameter set.
+        int64_t index_data[] = {0, 0, 1, 1};
+        indices = hrt::create(dt_int64, {2, 2},
+                              {reinterpret_cast<gsl::byte *>(index_data),
+                               sizeof(index_data)},
+                              true, host_runtime_tensor::pool_cpu_only)
+                      .expect("create tensor failed");
+        // One-element int64 tensor holding the same scalar as batchDims_value.
+        int64_t scalar_data[1] = {scalar};
+        batchDims = hrt::create(dt_int64, dims_t{1},
+                                {reinterpret_cast<gsl::byte *>(scalar_data),
+                                 sizeof(scalar_data)},
+                                true, host_runtime_tensor::pool_cpu_only)
+                        .expect("create tensor failed");
+    }
+
+    void TearDown() override {}
+
+  protected:
+    runtime_tensor input;
+    runtime_tensor indices;
+    runtime_tensor batchDims;
+    int64_t batchDims_value;
+};
+
+// Parameter grid for the suite:
+//   13 element types x one 2x2 input shape x integer values {-1, 0, 1}.
+// Shapes currently left out of the grid:
+//   dims_t{3, 5}, dims_t{2, 3, 1}, dims_t{5, 7, 5}, dims_t{5, 4, 3, 2},
+//   dims_t{5, 5, 7, 7}, dims_t{2, 3, 3, 5}.
+INSTANTIATE_TEST_SUITE_P(
+    gather_elements, GatherElementsTest,
+    testing::Combine(testing::Values(dt_int32, dt_int64, dt_float32, dt_uint64,
+                                     dt_int8, dt_int16, dt_uint8, dt_uint16,
+                                     dt_uint32, dt_float16, dt_float64,
+                                     dt_bfloat16, dt_boolean),
+                     testing::Values(dims_t{2, 2}),
+                     testing::Values(-1, 0, 1)));
+
+TEST_P(GatherElementsTest, gather_elements) {
+    // Reference result: run the same operands through ortki.
+    auto ort_input = runtime_tensor_2_ort_tensor(input);
+    auto ort_indices = runtime_tensor_2_ort_tensor(indices);
+    auto ort_output =
+        ortki_GatherElements(ort_input, ort_indices, batchDims_value);
+
+    // Wrap the ortki output buffer and shape in a runtime_tensor.
+    size_t bytes = 0;
+    void *raw = tensor_buffer(ort_output, &bytes);
+    dims_t out_dims(tensor_rank(ort_output));
+    tensor_shape(ort_output, reinterpret_cast<int64_t *>(out_dims.data()));
+    auto expected = hrt::create(input.datatype(), out_dims,
+                                {reinterpret_cast<gsl::byte *>(raw), bytes},
+                                true, host_runtime_tensor::pool_cpu_only)
+                        .expect("create tensor failed");
+
+    // Result under test: the stackvm kernel on the same operands.
+    auto kernel_out = kernels::stackvm::gather_elements(
+                          input.impl(), batchDims.impl(), indices.impl())
+                          .expect("gather failed");
+    runtime_tensor actual(kernel_out.as<tensor>().expect("as tensor failed"));
+
+    // Pass on an exact match or on the cosine-similarity check.
+    const bool matched = is_same_tensor(expected, actual) ||
+                         cosine_similarity_tensor(expected, actual);
+    // Dump both tensors so a mismatch can be diagnosed from the log.
+    if (!matched) {
+        std::cout << "actual ";
+        print_runtime_tensor(actual);
+        std::cout << "expected ";
+        print_runtime_tensor(expected);
+    }
+    EXPECT_TRUE(matched);
+}
+
+int main(int argc, char *argv[]) { // standalone entry point for this suite
+    ::testing::InitGoogleTest(&argc, argv); // let gtest parse its own flags
+    return RUN_ALL_TESTS(); // exit status reports whether every test passed
+}
\ No newline at end of file
diff --git a/tests/kernels/test_unary_round.cpp b/tests/kernels/test_unary_round.cpp
index bfba4db767..cda1a27980 100644
--- a/tests/kernels/test_unary_round.cpp
+++ b/tests/kernels/test_unary_round.cpp
@@ -54,7 +54,7 @@ INSTANTIATE_TEST_SUITE_P(
                                      dims_t{16, 1}, dims_t{1, 16, 1},
                                      dims_t{16}, dims_t{1}, dims_t{})));
 
-TEST_P(UnaryTest, roound) {
+TEST_P(UnaryTest, round) {
     OrtKITensor *orts[1];
     orts[0] = runtime_tensor_2_ort_tensor(input);
 
@@ -79,6 +79,7 @@ TEST_P(UnaryTest, roound) {
                   cosine_similarity_tensor(expected, actual);
 
     if (!result) {
+        print_runtime_tensor(input);
         std::cout << "actual ";
         print_runtime_tensor(actual);
         std::cout << "expected ";