From 31e406e1a44af36bea83ca80485f45ef482c3d97 Mon Sep 17 00:00:00 2001 From: "L. Zhang" Date: Mon, 3 Jun 2019 02:41:51 -0700 Subject: [PATCH] Add sad_loop_kernel() unit test --- test/EbUnitTestUtility.c | 48 ++++-- test/EbUnitTestUtility.h | 4 + test/MotionEstimationTest.cc | 297 ++++++++++++++++++++++++++++++++++- test/PictureOperatorsTest.cc | 8 +- test/RestorationPickTest.cc | 8 +- 5 files changed, 347 insertions(+), 18 deletions(-) diff --git a/test/EbUnitTestUtility.c b/test/EbUnitTestUtility.c index 21162e9f6e..930b6b473c 100644 --- a/test/EbUnitTestUtility.c +++ b/test/EbUnitTestUtility.c @@ -24,6 +24,7 @@ #endif #include +#include #include #include #include "EbUnitTest.h" @@ -59,13 +60,40 @@ void eb_buf_random_u8(uint8_t *const buf, const uint32_t sizeBuf) { void eb_buf_random_u8_to_0_or_255(uint8_t *const buf, const uint32_t sizeBuf) { for (uint32_t i = 0; i < sizeBuf; i++) - buf[i] = (rand() >(RAND_MAX >> 1)) ? 255 : 0; + buf[i] = (rand() > (RAND_MAX >> 1)) ? 255 : 0; } void eb_buf_random_u8_to_255(uint8_t *const buf, const uint32_t sizeBuf) { memset(buf, 255, sizeBuf); } +void eb_buf_random_u8_to_large(uint8_t *const buf, const uint32_t sizeBuf) { + for (uint32_t i = 0; i < sizeBuf; i++) + buf[i] = 255 - (rand() % 10); +} + +void eb_buf_random_u8_to_near_value(uint8_t *const buf, const uint32_t sizeBuf, + const uint8_t val, const uint32_t range) { + for (uint32_t i = 0; i < sizeBuf; i++) { + int32_t v = val; + v += (rand() % range); + v -= (rand() % range); + if (v > 255) v = 255; + if (v < 0) v = 0; + buf[i] = (uint8_t)v; + } +} + +void eb_buf_random_u8_to_small(uint8_t *const buf, const uint32_t sizeBuf) { + for (uint32_t i = 0; i < sizeBuf; i++) + buf[i] = (rand() % 10); +} + +void eb_buf_random_u8_to_small_or_large(uint8_t *const buf, const uint32_t sizeBuf) { + for (uint32_t i = 0; i < sizeBuf; i++) + buf[i] = (rand() > (RAND_MAX >> 1)) ? 
(rand() % 10) : (255 - (rand() % 10)); +} + void eb_buf_random_u8_with_max(uint8_t *const buf, const uint32_t sizeBuf, const uint8_t max) { for (uint32_t i = 0; i < sizeBuf; i++) @@ -232,8 +260,10 @@ EbBool name(const type *buf1, const type *buf2, const size_t width, i++, buf1 += stride, buf2 += stride) { \ for (uint32_t j = 0; j < stride; j++) { \ if (buf1[j] != buf2[j]) { \ - printf("\nbuf1[%3d][%3d] = 0x%16llx\tbuf2[%3d][%3d] = 0x%16llx" \ - ,i, j, (uint64_t)buf1[j], i, j, (uint64_t)buf2[j]); \ + printf("\nbuf1[%3d][%3d] = 0x%16" PRIx64 \ + "\tbuf2[%3d][%3d] = 0x%16" PRIx64, \ + i, j, (uint64_t)buf1[j], i, j, \ + (uint64_t)buf2[j]); \ if ((i < 0) || (i >= (int32_t)height) || (j >= width)) \ printf(" (outside image)"); \ result = EB_FALSE; \ @@ -344,10 +374,10 @@ void eb_unit_test_log_s64(const char *const nameBuf, const int64_t *const buf, const uint32_t sizeBuf) { printf("%16s = ", nameBuf); - if (sizeBuf == 1) printf("%lld\n", buf[0]); + if (sizeBuf == 1) printf("%" PRId64 "\n", buf[0]); /* int64_t: signed decimal, same output as the former %lld */ else { for (uint32_t i = 0; i < sizeBuf; i++) { - printf("%10lld,", buf[i]); + printf("%10" PRId64 ",", buf[i]); if (!((i + 1) % 32)) printf("\n "); } printf("\n"); @@ -358,10 +388,10 @@ void eb_unit_test_log_u64(const char *const nameBuf, const uint64_t *const buf, const uint32_t sizeBuf) { printf("%16s = ", nameBuf); - if (sizeBuf == 1) printf("%llu\n", buf[0]); + if (sizeBuf == 1) printf("%" PRIu64 "\n", buf[0]); else { for (uint32_t i = 0; i < sizeBuf; i++) { - printf("%10llu,", buf[i]); + printf("%10" PRIu64 ",", buf[i]); if (!((i + 1) % 32)) printf("\n "); } printf("\n"); @@ -372,10 +402,10 @@ void eb_unit_test_log_ptrdiff(const char *const nameBuf, const ptrdiff_t *const buf, const uint32_t sizeBuf) { printf("%16s = ", nameBuf); - if (sizeBuf == 1) printf("%llu\n", buf[0]); + if (sizeBuf == 1) printf("%td\n", buf[0]); /* ptrdiff_t is signed and not necessarily 64-bit: %td is its dedicated length modifier */ else { for (uint32_t i = 0; i < sizeBuf; i++) { - printf("%10llu,", buf[i]); + printf("%10td,", buf[i]); if (!((i + 1) % 32)) printf("\n 
"); } printf("\n"); diff --git a/test/EbUnitTestUtility.h b/test/EbUnitTestUtility.h index dc0262263e..2f53d1009e 100644 --- a/test/EbUnitTestUtility.h +++ b/test/EbUnitTestUtility.h @@ -25,6 +25,10 @@ extern void eb_buf_random_s8_with_max(int8_t *const buf, const uint32_t sizeBuf, extern void eb_buf_random_u8(uint8_t *const buf, const uint32_t sizeBuf); extern void eb_buf_random_u8_to_0_or_255(uint8_t *const buf, const uint32_t sizeBuf); extern void eb_buf_random_u8_to_255(uint8_t *const buf, const uint32_t sizeBuf); +extern void eb_buf_random_u8_to_large(uint8_t *const buf, const uint32_t sizeBuf); +extern void eb_buf_random_u8_to_near_value(uint8_t *const buf, const uint32_t sizeBuf, const uint8_t val, const uint32_t range); +extern void eb_buf_random_u8_to_small(uint8_t *const buf, const uint32_t sizeBuf); +extern void eb_buf_random_u8_to_small_or_large(uint8_t *const buf, const uint32_t sizeBuf); extern void eb_buf_random_u8_with_max(uint8_t *const buf, const uint32_t sizeBuf, const uint8_t max); extern void eb_buf_random_s16(int16_t *const buf, const uint32_t sizeBuf); extern void eb_buf_random_s16_to_bd(int16_t *const buf, const uint32_t sizeBuf, const uint32_t bd); diff --git a/test/MotionEstimationTest.cc b/test/MotionEstimationTest.cc index 51bb9014d6..adbfad8cdd 100644 --- a/test/MotionEstimationTest.cc +++ b/test/MotionEstimationTest.cc @@ -4,15 +4,310 @@ */ #include "gtest/gtest.h" +#include "aom_dsp_rtcd.h" #include "EbDefinitions.h" #include "EbComputeSAD_AVX2.h" +#include "EbComputeSAD_AVX512.h" +#include "EbComputeSAD_C.h" #include "EbComputeSAD_SSE4_1.h" #include "EbMeSadCalculation.h" #include "EbMotionEstimation.h" #include "EbUnitTest.h" #include "EbUnitTestUtility.h" -static const int max_ref_stride = 800; +static const int num_test = 28; +static const int num_sad = 6; + +struct DistInfo { + uint32_t width; + uint32_t height; +}; + +const struct DistInfo opt_sad_size_info[num_sad] = { + { 64, 16 }, + { 64, 32 }, + { 64, 64 }, + { 64, 128 }, + 
{ 128, 64 }, + { 128, 128 } +}; + +typedef uint32_t(*aom_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + +typedef uint32_t(*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); + +typedef void(*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, uint32_t *sad_array); + +aom_sad_fn_t aom_sad_func_ptr_array[num_sad][3] = { + { aom_sad64x16_c, aom_sad64x16_avx2, aom_sad64x16_avx512 }, + { aom_sad64x32_c, aom_sad64x32_avx2, aom_sad64x32_avx512 }, + { aom_sad64x64_c, aom_sad64x64_avx2, aom_sad64x64_avx512 }, + { aom_sad64x128_c, aom_sad64x128_avx2, aom_sad64x128_avx512 }, + { aom_sad128x64_c, aom_sad128x64_avx2, aom_sad128x64_avx512 }, + { aom_sad128x128_c, aom_sad128x128_avx2, aom_sad128x128_avx512 } +}; + +aom_sad_multi_d_fn_t aom_sad_4d_func_ptr_array[num_sad][3] = { + { aom_sad64x16x4d_c, aom_sad64x16x4d_avx2, aom_sad64x16x4d_avx2 }, + { aom_sad64x32x4d_c, aom_sad64x32x4d_avx2, aom_sad64x32x4d_avx2 }, + { aom_sad64x64x4d_c, aom_sad64x64x4d_avx2, aom_sad64x64x4d_avx2 }, + { aom_sad64x128x4d_c, aom_sad64x128x4d_avx2, aom_sad64x128x4d_avx2 }, + { aom_sad128x64x4d_c, aom_sad128x64x4d_avx2, aom_sad128x64x4d_avx512 }, + { aom_sad128x128x4d_c, aom_sad128x128x4d_avx2, aom_sad128x128x4d_avx512 } +}; + +static void init_data_sadMxN(uint8_t **src_ptr, uint32_t *src_stride, uint8_t **ref_ptr, + uint32_t *ref_stride) +{ + *src_stride = eb_create_random_aligned_stride(MAX_SB_SIZE, 64); + *ref_stride = eb_create_random_aligned_stride(MAX_SB_SIZE, 64); + *src_ptr = (uint8_t*)malloc(sizeof(**src_ptr) * MAX_SB_SIZE * *src_stride); + *ref_ptr = (uint8_t*)malloc(sizeof(**ref_ptr) * MAX_SB_SIZE * *ref_stride); + eb_buf_random_u8(*src_ptr, MAX_SB_SIZE * *src_stride); + eb_buf_random_u8(*ref_ptr, MAX_SB_SIZE * *ref_stride); +} + +static void init_data_sadMxNx4d(uint8_t **src_ptr, uint32_t *src_stride, uint8_t *ref_ptr[4], + uint32_t *ref_stride) 
+{ + *src_stride = eb_create_random_aligned_stride(MAX_SB_SIZE, 64); + *ref_stride = eb_create_random_aligned_stride(MAX_SB_SIZE, 64); + *src_ptr = (uint8_t*)malloc(sizeof(**src_ptr) * MAX_SB_SIZE * *src_stride); + ref_ptr[0] = (uint8_t*)malloc(sizeof(**ref_ptr) * (MAX_SB_SIZE + 3) * *ref_stride); + eb_buf_random_u8(*src_ptr, MAX_SB_SIZE * *src_stride); + eb_buf_random_u8(ref_ptr[0], (MAX_SB_SIZE + 3) * *ref_stride); + ref_ptr[1] = ref_ptr[0] + *ref_stride; + ref_ptr[2] = ref_ptr[1] + *ref_stride; + ref_ptr[3] = ref_ptr[2] + *ref_stride; +} + +static void uninit_data(uint8_t *src_ptr, uint8_t *ref_ptr) +{ + free(src_ptr); + free(ref_ptr); +} + +TEST(MotionEstimation, sadMxN) +{ + uint8_t *src_ptr, *ref_ptr; + uint32_t src_stride, ref_stride; + + for (int i = 0; i < 10; i++) { + init_data_sadMxN(&src_ptr, &src_stride, &ref_ptr, &ref_stride); + + for (int j = 0; j < num_sad; j++) { + for (int k = 1; k < 3; k++) { + const uint32_t sad_org = aom_sad_func_ptr_array[j][0](src_ptr, src_stride, ref_ptr, ref_stride); + const uint32_t sad_opt = aom_sad_func_ptr_array[j][k](src_ptr, src_stride, ref_ptr, ref_stride); + + EXPECT_EQ(sad_org, sad_opt); + } + } + + uninit_data(src_ptr, ref_ptr); + } +} + +TEST(MotionEstimation, sadMxNx4d) +{ + uint8_t *src_ptr, *ref_ptr[4]; + uint32_t src_stride, ref_stride; + uint32_t sad_array_org[4], sad_array_opt[4]; + + for (int i = 0; i < 10; i++) { + init_data_sadMxNx4d(&src_ptr, &src_stride, ref_ptr, &ref_stride); + + for (int j = 0; j < num_sad; j++) { + for (int k = 1; k < 3; k++) { + eb_buf_random_u32(sad_array_opt, 4); + aom_sad_4d_func_ptr_array[j][0](src_ptr, src_stride, ref_ptr, ref_stride, sad_array_org); + aom_sad_4d_func_ptr_array[j][k](src_ptr, src_stride, ref_ptr, ref_stride, sad_array_opt); + + for (int l = 0; l < 4; l++) { + EXPECT_EQ(sad_array_org[l], sad_array_opt[l]); + } + } + } + + uninit_data(src_ptr, ref_ptr[0]); + } +} + +// =================================== + +static const int max_ref_stride = 512; + +static const 
struct DistInfo sad_loop_size_info[num_test] = { + { 4, 2 }, + { 4, 4 }, + { 4, 8 }, + { 4, 16 }, + { 8, 2 }, + { 8, 4 }, + { 8, 8 }, + { 8, 16 }, + { 8, 32 }, + { 16, 4 }, + { 16, 8 }, + { 16, 12 }, + { 16, 16 }, + { 16, 32 }, + { 16, 64 }, + { 24, 16 }, + { 24, 32 }, + { 32, 8 }, + { 32, 16 }, + { 32, 24 }, + { 32, 32 }, + { 32, 64 }, + { 48, 32 }, + { 48, 64 }, + { 64, 16 }, + { 64, 32 }, + { 64, 48 }, + { 64, 64 }, +}; + +static void init_data_sad_loop_kernel(uint8_t *const src, const int32_t size_of_src, uint8_t *const ref, + const int32_t size_of_ref, const int idx) +{ + if (!idx) { + memset(src, 0, size_of_src); + memset(ref, 0, size_of_ref); + } + else if (1 == idx) { + eb_buf_random_u8_to_255(src, size_of_src); + eb_buf_random_u8_to_255(ref, size_of_ref); + } + else if (2 == idx) { + memset(src, 0, size_of_src); + eb_buf_random_u8_to_255(ref, size_of_ref); + } + else if (3 == idx) { + eb_buf_random_u8_to_255(src, size_of_src); + memset(ref, 0, size_of_ref); + } + else if (4 == idx) { + eb_buf_random_u8_to_0_or_255(src, size_of_src); + eb_buf_random_u8_to_0_or_255(ref, size_of_ref); + } + else if (!(idx % 4)) { + eb_buf_random_u8_to_small(src, size_of_src); + eb_buf_random_u8_to_large(ref, size_of_ref); + } + else if (1 == (idx % 4)) { + eb_buf_random_u8_to_small_or_large(src, size_of_src); + eb_buf_random_u8_to_small_or_large(ref, size_of_ref); + } + else if (2 == (idx % 4)) { + const uint32_t range = 16; + uint8_t val; + eb_buf_random_u8(&val, 1); + eb_buf_random_u8_to_near_value(src, size_of_src, val, range); + eb_buf_random_u8_to_near_value(ref, size_of_ref, val, range); + } + else { + eb_buf_random_u8(src, size_of_src); + eb_buf_random_u8(ref, size_of_ref); + } +} + +TEST(MotionEstimation, sad_loop_kernel) +{ + const uint32_t src_stride = 256; + const uint32_t ref_stride = max_ref_stride; + uint8_t src[MAX_SB_SIZE * src_stride], ref[200 * max_ref_stride]; + uint32_t ref_stride_raw = ref_stride; + uint64_t best_sad_org, best_sad_opt; + int16_t 
x_search_center_org = 0, x_search_center_opt = 0; + int16_t y_search_center_org = 0, y_search_center_opt = 0; + const int16_t search_area_height = 64; + + for (int i = 0; i < 10; i++) { + init_data_sad_loop_kernel(src, sizeof(src), ref, sizeof(ref), i); + + for (int j = 0; j < num_test; j++) { + const uint32_t width = sad_loop_size_info[j].width; + const uint32_t height = sad_loop_size_info[j].height; + for (int16_t search_area_width = 1; search_area_width <= 32; search_area_width++) { + sad_loop_kernel(src, src_stride, ref, ref_stride, height, width, &best_sad_org, &x_search_center_org, &y_search_center_org, ref_stride_raw, search_area_width, search_area_height); + sad_loop_kernel_avx512_intrin(src, src_stride, ref, ref_stride, height, width, &best_sad_opt, &x_search_center_opt, &y_search_center_opt, ref_stride_raw, search_area_width, search_area_height); + + if ((best_sad_org != best_sad_opt) || (x_search_center_org != x_search_center_opt) || (y_search_center_org != y_search_center_opt)) { + printf("[%u, %u], search_area_width = %d\n", width, height, search_area_width); /* width/height are uint32_t, so %u; the int16_t argument promotes to int */ + } + + EXPECT_EQ(best_sad_org, best_sad_opt); + EXPECT_EQ(x_search_center_org, x_search_center_opt); + EXPECT_EQ(y_search_center_org, y_search_center_opt); + } + } + } +} + +#if 0 +TEST(MotionEstimation, sad_loop_kernel_speed) +{ + const uint32_t src_stride = 256; + const uint32_t ref_stride = max_ref_stride; + uint8_t src[MAX_SB_SIZE * src_stride], ref[200 * max_ref_stride]; + uint32_t ref_stride_raw = ref_stride; + uint64_t best_sad_org, best_sad_opt; + int16_t x_search_center_org = 0, x_search_center_opt = 0; + int16_t y_search_center_org = 0, y_search_center_opt = 0; + int16_t search_area_width = 64, search_area_height = 64; + double time_c, time_o; + uint64_t start_time_seconds, start_time_useconds; + uint64_t middle_time_seconds, middle_time_useconds; + uint64_t finish_time_seconds, finish_time_useconds; + + printf("%40s", "sad_loop_kernel\n"); + + eb_buf_random_u8(src, sizeof(src)); +
eb_buf_random_u8(ref, sizeof(ref)); + + for (int j = 0; j < num_test; j++) { + const uint32_t width = sad_loop_size_info[j].width; + const uint32_t height = sad_loop_size_info[j].height; + const uint64_t num_loop = 10000000 / (width + height); + + EbStartTime(&start_time_seconds, &start_time_useconds); + + for (uint64_t i = 0; i < num_loop; i++) { + sad_loop_kernel_avx2_intrin(src, src_stride, ref, ref_stride, height, width, &best_sad_org, &x_search_center_org, &y_search_center_org, ref_stride_raw, search_area_width, search_area_height); + } + + EbStartTime(&middle_time_seconds, &middle_time_useconds); + + for (uint64_t i = 0; i < num_loop; i++) { + sad_loop_kernel_avx512_intrin(src, src_stride, ref, ref_stride, height, width, &best_sad_opt, &x_search_center_opt, &y_search_center_opt, ref_stride_raw, search_area_width, search_area_height); + } + + EbStartTime(&finish_time_seconds, &finish_time_useconds); + EbComputeOverallElapsedTimeMs(start_time_seconds, start_time_useconds, + middle_time_seconds, middle_time_useconds, &time_c); + EbComputeOverallElapsedTimeMs(middle_time_seconds, middle_time_useconds, + finish_time_seconds, finish_time_useconds, &time_o); + + EXPECT_EQ(best_sad_org, best_sad_opt); + EXPECT_EQ(x_search_center_org, x_search_center_opt); + EXPECT_EQ(y_search_center_org, y_search_center_opt); + + printf("Average Nanoseconds per Function Call\n"); + printf(" sad_loop_kernel_%2dx%2d_AVX2() : %6.2f\n", + width, height, 1000000 * time_c / num_loop); + printf(" sad_loop_kernel_%2dx%2d_AVX512() : %6.2f (Comparison: " + "%5.2fx)\n", width, height, 1000000 * time_o / num_loop, time_c / time_o); + } +} +#endif + +// =================================== static void init_context(MeContext **const context_ptr, uint8_t *const src, uint8_t *const ref, const uint32_t src_stride, const uint32_t ref_stride, const uint32_t listIndex, const uint32_t refPicIndex, const uint32_t asm_type) diff --git a/test/PictureOperatorsTest.cc b/test/PictureOperatorsTest.cc index 
5dbf707f72..4c441cfd09 100644 --- a/test/PictureOperatorsTest.cc +++ b/test/PictureOperatorsTest.cc @@ -16,7 +16,7 @@ struct DistInfo { uint32_t height; }; -const struct DistInfo size_info[num_test] = { +static const struct DistInfo size_info[num_test] = { { 4, 4}, { 4, 8}, { 4, 16}, @@ -114,13 +114,13 @@ TEST(PictureOperators, spatial_full_distortion_speed) EbStartTime(&start_time_seconds, &start_time_useconds); for (uint64_t i = 0; i < num_loop; i++) { - dist_org = spatial_full_distortion_kernel_func_ptr_array[ASM_AVX2][Log2f(area_width) - 2](input, input_stride, recon, recon_stride, area_width, area_height); + dist_org = spatial_full_distortion_kernel_func_ptr_array[ASM_NON_AVX2][Log2f(area_width) - 2](input, input_stride, recon, recon_stride, area_width, area_height); } EbStartTime(&middle_time_seconds, &middle_time_useconds); for (uint64_t i = 0; i < num_loop; i++) { - dist_opt = spatial_full_distortion_kernel_func_ptr_array[ASM_AVX512][Log2f(area_width) - 2](input, input_stride, recon, recon_stride, area_width, area_height); + dist_opt = spatial_full_distortion_kernel_func_ptr_array[ASM_AVX2][Log2f(area_width) - 2](input, input_stride, recon, recon_stride, area_width, area_height); } EbStartTime(&finish_time_seconds, &finish_time_useconds); @@ -136,7 +136,7 @@ TEST(PictureOperators, spatial_full_distortion_speed) printf("Average Nanoseconds per Function Call\n"); printf(" SpatialFullDistortionKernel_%2dx%2d_SSE2() : %6.2f\n", area_width, area_height, 1000000 * time_c / num_loop); - printf(" SpatialFullDistortionKernel_%2dx%2d_AVX2() : %6.2f (Comparison: " + printf(" SpatialFullDistortionKernel_%2dx%2d_AVX512() : %6.2f (Comparison: " "%5.2fx)\n", area_width, area_height, 1000000 * time_o / num_loop, time_c / time_o); } } diff --git a/test/RestorationPickTest.cc b/test/RestorationPickTest.cc index bf98d76255..ea4b1c7ee3 100644 --- a/test/RestorationPickTest.cc +++ b/test/RestorationPickTest.cc @@ -303,9 +303,9 @@ TEST(EbRestorationPick, compute_stats_speed) } 
printf("Average Nanoseconds per Function Call\n"); - printf(" av1_compute_stats_c(%d) : %6.2f\n", + printf(" av1_compute_stats_avx2(%d) : %6.2f\n", wiener_win, 1000000 * time_c / num_loop); - printf(" av1_compute_stats_avx2(%d) : %6.2f (Comparison: " + printf(" av1_compute_stats_avx512(%d) : %6.2f (Comparison: " "%5.2fx)\n", wiener_win, 1000000 * time_o / num_loop, time_c / time_o); } } @@ -368,9 +368,9 @@ TEST(EbRestorationPick, compute_stats_highbd_speed) } printf("Average Nanoseconds per Function Call\n"); - printf(" av1_compute_stats_highbd_c(%d) : %6.2f\n", + printf(" av1_compute_stats_highbd_avx2(%d) : %6.2f\n", wiener_win, 1000000 * time_c / num_loop); - printf(" av1_compute_stats_highbd_avx2(%d) : %6.2f (Comparison: " + printf(" av1_compute_stats_highbd_avx512(%d) : %6.2f (Comparison: " "%5.2fx)\n", wiener_win, 1000000 * time_o / num_loop, time_c / time_o); } }