diff --git a/cbits/simd_avx2.c b/cbits/simd_avx2.c index b0e9415..361384c 100644 --- a/cbits/simd_avx2.c +++ b/cbits/simd_avx2.c @@ -39,15 +39,15 @@ void avx2_cmpeq8( #if defined(AVX2_ENABLED) uint32_t *target32 = (uint32_t *)target; - __m256 v_comparand = _mm256_set1_epi8(byte); + __m256i v_comparand = _mm256_set1_epi8(byte); uint32_t *out_mask = (uint32_t*)target; size_t i; for (i = 0; i < target_length * 2; ++i) { - __m256 v_data_a = *(__m256*)(source + (i * 32)); - __m256 v_results_a = _mm256_cmpeq_epi8(v_data_a, v_comparand); + __m256i v_data_a = *(__m256i *)(source + (i * 32)); + __m256i v_results_a = _mm256_cmpeq_epi8(v_data_a, v_comparand); uint32_t mask = (uint32_t)_mm256_movemask_epi8(v_results_a); target32[i] = mask; } @@ -63,10 +63,10 @@ void avx2_and_bits( size_t i; for (i = 0; i < target_length; i += 32) { - __m256 v_data_a = *(__m256*)(source_a + i); - __m256 v_data_b = *(__m256*)(source_b + i); - __m256 v_results = _mm256_and_si256(v_data_a, v_data_b); - *(__m256*)(target + i) = v_results; + __m256i v_data_a = *(__m256i *)(source_a + i); + __m256i v_data_b = *(__m256i *)(source_b + i); + __m256i v_results = _mm256_and_si256(v_data_a, v_data_b); + *(__m256i *)(target + i) = v_results; } #endif } @@ -80,10 +80,10 @@ void avx2_and_not_bits( size_t i; for (i = 0; i < target_length; i += 32) { - __m256 v_data_a = *(__m256*)(source_a + i); - __m256 v_data_b = *(__m256*)(source_b + i); - __m256 v_results = _mm256_andnot_si256(v_data_a, v_data_b); - *(__m256*)(target + i) = v_results; + __m256i v_data_a = *(__m256i *)(source_a + i); + __m256i v_data_b = *(__m256i *)(source_b + i); + __m256i v_results = _mm256_andnot_si256(v_data_a, v_data_b); + *(__m256i *)(target + i) = v_results; } #endif } @@ -93,14 +93,14 @@ void avx2_not_bits( size_t target_length, uint8_t *source) { #if defined(AVX2_ENABLED) - __m256 ones = _mm256_set1_epi8(0xff); + __m256i ones = _mm256_set1_epi8(0xff); size_t i; for (i = 0; i < target_length; i += 32) { - __m256 v_data = *(__m256*)(source + i); - __m256 v_results = _mm256_xor_si256(v_data, ones); - *(__m256*)(target + i) = v_results; + __m256i v_data = *(__m256i *)(source + i); + __m256i v_results = _mm256_xor_si256(v_data, ones); + *(__m256i *)(target + i) = v_results; } #endif } @@ -114,10 +114,10 @@ void avx2_or_bits( size_t i; for (i = 0; i < target_length; i += 32) { - __m256 v_data_a = *(__m256*)(source_a + i); - __m256 v_data_b = *(__m256*)(source_b + i); - __m256 v_results = _mm256_or_si256(v_data_a, v_data_b); - *(__m256*)(target + i) = v_results; + __m256i v_data_a = *(__m256i *)(source_a + i); + __m256i v_data_b = *(__m256i *)(source_b + i); + __m256i v_results = _mm256_or_si256(v_data_a, v_data_b); + *(__m256i *)(target + i) = v_results; } #endif } @@ -131,10 +131,10 @@ void avx2_xor_bits( size_t i; for (i = 0; i < target_length; i += 32) { - __m256 v_data_a = *(__m256*)(source_a + i); - __m256 v_data_b = *(__m256*)(source_b + i); - __m256 v_results = _mm256_xor_si256(v_data_a, v_data_b); - *(__m256*)(target + i) = v_results; + __m256i v_data_a = *(__m256i *)(source_a + i); + __m256i v_data_b = *(__m256i *)(source_b + i); + __m256i v_results = _mm256_xor_si256(v_data_a, v_data_b); + *(__m256i *)(target + i) = v_results; } #endif } diff --git a/stack.yaml b/stack.yaml index 51cf563..2c7e66d 100644 --- a/stack.yaml +++ b/stack.yaml @@ -5,7 +5,7 @@ packages: extra-deps: - hw-hedgehog-0.1.0.2 -- hw-prim-0.6.2.12 +- hw-prim-0.6.2.14 - hw-rankselect-0.12.0.3 - lazy-csv-0.5.1 - sv-0.1