Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions cbits/simd_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ void avx2_cmpeq8(
#if defined(AVX2_ENABLED)
uint32_t *target32 = (uint32_t *)target;

- __m256 v_comparand = _mm256_set1_epi8(byte);
+ __m256i v_comparand = _mm256_set1_epi8(byte);

uint32_t *out_mask = (uint32_t*)target;

size_t i;

for (i = 0; i < target_length * 2; ++i) {
- __m256 v_data_a = *(__m256*)(source + (i * 32));
- __m256 v_results_a = _mm256_cmpeq_epi8(v_data_a, v_comparand);
+ __m256i v_data_a = *(__m256i *)(source + (i * 32));
+ __m256i v_results_a = _mm256_cmpeq_epi8(v_data_a, v_comparand);
uint32_t mask = (uint32_t)_mm256_movemask_epi8(v_results_a);
target32[i] = mask;
}
Expand All @@ -63,10 +63,10 @@ void avx2_and_bits(
size_t i;

for (i = 0; i < target_length; i += 32) {
- __m256 v_data_a = *(__m256*)(source_a + i);
- __m256 v_data_b = *(__m256*)(source_b + i);
- __m256 v_results = _mm256_and_si256(v_data_a, v_data_b);
- *(__m256*)(target + i) = v_results;
+ __m256i v_data_a = *(__m256i *)(source_a + i);
+ __m256i v_data_b = *(__m256i *)(source_b + i);
+ __m256i v_results = _mm256_and_si256(v_data_a, v_data_b);
+ *(__m256i *)(target + i) = v_results;
}
#endif
}
Expand All @@ -80,10 +80,10 @@ void avx2_and_not_bits(
size_t i;

for (i = 0; i < target_length; i += 32) {
- __m256 v_data_a = *(__m256*)(source_a + i);
- __m256 v_data_b = *(__m256*)(source_b + i);
- __m256 v_results = _mm256_andnot_si256(v_data_a, v_data_b);
- *(__m256*)(target + i) = v_results;
+ __m256i v_data_a = *(__m256i *)(source_a + i);
+ __m256i v_data_b = *(__m256i *)(source_b + i);
+ __m256i v_results = _mm256_andnot_si256(v_data_a, v_data_b);
+ *(__m256i *)(target + i) = v_results;
}
#endif
}
Expand All @@ -93,14 +93,14 @@ void avx2_not_bits(
size_t target_length,
uint8_t *source) {
#if defined(AVX2_ENABLED)
- __m256 ones = _mm256_set1_epi8(0xff);
+ __m256i ones = _mm256_set1_epi8(0xff);

size_t i;

for (i = 0; i < target_length; i += 32) {
- __m256 v_data = *(__m256*)(source + i);
- __m256 v_results = _mm256_xor_si256(v_data, ones);
- *(__m256*)(target + i) = v_results;
+ __m256i v_data = *(__m256i *)(source + i);
+ __m256i v_results = _mm256_xor_si256(v_data, ones);
+ *(__m256i *)(target + i) = v_results;
}
#endif
}
Expand All @@ -114,10 +114,10 @@ void avx2_or_bits(
size_t i;

for (i = 0; i < target_length; i += 32) {
- __m256 v_data_a = *(__m256*)(source_a + i);
- __m256 v_data_b = *(__m256*)(source_b + i);
- __m256 v_results = _mm256_or_si256(v_data_a, v_data_b);
- *(__m256*)(target + i) = v_results;
+ __m256i v_data_a = *(__m256i *)(source_a + i);
+ __m256i v_data_b = *(__m256i *)(source_b + i);
+ __m256i v_results = _mm256_or_si256(v_data_a, v_data_b);
+ *(__m256i *)(target + i) = v_results;
}
#endif
}
Expand All @@ -131,10 +131,10 @@ void avx2_xor_bits(
size_t i;

for (i = 0; i < target_length; i += 32) {
- __m256 v_data_a = *(__m256*)(source_a + i);
- __m256 v_data_b = *(__m256*)(source_b + i);
- __m256 v_results = _mm256_xor_si256(v_data_a, v_data_b);
- *(__m256*)(target + i) = v_results;
+ __m256i v_data_a = *(__m256i *)(source_a + i);
+ __m256i v_data_b = *(__m256i *)(source_b + i);
+ __m256i v_results = _mm256_xor_si256(v_data_a, v_data_b);
+ *(__m256i *)(target + i) = v_results;
}
#endif
}
2 changes: 1 addition & 1 deletion stack.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ packages:

extra-deps:
- hw-hedgehog-0.1.0.2
- - hw-prim-0.6.2.12
+ - hw-prim-0.6.2.14
- hw-rankselect-0.12.0.3
- lazy-csv-0.5.1
- sv-0.1