This repository was archived by the owner on Aug 30, 2024. It is now read-only.
File tree Expand file tree Collapse file tree 2 files changed +7
-2
lines changed Expand file tree Collapse file tree 2 files changed +7
-2
lines changed Original file line number Diff line number Diff line change @@ -1174,7 +1174,12 @@ inline __m256 exp_ps_0_1(const __m256 x) {
1174
1174
static const auto log2e = _mm256_set1_ps (v_log2e);
1175
1175
static const auto half = _mm256_set1_ps (.5f );
1176
1176
1177
- const auto x1 = _mm256_fmadd_ps (x, log2e, half); // auto x1 = x * log2e + _mm256_set1_ps(.5f);
1177
+ static const auto upper_bound = _mm256_set1_ps (88.722838 ); // log(max_positive_float)
1178
+ static const auto lower_bound = _mm256_set1_ps (-87.336549 ); // log(min_positive_float)
1179
+ __m256 x1 = _mm256_min_ps (x, upper_bound);
1180
+ x1 = _mm256_max_ps (x1, lower_bound);
1181
+
1182
+ x1 = _mm256_fmadd_ps (x1, log2e, half); // auto x1 = x * log2e + _mm256_set1_ps(.5f);
1178
1183
const auto z = _mm256_floor_ps (x1);
1179
1184
const auto f = _mm256_sub_ps (x1, z); // auto f = x1 - z;
1180
1185
Original file line number Diff line number Diff line change @@ -74,7 +74,7 @@ bool bestla_reordered_attn_fp32_support(const attn_shape_t* params) {
74
74
#endif
75
75
// use avx2 and f16c on avx2 platforms
76
76
// todo: check avx2 mha on server
77
- return false ;
77
+ return !_cd-> AVX512F () && _cd-> AVX2 () ;
78
78
}
79
79
// kv cache sizes in bytes per layer per batch per beam for;
80
80
void bestla_reordered_attn_fp32_batch_kv_info (const kv_shape_t * params, kv_cache_info_t * out) {
You can’t perform that action at this time.
0 commit comments