11 changes: 8 additions & 3 deletions z_image.hpp
@@ -30,7 +30,12 @@ namespace ZImage {
     JointAttention(int64_t hidden_size, int64_t head_dim, int64_t num_heads, int64_t num_kv_heads, bool qk_norm)
         : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
         blocks["qkv"] = std::make_shared<Linear>(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false);
-        blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false);
+        float scale = 1.f;
+#if GGML_USE_HIP
+        // Prevent NaN issues with certain ROCm setups
+        scale = 1.f / 16.f;
+#endif
+        blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false, false, false, scale);
         if (qk_norm) {
             blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim);
             blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim);
@@ -93,7 +98,7 @@ namespace ZImage {
 #endif
         // The purpose of the scale here is to prevent NaN issues in certain situations.
         // For example, when using CUDA but the weights are k-quants.
-        blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, 1.f / 128.f);
+        blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);
         blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
     }

@@ -667,4 +672,4 @@ namespace ZImage {
 
 } // namespace ZImage
 
-#endif // __Z_IMAGE_HPP__
+#endif // __Z_IMAGE_HPP__
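
For context, a minimal, self-contained sketch of the idea behind the extra `scale` argument follows. This is not the project's `Linear` class; it assumes the scale is applied to the matmul input and undone on the output, which keeps the intermediate accumulation small enough to avoid fp16 overflow/NaN on the affected backends while leaving the mathematical result unchanged. How the repository's `Linear` actually applies the scale may differ.

```cpp
// Hypothetical illustration (not the project's Linear implementation):
// y = (W * (x * scale)) / scale  ==  W * x, but with smaller intermediates.
#include <cstdio>
#include <vector>

std::vector<float> linear_scaled(const std::vector<std::vector<float>>& W,
                                 const std::vector<float>& x,
                                 float scale) {
    std::vector<float> y(W.size(), 0.0f);
    for (size_t i = 0; i < W.size(); ++i) {
        float acc = 0.0f;
        for (size_t j = 0; j < x.size(); ++j) {
            acc += W[i][j] * (x[j] * scale);  // smaller partial sums in the accumulation
        }
        y[i] = acc / scale;  // undo the scale on the output
    }
    return y;
}

int main() {
    std::vector<std::vector<float>> W = {{2.0f, 0.5f}, {1.0f, -1.0f}};
    std::vector<float> x = {4.0f, 8.0f};
    // scale = 1/16 mirrors the HIP/ROCm workaround in the diff above.
    std::vector<float> y = linear_scaled(W, x, 1.0f / 16.0f);
    std::printf("%.2f %.2f\n", y[0], y[1]);  // prints 12.00 -4.00, same as W * x
    return 0;
}
```

The same reasoning applies to the `w2` projection: the diff replaces the hard-coded `1.f / 128.f` with the shared `scale` variable so the NaN workaround is configured in one place per backend.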