diff --git a/CODEOWNERS b/CODEOWNERS index bacc86cbbd6d2..908d13a35b922 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -89,6 +89,7 @@ /src/llama-model-loader.* @slaren /src/llama-model.* @CISC /src/llama-vocab.* @CISC +/src/models/ @CISC /tests/ @ggerganov /tests/test-backend-ops.cpp @slaren /tests/test-thread-safety.cpp @slaren diff --git a/common/arg.cpp b/common/arg.cpp index e8941cda41cef..d8f9bbd24301f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2030,7 +2030,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.system_prompt.pop_back(); } } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 07645ad9e71d4..d671551c17103 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -190,8 +190,8 @@ static __global__ void mul_mat_vec_q( const uint32_t channel_bias = ids ? channel_x : channel_dst; - float x_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } }; - float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } }; + float x_biases[ncols_dst] = { 0.0f }; + float gate_biases[ncols_dst] = { 0.0f }; if constexpr (has_fusion) { if (use_bias) { x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0; @@ -199,8 +199,9 @@ static __global__ void mul_mat_vec_q( // 2. load only on threads that won't die after partial sum calculation if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) { +#pragma unroll for (int j = 0; j < ncols_dst; ++j) { - x_biases[j][threadIdx.x] = x_bias[j * stride_col_dst + threadIdx.x]; + x_biases[j] = x_bias[j * stride_col_dst + threadIdx.x]; } } } @@ -208,8 +209,9 @@ static __global__ void mul_mat_vec_q( gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0; if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) { +#pragma unroll for (int j = 0; j < ncols_dst; ++j) { - gate_biases[j][threadIdx.x] = gate_bias[j * stride_col_dst + threadIdx.x]; + gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx.x]; } } } @@ -299,12 +301,12 @@ static __global__ void mul_mat_vec_q( float result = tmp[j][threadIdx.x]; if constexpr (has_fusion) { if (use_bias) { - result += x_biases[j][threadIdx.x]; + result += x_biases[j]; } if (use_gate) { float gate_value = tmp_gate[j][threadIdx.x]; if (use_gate_bias) { - gate_value += gate_biases[j][threadIdx.x]; + gate_value += gate_biases[j]; } switch (active_glu) { case GGML_GLU_OP_SWIGLU: diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c3e5a9eccc3aa..8d1a85c96939b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -797,9 +797,18 @@ struct vk_mat_mat_push_constants { uint32_t padded_N; }; struct vk_mat_vec_push_constants { - uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; - uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; - uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; + uint32_t ncols; + uint32_t stride_a; + uint32_t stride_b; + uint32_t stride_d; + uint32_t batch_stride_a; + uint32_t batch_stride_b; + uint32_t batch_stride_d; + uint32_t enable_bias; + uint32_t ne02; + uint32_t ne12; + uint32_t broadcast2; + uint32_t broadcast3; }; struct vk_mat_mat_id_push_constants { @@ -810,9 +819,16 @@ struct vk_mat_mat_id_push_constants { uint32_t padded_N; }; struct vk_mat_vec_id_push_constants { - uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; - uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; - uint32_t nei0; uint32_t ne11; + uint32_t ncols; + uint32_t stride_a; + uint32_t stride_b; + uint32_t stride_d; + uint32_t batch_stride_a; + uint32_t batch_stride_b; + uint32_t batch_stride_d; + uint32_t enable_bias; + uint32_t nei0; + uint32_t ne11; }; struct vk_flash_attn_push_constants { @@ -3347,92 +3363,92 @@ static void ggml_vk_load_shaders(vk_device& device) { SHADER_REDUCTION_MODE_SHMEM; for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[reduc], arr_dmmv_q5_1_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[reduc], arr_dmmv_q8_0_f32_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[reduc16], arr_dmmv_q2_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[reduc16], arr_dmmv_q3_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[reduc16], arr_dmmv_q4_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[reduc16], arr_dmmv_q5_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[reduc16], arr_dmmv_q6_k_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[reduc16], arr_dmmv_iq1_s_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[reduc16], arr_dmmv_iq1_m_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_iq2_xxs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[reduc16], arr_dmmv_iq2_xs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[reduc16], arr_dmmv_iq2_s_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_iq3_xxs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[reduc16], arr_dmmv_iq3_s_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[reduc16], arr_dmmv_iq4_xs_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[reduc], arr_dmmv_q5_1_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[reduc], arr_dmmv_q8_0_f16_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[reduc16], arr_dmmv_q2_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[reduc16], arr_dmmv_q3_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[reduc16], arr_dmmv_q4_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[reduc16], arr_dmmv_q5_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[reduc16], arr_dmmv_q6_k_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[reduc16], arr_dmmv_iq1_s_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[reduc16], arr_dmmv_iq1_m_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[reduc16], arr_dmmv_iq2_xxs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[reduc16], arr_dmmv_iq2_xs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[reduc16], arr_dmmv_iq2_s_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[reduc16], arr_dmmv_iq3_xxs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[reduc16], arr_dmmv_iq3_s_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[reduc16], arr_dmmv_iq4_xs_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", 3, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[reduc], arr_dmmv_q5_1_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[reduc], arr_dmmv_q8_0_f32_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[reduc16], arr_dmmv_q2_k_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[reduc16], arr_dmmv_q3_k_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[reduc16], arr_dmmv_q4_k_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[reduc16], arr_dmmv_q5_k_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[reduc16], arr_dmmv_q6_k_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[reduc16], arr_dmmv_iq1_s_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[reduc16], arr_dmmv_iq1_m_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_iq2_xxs_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[reduc16], arr_dmmv_iq2_xs_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[reduc16], arr_dmmv_iq2_s_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_iq3_xxs_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[reduc16], arr_dmmv_iq3_s_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[reduc16], arr_dmmv_iq4_xs_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[reduc], arr_dmmv_q5_1_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[reduc], arr_dmmv_q8_0_f16_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[reduc16], arr_dmmv_q2_k_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[reduc16], arr_dmmv_q3_k_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[reduc16], arr_dmmv_q4_k_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[reduc16], arr_dmmv_q5_k_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[reduc16], arr_dmmv_q6_k_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[reduc16], arr_dmmv_iq1_s_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[reduc16], arr_dmmv_iq1_m_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[reduc16], arr_dmmv_iq2_xxs_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[reduc16], arr_dmmv_iq2_xs_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[reduc16], arr_dmmv_iq2_s_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[reduc16], arr_dmmv_iq3_xxs_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[reduc16], arr_dmmv_iq3_s_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[reduc16], arr_dmmv_iq4_xs_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", 4, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_q8_1_f32", arr_dmmv_q5_1_q8_1_f32_len[reduc], arr_dmmv_q5_1_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32", arr_dmmv_q8_0_q8_1_f32_len[reduc], arr_dmmv_q8_0_q8_1_f32_data[reduc], "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_q8_1_f32", arr_dmmv_q5_1_q8_1_f32_len[reduc], arr_dmmv_q5_1_q8_1_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32", arr_dmmv_q8_0_q8_1_f32_len[reduc], arr_dmmv_q8_0_q8_1_f32_data[reduc], "main", 4, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq, i+1}, 1, true, use_subgroups, subgroup_size_int); } #endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT } } - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", mul_mat_vec_id_bf16_f32_len, mul_mat_vec_id_bf16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", mul_mat_vec_id_iq1_s_f32_len, mul_mat_vec_id_iq1_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", mul_mat_vec_id_iq1_m_f32_len, mul_mat_vec_id_iq1_m_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_f32", mul_mat_vec_id_mxfp4_f32_len, mul_mat_vec_id_mxfp4_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", mul_mat_vec_id_bf16_f32_len, mul_mat_vec_id_bf16_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", mul_mat_vec_id_iq1_s_f32_len, mul_mat_vec_id_iq1_s_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", mul_mat_vec_id_iq1_m_f32_len, mul_mat_vec_id_iq1_m_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", mul_mat_vec_id_iq2_xs_f32_len, mul_mat_vec_id_iq2_xs_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", mul_mat_vec_id_iq2_s_f32_len, mul_mat_vec_id_iq2_s_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", mul_mat_vec_id_iq3_s_f32_len, mul_mat_vec_id_iq3_s_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", mul_mat_vec_id_iq4_xs_f32_len, mul_mat_vec_id_iq4_xs_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_f32", mul_mat_vec_id_mxfp4_f32_len, mul_mat_vec_id_mxfp4_f32_data, "main", 5, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {subgroup_size_16, rm_iq}, 1, true); // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -3519,12 +3535,12 @@ static void ggml_vk_load_shaders(vk_device& device) { for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { if (device->subgroup_arithmetic && device->subgroup_require_full_support) { - ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true); + ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", 4, 7 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true); } else { - ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true); + ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 4, 7 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true); } } - ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 4, 13 * sizeof(uint32_t), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -4258,8 +4274,6 @@ static vk_device ggml_vk_get_device(size_t idx) { device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 && device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_multi_add_push_constants) && - vk12_features.runtimeDescriptorArray && - device->vendor_id != VK_VENDOR_ID_INTEL && getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr; device->shader_int64 = device_features2.features.shaderInt64; @@ -6501,7 +6515,11 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ GGML_UNUSED(k); } -static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; @@ -6532,7 +6550,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1); bool batch_n = ne11 > 1; - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; @@ -6634,8 +6651,20 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& return; } - vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + vk_buffer d_D; + uint64_t d_buf_offset = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)add->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(add) + add->view_offs; + } else { + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } + GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -6730,14 +6759,43 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& y_sz_total = CEIL_DIV(y_sz_total, 144) * 144; } + uint32_t enable_bias = ctx->num_additional_fused_ops > 0; + + vk_buffer d_B = d_D; + size_t b_buf_offset = 0; + uint64_t b_sz = 0; + + if (enable_bias) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; + + bool b_uma = false; + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, bias->data, d_B, b_buf_offset); + b_uma = d_B != nullptr; + } + if(!b_uma) { + ggml_backend_vk_buffer_context * bias_buf_ctx = (ggml_backend_vk_buffer_context *)bias->buffer->context; + d_B = bias_buf_ctx->dev_buffer; + b_buf_offset = vk_tensor_offset(bias) + bias->view_offs; + GGML_ASSERT(d_B != nullptr); + b_sz = ggml_nbytes(bias); + } + } + // compute const vk_mat_vec_push_constants pc = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, - stride_batch_x, stride_batch_y, stride_batch_d, + stride_batch_x, stride_batch_y, stride_batch_d, enable_bias, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, }; ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz_total }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, + { + vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, + vk_subbuffer{ d_Y, y_buf_offset, y_sz_total }, + vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, + vk_subbuffer{ d_B, b_buf_offset, b_sz }, + }, pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); if (x_non_contig) { @@ -6748,7 +6806,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& } } -static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; @@ -6771,7 +6832,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c GGML_ASSERT(ne11 == 1); - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; @@ -6805,8 +6865,19 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c return; } - vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + vk_buffer d_D; + uint64_t d_buf_offset = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)add->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(add) + add->view_offs; + } else { + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } GGML_ASSERT(d_D != nullptr); vk_buffer d_Qx = src0_buf_ctx->dev_buffer; const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; @@ -6823,8 +6894,32 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; + uint32_t enable_bias = ctx->num_additional_fused_ops > 0; + + vk_buffer d_B = d_D; + size_t b_buf_offset = 0; + uint64_t b_sz = 0; + + if (enable_bias) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; + + bool b_uma = false; + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, bias->data, d_B, b_buf_offset); + b_uma = d_B != nullptr; + } + if(!b_uma) { + ggml_backend_vk_buffer_context * bias_buf_ctx = (ggml_backend_vk_buffer_context *)bias->buffer->context; + d_B = bias_buf_ctx->dev_buffer; + b_buf_offset = vk_tensor_offset(bias) + bias->view_offs; + GGML_ASSERT(d_B != nullptr); + b_sz = ggml_nbytes(bias); + } + } + // compute - const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; + const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), enable_bias }; uint32_t workgroups_z = (uint32_t)ne12; // When gqa_ratio > 1, each invocation does multiple rows and we can launch fewer workgroups @@ -6832,10 +6927,19 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c workgroups_z /= gqa_ratio; } - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], + { + vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, + vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, + vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset }, + vk_subbuffer{ d_B, b_buf_offset, b_sz }, + }, pc, { 1, (uint32_t)ne01, workgroups_z }); } -static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; @@ -6868,7 +6972,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(ne11 == 1); GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; @@ -6898,8 +7001,20 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con return; } - vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + vk_buffer d_D; + uint64_t d_buf_offset = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)add->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(add) + add->view_offs; + } else { + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } + GGML_ASSERT(d_D != nullptr); vk_buffer d_Qx = src0_buf_ctx->dev_buffer; const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; @@ -6916,13 +7031,45 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; + uint32_t enable_bias = ctx->num_additional_fused_ops > 0; + + vk_buffer d_B = d_D; + size_t b_buf_offset = 0; + uint64_t b_sz = 0; + + if (enable_bias) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; + + bool b_uma = false; + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, bias->data, d_B, b_buf_offset); + b_uma = d_B != nullptr; + } + if(!b_uma) { + ggml_backend_vk_buffer_context * bias_buf_ctx = (ggml_backend_vk_buffer_context *)bias->buffer->context; + d_B = bias_buf_ctx->dev_buffer; + b_buf_offset = vk_tensor_offset(bias) + bias->view_offs; + GGML_ASSERT(d_B != nullptr); + b_sz = ggml_nbytes(bias); + } + } + // compute - const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 }; + const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23, enable_bias }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); + { + vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, + vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, + vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset }, + vk_subbuffer{ d_B, b_buf_offset, b_sz }, + }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); // Handle huge A matrix by splitting the M dimensions. This works well for convolution use cases @@ -6961,15 +7108,15 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, g src1->nb[1] <= src1->nb[3] && src0->ne[3] == 1 && src1->ne[3] == 1) { - ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun); + ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx, dryrun); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) { - ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun); + ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx, dryrun); // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four) // when ne12 and ne13 are one. } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun); + ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx, dryrun); } else { ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false, dryrun); } @@ -7249,7 +7396,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } } -static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + ggml_tensor * ids = dst->src[2]; VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -7281,7 +7432,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const uint64_t ne22 = dst->ne[2]; const uint64_t ne23 = dst->ne[3]; - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; @@ -7369,8 +7519,20 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte return; } - vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + vk_buffer d_D; + uint64_t d_buf_offset = 0; + + if (ctx->num_additional_fused_ops > 0) { + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)add->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(add) + add->view_offs; + } else { + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + d_D = dst_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } + GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -7445,15 +7607,46 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte groups_x = CEIL_DIV(groups_x, groups_z); } + uint32_t enable_bias = ctx->num_additional_fused_ops > 0; + + vk_buffer d_B = d_D; + size_t b_buf_offset = 0; + uint64_t b_sz = 0; + + if (enable_bias) { + const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1]; + + bool b_uma = false; + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, bias->data, d_B, b_buf_offset); + b_uma = d_B != nullptr; + } + if(!b_uma) { + ggml_backend_vk_buffer_context * bias_buf_ctx = (ggml_backend_vk_buffer_context *)bias->buffer->context; + d_B = bias_buf_ctx->dev_buffer; + b_buf_offset = vk_tensor_offset(bias) + bias->view_offs; + GGML_ASSERT(d_B != nullptr); + b_sz = ggml_nbytes(bias); + } + } + // compute const vk_mat_vec_id_push_constants pc = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21), + + enable_bias, + (uint32_t)nei0, (uint32_t)ne11, }; ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, - vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, + { + vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, + vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, + vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, + vk_subbuffer{ d_B, b_buf_offset, b_sz }, + vk_subbuffer{ d_ids, ids_buf_offset, ids_sz }, + }, pc, { groups_x, (uint32_t)nei0, groups_z }); if (x_non_contig) { @@ -7464,10 +7657,21 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte } } -static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { +static bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src2 = dst->src[2]; + return src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)); +} + +static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + ggml_tensor * dst = cgraph->nodes[node_idx]; + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + ggml_tensor * src2 = dst->src[2]; VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); - if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); + if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { + ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx, dryrun); } else { ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } @@ -8433,7 +8637,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } } -static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t) +static uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t) { return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));; } @@ -11793,11 +11997,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx, dryrun); break; case GGML_OP_MUL_MAT_ID: - ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun); + ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx, dryrun); break; @@ -12474,7 +12678,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } -static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { +static bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { if (!ggml_can_fuse(cgraph, node_idx, ops)) { return false; } @@ -12502,6 +12706,61 @@ static bool ggml_vk_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, st return false; } } + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_MUL_MAT && ops.begin()[1] == GGML_OP_ADD) { + // additional constraints specific to this fusion + const ggml_tensor *mul = cgraph->nodes[node_idx]; + const ggml_tensor *add = cgraph->nodes[node_idx + 1]; + const ggml_tensor *bias = add->src[0] == mul ? add->src[1] : add->src[0]; + + // mat-vec only + if (ggml_nrows(mul) != 1) { + return false; + } + // shaders assume the types match + if (mul->type != bias->type) { + return false; + } + // shaders reuse the D shape for bias + if (!ggml_are_same_shape(mul, bias) || + !ggml_are_same_stride(mul, bias)) { + return false; + } + // unaligned bias isn't handled + if (get_misalign_bytes(ctx, bias) != 0) { + return false; + } + } + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_MUL_MAT_ID && ops.begin()[1] == GGML_OP_ADD_ID) { + // additional constraints specific to this fusion + const ggml_tensor *mul = cgraph->nodes[node_idx]; + const ggml_tensor *add = cgraph->nodes[node_idx + 1]; + const ggml_tensor *bias = add->src[1]; + + if (mul != add->src[0]) { + return false; + } + // mat-vec only + if (!ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { + return false; + } + // shaders assume the types match + if (mul->type != bias->type) { + return false; + } + // shaders assume the bias is contiguous + if (!ggml_is_contiguous(bias)) { + return false; + } + // the ID tensor must be the same for mul_mat_id and add_id + if (mul->src[2] != add->src[2]) { + return false; + } + // unaligned bias isn't handled + if (get_misalign_bytes(ctx, bias) != 0) { + return false; + } + } + return true; } @@ -12670,7 +12929,11 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); if (num_adds) { ctx->num_additional_fused_ops = num_adds - 1; - } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { + ctx->num_additional_fused_ops = 1; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { ctx->num_additional_fused_ops = 1; } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && @@ -12783,7 +13046,11 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); if (num_adds) { ctx->num_additional_fused_ops = num_adds - 1; - } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { + ctx->num_additional_fused_ops = 1; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { ctx->num_additional_fused_ops = 1; } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && @@ -13005,7 +13272,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * for (int c = first_unused; c < j; ++c) { if (!used[c] && is_src_of(graph->nodes[j], graph->nodes[c]) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL)) { + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT && graph->nodes[j]->op == GGML_OP_ADD) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID)) { ok = false; break; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl index 450dee0408741..bbb4d1206b7e4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl @@ -28,8 +28,11 @@ layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; #endif layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; + +layout (binding = 3) readonly buffer Bias {D_TYPE data_bias[];}; + #ifdef MUL_MAT_ID -layout (binding = 3) readonly buffer IDS {int data_ids[];}; +layout (binding = 4) readonly buffer IDS {int data_ids[];}; #endif #include "dequant_funcs.glsl" @@ -45,6 +48,8 @@ layout (push_constant) uniform parameter uint batch_stride_b; uint batch_stride_d; + uint enable_bias; + #ifdef MUL_MAT_ID uint nei0; uint ne11; @@ -56,6 +61,10 @@ layout (push_constant) uniform parameter #endif } p; +#ifdef MUL_MAT_ID +uint expert_id; +#endif + void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) { #ifdef MUL_MAT_ID const uint expert_idx = gl_GlobalInvocationID.y; @@ -75,7 +84,7 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) { batch_idx_a = i03 * p.ne02 + i02; } #else - const uint expert_id = data_ids[expert_idx]; + expert_id = data_ids[expert_idx]; #endif a_offset = @@ -113,6 +122,13 @@ void reduce_result(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t if (tid == 0) { [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint n = 0; n < num_rows; ++n) { + if (p.enable_bias != 0) { +#ifdef MUL_MAT_ID + temp[j][n] += FLOAT_TYPE(data_bias[expert_id*p.stride_d + first_row + n]); +#else + temp[j][n] += FLOAT_TYPE(data_bias[j*p.batch_stride_d + d_offset + first_row + n]); +#endif + } data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]); } } @@ -148,6 +164,13 @@ void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offs [[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) { temp[j][n] += tmpsh[j][n][s]; } + if (p.enable_bias != 0) { +#ifdef MUL_MAT_ID + temp[j][n] += FLOAT_TYPE(data_bias[expert_id*p.stride_d + first_row + n]); +#else + temp[j][n] += FLOAT_TYPE(data_bias[j*p.batch_stride_d + d_offset + first_row + n]); +#endif + } data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]); } } @@ -173,6 +196,13 @@ void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offs if (tid == 0) { [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { [[unroll]] for (uint n = 0; n < num_rows; ++n) { + if (p.enable_bias != 0) { +#ifdef MUL_MAT_ID + tmpsh[j][n][0] += FLOAT_TYPE(data_bias[expert_id*p.stride_d + first_row + n]); +#else + tmpsh[j][n][0] += FLOAT_TYPE(data_bias[j*p.batch_stride_d + d_offset + first_row + n]); +#endif + } data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp index 638878d94ce08..3f4584c984c1f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp @@ -15,6 +15,8 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; layout (binding = 0) readonly buffer AV4 {A_TYPE_VEC4 data_a_v4[];}; layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; +layout (binding = 3) readonly buffer Bias {D_TYPE data_bias[];}; + layout (push_constant) uniform parameter { uint ncols_x; @@ -29,6 +31,7 @@ layout (push_constant) uniform parameter uint nb03; uint nb13; uint nb23; + uint enable_bias; } p; shared FLOAT_TYPE tmp[BLOCK_SIZE]; @@ -117,6 +120,9 @@ void main() { } if (tid == 0) { + if (p.enable_bias != 0) { + tmp[0] += FLOAT_TYPE(data_bias[idst]); + } dst[idst] = tmp[0]; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp index 7aa070eebdf72..d51424d417573 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp @@ -17,6 +17,8 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; layout (binding = 0) readonly buffer AV4 {A_TYPE_VEC4 data_a_v4[];}; layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; +layout (binding = 3) readonly buffer Bias {D_TYPE data_bias[];}; + layout(constant_id = 0) const int BLOCK_SIZE = 32; // gqa_ratio is in the range [1,8] layout(constant_id = 1) const uint gqa_ratio = 1; @@ -29,6 +31,7 @@ layout (push_constant) uniform parameter uint nchannels_y; uint b_offset; uint d_offset; + uint enable_bias; } p; #if !USE_SUBGROUP_ADD @@ -148,6 +151,9 @@ void main() { [[unroll]] for (uint c = 0; c < gqa_ratio; ++c) { // dst is not transposed and not permuted const uint idst = (channel + c)*nrows_dst + row_dst; + if (p.enable_bias != 0) { + temp[c] += FLOAT_TYPE(data_bias[idst]); + } dst[idst] = temp[c]; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp index 1e8f694a72470..10cf5202a4a37 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp @@ -23,16 +23,100 @@ layout (push_constant) uniform parameter2 uint rms_partials; } p; -// Workaround for MoltenVK Bug, see https://github.com/ggml-org/llama.cpp/issues/15498 -// layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[]; -// layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[]; -layout (binding = 0) buffer A {A_TYPE data_a[];} a[]; -layout (binding = 0) buffer D {D_TYPE data_d[];} d[]; - -layout (binding = 0, std430) buffer PartialBuf {float partial_sums[];} partials[]; +// No readonly/writeonly decorations. Workaround for MoltenVK Bug, see https://github.com/ggml-org/llama.cpp/issues/15498 +layout (binding = 0) buffer A0 {A_TYPE data_a[];} a0; +layout (binding = 1) buffer A1 {A_TYPE data_a[];} a1; +layout (binding = 2) buffer A2 {A_TYPE data_a[];} a2; +layout (binding = 3) buffer A3 {A_TYPE data_a[];} a3; +layout (binding = 4) buffer A4 {A_TYPE data_a[];} a4; +layout (binding = 5) buffer A5 {A_TYPE data_a[];} a5; +layout (binding = 6) buffer A6 {A_TYPE data_a[];} a6; +layout (binding = 7) buffer A7 {A_TYPE data_a[];} a7; +layout (binding = 8) buffer A8 {A_TYPE data_a[];} a8; +layout (binding = 9) buffer A9 {A_TYPE data_a[];} a9; +layout (binding = 10) buffer A10 {A_TYPE data_a[];} a10; +layout (binding = 11) buffer A11 {A_TYPE data_a[];} a11; +layout (binding = 0) buffer D0 {D_TYPE data_d[];} d0; +layout (binding = 1) buffer D1 {D_TYPE data_d[];} d1; +layout (binding = 2) buffer D2 {D_TYPE data_d[];} d2; +layout (binding = 3) buffer D3 {D_TYPE data_d[];} d3; +layout (binding = 4) buffer D4 {D_TYPE data_d[];} d4; +layout (binding = 5) buffer D5 {D_TYPE data_d[];} d5; +layout (binding = 6) buffer D6 {D_TYPE data_d[];} d6; +layout (binding = 7) buffer D7 {D_TYPE data_d[];} d7; +layout (binding = 8) buffer D8 {D_TYPE data_d[];} d8; +layout (binding = 9) buffer D9 {D_TYPE data_d[];} d9; +layout (binding = 10) buffer D10 {D_TYPE data_d[];} d10; +layout (binding = 11) buffer D11 {D_TYPE data_d[];} d11; +layout (binding = 0, std430) buffer PartialBuf0 {float partial_sums[];} partials0; +layout (binding = 1, std430) buffer PartialBuf1 {float partial_sums[];} partials1; +layout (binding = 2, std430) buffer PartialBuf2 {float partial_sums[];} partials2; +layout (binding = 3, std430) buffer PartialBuf3 {float partial_sums[];} partials3; +layout (binding = 4, std430) buffer PartialBuf4 {float partial_sums[];} partials4; +layout (binding = 5, std430) buffer PartialBuf5 {float partial_sums[];} partials5; +layout (binding = 6, std430) buffer PartialBuf6 {float partial_sums[];} partials6; +layout (binding = 7, std430) buffer PartialBuf7 {float partial_sums[];} partials7; +layout (binding = 8, std430) buffer PartialBuf8 {float partial_sums[];} partials8; +layout (binding = 9, std430) buffer PartialBuf9 {float partial_sums[];} partials9; +layout (binding = 10, std430) buffer PartialBuf10 {float partial_sums[];} partials10; +layout (binding = 11, std430) buffer PartialBuf11 {float partial_sums[];} partials11; layout(constant_id = 0) const uint num_srcs = 2; +FLOAT_TYPE load_a(uint b, uint i) { + switch (b) { + case 0: return FLOAT_TYPE(a0.data_a[i]); + case 1: return FLOAT_TYPE(a1.data_a[i]); + case 2: return FLOAT_TYPE(a2.data_a[i]); + case 3: return FLOAT_TYPE(a3.data_a[i]); + case 4: return FLOAT_TYPE(a4.data_a[i]); + case 5: return FLOAT_TYPE(a5.data_a[i]); + case 6: return FLOAT_TYPE(a6.data_a[i]); + case 7: return FLOAT_TYPE(a7.data_a[i]); + case 8: return FLOAT_TYPE(a8.data_a[i]); + case 9: return FLOAT_TYPE(a9.data_a[i]); + case 10: return FLOAT_TYPE(a10.data_a[i]); + case 11: return FLOAT_TYPE(a11.data_a[i]); + default: return FLOAT_TYPE(0); + } +} + +void store_d(uint b, uint i, FLOAT_TYPE v) { + switch (b) { + case 0: d0.data_d[i] = D_TYPE(v); break; + case 1: d1.data_d[i] = D_TYPE(v); break; + case 2: d2.data_d[i] = D_TYPE(v); break; + case 3: d3.data_d[i] = D_TYPE(v); break; + case 4: d4.data_d[i] = D_TYPE(v); break; + case 5: d5.data_d[i] = D_TYPE(v); break; + case 6: d6.data_d[i] = D_TYPE(v); break; + case 7: d7.data_d[i] = D_TYPE(v); break; + case 8: d8.data_d[i] = D_TYPE(v); break; + case 9: d9.data_d[i] = D_TYPE(v); break; + case 10: d10.data_d[i] = D_TYPE(v); break; + case 11: d11.data_d[i] = D_TYPE(v); break; + default: break; + } +} + +void store_partial(uint b, uint i, float v) { + switch (b) { + case 0: partials0.partial_sums[i] = v; break; + case 1: partials1.partial_sums[i] = v; break; + case 2: partials2.partial_sums[i] = v; break; + case 3: partials3.partial_sums[i] = v; break; + case 4: partials4.partial_sums[i] = v; break; + case 5: partials5.partial_sums[i] = v; break; + case 6: partials6.partial_sums[i] = v; break; + case 7: partials7.partial_sums[i] = v; break; + case 8: partials8.partial_sums[i] = v; break; + case 9: partials9.partial_sums[i] = v; break; + case 10: partials10.partial_sums[i] = v; break; + case 11: partials11.partial_sums[i] = v; break; + default: break; + } +} + uint src_idx(uint s, uint i00, uint i01, uint i02, uint i03) { return i03*p.nb[s][3] + i02*p.nb[s][2] + i01*p.nb[s][1] + i00*p.nb[s][0]; } @@ -78,10 +162,10 @@ void main() { FLOAT_TYPE sum = FLOAT_TYPE(0); [[unroll]] for (uint s = 0; s < num_srcs; ++s) { - sum += FLOAT_TYPE(a[s].data_a[src_idx(s, i00, i01, i02, i03)]); + sum += load_a(s, src_idx(s, i00, i01, i02, i03)); } sum_sq += sum*sum; - d[num_srcs].data_d[dst_idx(i00, i01, i02, i03)] = D_TYPE(sum); + store_d(num_srcs, dst_idx(i00, i01, i02, i03), sum); idx += num_threads; } @@ -104,7 +188,7 @@ void main() { } if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) { - partials[num_srcs + 1].partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq; + store_partial(num_srcs + 1, orig_idx / (num_iter * num_threads), sum_sq); } } #endif diff --git a/scripts/bench-models.sh b/scripts/bench-models.sh new file mode 100644 index 0000000000000..744b0de359c5f --- /dev/null +++ b/scripts/bench-models.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +RESULTS="bench-models-results.txt" +: > "$RESULTS" + +ARGS_BB="-c 270336 -npp 512,4096,8192 -npl 1,2,4,8,16,32 -ntg 32" +ARGS_B="-d 0,4096,8192,16384,32768 -p 2048 -n 32" + +QUICK=0 +while (( "$#" )); do + case "$1" in + --quick) QUICK=1; shift ;; + *) shift ;; + esac +done + +if (( QUICK )); then + ARGS_BB="-c 20480 -npp 512,4096 -npl 1,2,4 -ntg 32" + ARGS_B="-d 0 -p 2048 -n 32" +fi + +run_model() { + local HFR=$1 + local HFF=$2 + + printf "## ${HFR}\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" + printf "Model: https://huggingface.co/${HFR}\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" + + printf -- "- \`llama-batched-bench\`\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" + + ./bin/llama-batched-bench \ + -hfr "${HFR}" -hff "${HFF}" \ + -m "${HFF}" -fa 1 -ub 2048 --no-mmap \ + ${ARGS_BB} | tee -a "$RESULTS" + + printf "\n" | tee -a "$RESULTS" + + printf -- "- \`llama-bench\`\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" + + ./bin/llama-bench \ + -m "${HFF}" -fa 1 -ub 2048 -mmp 0 \ + ${ARGS_B} | tee -a "$RESULTS" + + printf "\n" | tee -a "$RESULTS" + + printf "\n" +} + +run_model "ggml-org/gpt-oss-20b-GGUF" "gpt-oss-20b-mxfp4.gguf" +run_model "ggml-org/gpt-oss-120b-GGUF" "gpt-oss-120b-mxfp4-00001-of-00003.gguf" +run_model "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" "qwen3-coder-30b-a3b-instruct-q8_0.gguf" +run_model "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" "qwen2.5-coder-7b-q8_0.gguf" +run_model "ggml-org/gemma-3-4b-it-qat-GGUF" "gemma-3-4b-it-qat-Q4_0.gguf" + +if [[ -f models-extra.txt ]]; then + while read -r HFR HFF; do + [[ -z "$HFR" ]] && continue + run_model "$HFR" "$HFF" + done < models-extra.txt +fi + +printf "\n=====================================\n" +printf "\n" + +cat "$RESULTS" + +printf "\n" +printf "Done! Results are written to $RESULTS\n" +printf "\n" + diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp index fcfcd80771c51..f1ab27cd54d0a 100644 --- a/tools/batched-bench/batched-bench.cpp +++ b/tools/batched-bench/batched-bench.cpp @@ -221,7 +221,5 @@ int main(int argc, char ** argv) { llama_backend_free(); - LOG("\n\n"); - return 0; } diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 311a4c9086a53..c7e9498349c1b 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -154,8 +154,8 @@ enum projector_type { PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_LIGHTONOCR, - PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_COGVLM, + PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b312fda637f3b..dcfdb49600b6c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -171,8 +171,10 @@ struct clip_hparams { int32_t n_head; int32_t n_layer; // idefics3 - int32_t preproc_image_size = 0; // aka max_dimension - int32_t proj_scale_factor = 0; + int32_t image_longest_edge = 0; + int32_t image_min_pixels = 0; + int32_t image_max_pixels = 0; + int32_t n_merge = 0; // number of patch merges **per-side** float image_mean[3]; float image_std[3]; @@ -194,7 +196,6 @@ struct clip_hparams { std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; - int32_t spatial_merge_size = 0; // audio int32_t n_mel_bins = 0; // whisper preprocessor @@ -204,6 +205,21 @@ struct clip_hparams { bool has_llava_projector = false; int minicpmv_version = 0; int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = n_tokens_min * patch_area; + image_max_pixels = n_tokens_max * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + } }; struct clip_layer { @@ -532,7 +548,7 @@ struct clip_graph { const int batch_size = 1; GGML_ASSERT(n_patches_x == n_patches_y); const int patches_per_image = n_patches_x; - const int kernel_size = hparams.proj_scale_factor; + const int kernel_size = hparams.n_merge; cur = ggml_transpose(ctx0, cur); cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); @@ -554,13 +570,13 @@ struct clip_graph { } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { // pixel_shuffle // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 - const int scale_factor = model.hparams.proj_scale_factor; + const int scale_factor = model.hparams.n_merge; cur = build_patch_merge_permute(cur, scale_factor); cur = ggml_mul_mat(ctx0, model.projection, cur); } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { // pixel unshuffle block - const int scale_factor = model.hparams.proj_scale_factor; + const int scale_factor = model.hparams.n_merge; cur = build_patch_merge_permute(cur, scale_factor); // projection @@ -584,7 +600,7 @@ struct clip_graph { } ggml_cgraph * build_pixtral() { - const int n_merge = hparams.spatial_merge_size; + const int n_merge = hparams.n_merge; // 2D input positions ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); @@ -610,7 +626,7 @@ struct clip_graph { // mistral small 3.1 patch merger // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 if (model.mm_patch_merger_w) { - GGML_ASSERT(hparams.spatial_merge_size > 0); + GGML_ASSERT(hparams.n_merge > 0); cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); @@ -926,7 +942,7 @@ struct clip_graph { // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] ggml_tensor * deepstack_features = nullptr; - const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl + const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl // loop over layers for (int il = 0; il < n_layer; il++) { @@ -1149,7 +1165,7 @@ struct clip_graph { // pixel shuffle { - const int scale_factor = model.hparams.proj_scale_factor; + const int scale_factor = model.hparams.n_merge; const int bsz = 1; // batch size, always 1 for now since we don't support batching const int height = n_patches_y; const int width = n_patches_x; @@ -1239,7 +1255,7 @@ struct clip_graph { // based on Llama4VisionPixelShuffleMLP // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 { - const int scale_factor = model.hparams.proj_scale_factor; + const int scale_factor = model.hparams.n_merge; const int bsz = 1; // batch size, always 1 for now since we don't support batching GGML_ASSERT(scale_factor > 0); GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images @@ -1311,7 +1327,7 @@ struct clip_graph { { // patch_merger - const int scale_factor = model.hparams.proj_scale_factor; + const int scale_factor = model.hparams.n_merge; cur = build_patch_merge_permute(cur, scale_factor); // projection norm @@ -2577,7 +2593,6 @@ struct clip_model_loader { if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false); get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy @@ -2686,65 +2701,68 @@ struct clip_model_loader { hparams.minicpmv_version = 2; // default to 2 if not set } } break; + case PROJECTOR_TYPE_INTERNVL: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); + } break; case PROJECTOR_TYPE_LFM2: - case PROJECTOR_TYPE_INTERNVL: { - get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json + hparams.set_limit_image_tokens(64, 256); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { + // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json + // TODO: verify the image_min_tokens hparams.rope_theta = 10000.0f; - hparams.warmup_image_size = hparams.patch_size * 8; - // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM - // ref: https://github.com/ggml-org/llama.cpp/issues/14310 - hparams.image_size = 1024; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_KIMIVL: { hparams.rope_theta = 10000.0f; - hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + // TODO: check kimivl preprocessor for exact values + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_GEMMA3: { // default value (used by all model sizes in gemma 3 family) // number of patches for each **side** is reduced by a factor of 4 - hparams.proj_scale_factor = 4; + hparams.n_merge = 4; // test model (tinygemma3) has a different value, we optionally read it - get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; case PROJECTOR_TYPE_QWEN2VL: - { - // max image size = sqrt(max_pixels) = 3584 - // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json - // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable - // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 - hparams.image_size = 1024; - hparams.warmup_image_size = hparams.patch_size * 8; - } break; case PROJECTOR_TYPE_QWEN25VL: - { - // max image size = sqrt(max_pixels) - // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json - // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable - // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 - hparams.image_size = 1024; - hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); - } break; case PROJECTOR_TYPE_QWEN3VL: { - hparams.image_size = 1024; // still need this? - hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + hparams.n_merge = 2; // default value for Qwen 2 and 2.5 + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it + // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json + // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens + // but we set a lower value to avoid OOM + // TODO: make it configurable by user + // TODO (2): bbox coordinates become inaccurate with small number of tokens, + // therefore we need to increase the min_tokens + // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858 + hparams.set_limit_image_tokens(8, 2048); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_LLAMA4: { hparams.rope_theta = 10000.0f; - get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor); + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); set_llava_uhd_res_candidates(model, 3); } break; case PROJECTOR_TYPE_ULTRAVOX: @@ -2777,10 +2795,13 @@ struct clip_model_loader { LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size); LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); - LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); - if (hparams.spatial_merge_size > 0) { - LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size); + if (hparams.image_min_pixels > 0) { + LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels); + } + if (hparams.image_max_pixels > 0) { + LOG_INF("%s: image_max_pixels: %d\n", __func__, hparams.image_max_pixels); } } else if (is_audio) { LOG_INF("\n--- audio hparams ---\n"); @@ -3181,9 +3202,11 @@ struct clip_model_loader { if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { img->nx = hparams.warmup_image_size; img->ny = hparams.warmup_image_size; + LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny); } else { img->nx = hparams.warmup_audio_size; img->ny = hparams.n_mel_bins; + LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx); } batch.entries.push_back(std::move(img)); @@ -3399,9 +3422,169 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 // set of tools to manupulate images // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv -struct image_manipulation { +struct img_tool { + enum resize_algo { + RESIZE_ALGO_BILINEAR, + RESIZE_ALGO_BICUBIC, + // RESIZE_ALGO_LANCZOS, // TODO + }; + + static void resize( + const clip_image_u8 & src, + clip_image_u8 & dst, + const clip_image_size & target_resolution, + resize_algo algo, + bool add_padding = true, // TODO: define the behavior for add_padding = false + std::array pad_color = {0, 0, 0}) { + dst.nx = target_resolution.width; + dst.ny = target_resolution.height; + dst.buf.resize(3 * dst.nx * dst.ny); + + if (dst.nx == src.nx && dst.ny == src.ny) { + // no resize needed, simple copy + dst.buf = src.buf; + return; + } + + if (!add_padding) { + // direct resize + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, dst, target_resolution.width, target_resolution.height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + } else { + // resize with padding + clip_image_u8 resized_image; + float scale_w = static_cast(target_resolution.width) / src.nx; + float scale_h = static_cast(target_resolution.height) / src.ny; + float scale = std::min(scale_w, scale_h); + int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); + int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, resized_image, new_width, new_height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + + // fill dst with pad_color + fill(dst, pad_color); + + int offset_x = (target_resolution.width - new_width) / 2; + int offset_y = (target_resolution.height - new_height) / 2; + + composite(dst, resized_image, offset_x, offset_y); + } + } + + static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 * (i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than longest_edge, it will be resized to longest_edge + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { + GGML_ASSERT(align_size > 0); + if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { + return {0, 0}; + } + + float scale = std::min(static_cast(longest_edge) / inp_size.width, + static_cast(longest_edge) / inp_size.height); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + int aligned_width = ceil_by_factor(target_width_f); + int aligned_height = ceil_by_factor(target_height_f); + + return {aligned_width, aligned_height}; + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will have min_pixels <= W*H <= max_pixels + // this is referred as "smart_resize" in transformers code + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + GGML_ASSERT(align_size > 0); + const int width = inp_size.width; + const int height = inp_size.height; + + auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + // always align up first + int h_bar = std::max(align_size, ceil_by_factor(height)); + int w_bar = std::max(align_size, ceil_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt(static_cast(height * width) / max_pixels); + h_bar = std::max(align_size, floor_by_factor(height / beta)); + w_bar = std::max(align_size, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(static_cast(min_pixels) / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + return {w_bar, h_bar}; + } + + // draw src image into dst image at offset (offset_x, offset_y) + static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + for (int y = 0; y < src.ny; ++y) { + for (int x = 0; x < src.nx; ++x) { + int dx = x + offset_x; + int dy = y + offset_y; + // skip pixels that would be out of bounds in the destination + if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + continue; + } + size_t dst_idx = 3 * (static_cast(dy) * dst.nx + static_cast(dx)); + size_t src_idx = 3 * (static_cast(y) * src.nx + static_cast(x)); + dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; + dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + } + } + } + + // fill the image with a solid color + static void fill(clip_image_u8 & img, const std::array & color) { + for (size_t i = 0; i < img.buf.size(); i += 3) { + img.buf[i] = color[0]; + img.buf[i + 1] = color[1]; + img.buf[i + 2] = color[2]; + } + } + +private: // Bilinear resize function - static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); @@ -3437,7 +3620,7 @@ struct image_manipulation { // Bicubic resize function // part of image will be cropped if the aspect ratio is different - static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { const int nx = img.nx; const int ny = img.ny; @@ -3500,93 +3683,6 @@ struct image_manipulation { return true; } - // llava-1.6 type of resize_and_pad - // if the ratio is not 1:1, padding with pad_color will be applied - // pad_color is single channel, default is 0 (black) - static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array pad_color = {0, 0, 0}) { - int target_width = target_resolution.width; - int target_height = target_resolution.height; - - float scale_w = static_cast(target_width) / image.nx; - float scale_h = static_cast(target_height) / image.ny; - - int new_width, new_height; - - if (scale_w < scale_h) { - new_width = target_width; - new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); - } else { - new_height = target_height; - new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); - } - - clip_image_u8 resized_image; - bicubic_resize(image, resized_image, new_width, new_height); - - clip_image_u8 padded_image; - padded_image.nx = target_width; - padded_image.ny = target_height; - padded_image.buf.resize(3 * target_width * target_height); - - // Fill the padded image with the fill color - for (size_t i = 0; i < padded_image.buf.size(); i += 3) { - padded_image.buf[i] = pad_color[0]; - padded_image.buf[i + 1] = pad_color[1]; - padded_image.buf[i + 2] = pad_color[2]; - } - - // Calculate padding offsets - int pad_x = (target_width - new_width) / 2; - int pad_y = (target_height - new_height) / 2; - - // Copy the resized image into the center of the padded buffer - for (int y = 0; y < new_height; ++y) { - for (int x = 0; x < new_width; ++x) { - for (int c = 0; c < 3; ++c) { - padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; - } - } - } - dst = std::move(padded_image); - } - - static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; - } - } - } - - // calculate the size of the **resized** image, while preserving the aspect ratio - // the calculated size will be aligned to the nearest multiple of align_size - // if H or W size is larger than max_dimension, it will be resized to max_dimension - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { - return {0, 0}; - } - - float scale = std::min(static_cast(max_dimension) / inp_size.width, - static_cast(max_dimension) / inp_size.height); - - float target_width_f = static_cast(inp_size.width) * scale; - float target_height_f = static_cast(inp_size.height) * scale; - - int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); - int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); - - return {aligned_width, aligned_height}; - } - -private: static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } @@ -3735,10 +3831,11 @@ struct llava_uhd { static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { std::vector output; + img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); - image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size); + img_tool::resize(*img, *resized_img, inst.overview_size, interpolation); output.push_back(std::move(resized_img)); if (inst.slices.empty()) { // no slices, just return the resized image @@ -3748,9 +3845,11 @@ struct llava_uhd { // resize to refined size clip_image_u8_ptr refined_img(clip_image_u8_init()); if (inst.padding_refined) { - image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size); + img_tool::resize(*img, *refined_img, inst.refined_size, interpolation); } else { - image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height); + // only algo bicubic preserves the ratio; old models rely on this behavior + // TODO: do we need to support other algos here? + img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false); } // create slices @@ -3761,7 +3860,7 @@ struct llava_uhd { int h = slice.size.height; clip_image_u8_ptr img_slice(clip_image_u8_init()); - image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h); + img_tool::crop(*refined_img, *img_slice, x, y, w, h); output.push_back(std::move(img_slice)); } @@ -3896,208 +3995,211 @@ struct llava_uhd { // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { clip_image_size original_size{img->nx, img->ny}; - bool pad_to_square = true; auto & params = ctx->model.hparams; - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { - pad_to_square = false; - } - if (clip_is_minicpmv(ctx)) { - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_MINICPMV: + { + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - return true; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + { + // step 1: make a blank canvas which aligns to the grid + clip_image_u8 resized; + const clip_image_size new_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * 2, + params.image_min_pixels, + params.image_max_pixels); + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); + // clip_image_save_to_bmp(resized, "preproc.bmp"); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + // res_imgs->data[0] = *res; + res_imgs->entries.push_back(std::move(img_f32)); + } break; - } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - clip_image_u8 resized; - auto patch_size = params.patch_size * 2; - auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size); - image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height); - - clip_image_f32_ptr img_f32(clip_image_f32_init()); - // clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); - // res_imgs->data[0] = *res; - res_imgs->entries.push_back(std::move(img_f32)); - return true; - } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { - // The refined size has two steps: - // 1. Resize w/ aspect-ratio preserving such that the longer side is - // the preprocessor longest size - // 2. Resize w/out preserving aspect ratio such that both sides are - // multiples of image_size (always rounding up) - // - // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio( - original_size, params.image_size, params.preproc_image_size); - // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", - // __func__, original_size.width, original_size.height, - // refined_size.width, refined_size.height); - - llava_uhd::slice_instructions instructions; - instructions.overview_size = clip_image_size{params.image_size, params.image_size}; - instructions.refined_size = refined_size; - instructions.grid_size = clip_image_size{ - static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), - static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), - }; - for (int y = 0; y < refined_size.height; y += params.image_size) { - for (int x = 0; x < refined_size.width; x += params.image_size) { - // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); - instructions.slices.push_back(llava_uhd::slice_coordinates{ - /* x */x, - /* y */y, - /* size */clip_image_size{ - std::min(params.image_size, refined_size.width - x), - std::min(params.image_size, refined_size.height - y) + case PROJECTOR_TYPE_IDEFICS3: + { + // The refined size has two steps: + // 1. Resize w/ aspect-ratio preserving such that the longer side is + // the preprocessor longest size + // 2. Resize w/out preserving aspect ratio such that both sides are + // multiples of image_size (always rounding up) + // + // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 + const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( + original_size, params.image_size, params.image_longest_edge); + // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", + // __func__, original_size.width, original_size.height, + // refined_size.width, refined_size.height); + + llava_uhd::slice_instructions instructions; + instructions.overview_size = clip_image_size{params.image_size, params.image_size}; + instructions.refined_size = refined_size; + instructions.grid_size = clip_image_size{ + static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), + static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), + }; + for (int y = 0; y < refined_size.height; y += params.image_size) { + for (int x = 0; x < refined_size.width; x += params.image_size) { + // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); + instructions.slices.push_back(llava_uhd::slice_coordinates{ + /* x */x, + /* y */y, + /* size */clip_image_size{ + std::min(params.image_size, refined_size.width - x), + std::min(params.image_size, refined_size.height - y) + } + }); } - }); - } - } - auto imgs = llava_uhd::slice_image(img, instructions); - - // cast and normalize to f32 - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = instructions.grid_size.width; - res_imgs->grid_y = instructions.grid_size.height; - return true; - } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE - || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution - ) { - clip_image_u8 resized_image; - int sz = params.image_size; - image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - - } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL - || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR - ) { - clip_image_u8 resized_image; - auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); - image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - - } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) { - GGML_ASSERT(!params.image_res_candidates.empty()); - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - return true; - - } else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2 - || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL - ) { - GGML_ASSERT(params.proj_scale_factor); + } + auto imgs = llava_uhd::slice_image(img, instructions); + + // cast and normalize to f32 + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - // smart resize - const int width = img->nx; - const int height = img->ny; - const int total_factor = params.patch_size * params.proj_scale_factor; - constexpr int min_image_tokens = 64; - constexpr int max_image_tokens = 1024; - const float min_pixels = min_image_tokens * total_factor * total_factor; - const float max_pixels = max_image_tokens * total_factor * total_factor; + res_imgs->grid_x = instructions.grid_size.width; + res_imgs->grid_y = instructions.grid_size.height; + } break; - auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; - auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + case PROJECTOR_TYPE_GLM_EDGE: + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + //clip_image_save_to_bmp(resized_image, "resized.bmp"); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; - int h_bar = std::max(total_factor, round_by_factor(height)); - int w_bar = std::max(total_factor, round_by_factor(width)); + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + GGML_ASSERT(params.image_min_pixels && params.image_max_pixels); + clip_image_u8 resized_image; + // the original pixtral model doesn't have n_merge + const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge; + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * cur_merge, + params.image_min_pixels, + params.image_max_pixels); + img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; - if (h_bar * w_bar > max_pixels) { - const auto beta = std::sqrt((height * width) / max_pixels); - h_bar = std::max(total_factor, floor_by_factor(height / beta)); - w_bar = std::max(total_factor, floor_by_factor(width / beta)); - } else if (h_bar * w_bar < min_pixels) { - const auto beta = std::sqrt(min_pixels / (height * width)); - h_bar = ceil_by_factor(height * beta); - w_bar = ceil_by_factor(width * beta); - } + case PROJECTOR_TYPE_LLAMA4: + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - const std::array pad_color = {122, 116, 104}; + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; - clip_image_u8 resized_img; - image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - return true; - } + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + { + GGML_ASSERT(params.image_min_pixels && params.image_max_pixels); + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * params.n_merge, + params.image_min_pixels, + params.image_max_pixels); + const std::array pad_color = {122, 116, 104}; + + clip_image_u8 resized_img; + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } break; - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm? + { + // TODO @ngxson : refactor the code below to avoid duplicated logic - clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - if (pad_to_square) { - // for llava-1.5, we resize image to a square, and pad the shorter side with a background color - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - const int longer_side = std::max(img->nx, img->ny); - temp->nx = longer_side; - temp->ny = longer_side; - temp->buf.resize(3 * longer_side * longer_side); + clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily - // background color in RGB from LLaVA (this is the mean rgb color * 255) - const std::array pad_color = {122, 116, 104}; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (params.image_res_candidates.empty()) { // pad_to_square + // for llava-1.5, we resize image to a square, and pad the shorter side with a background color + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + const int longer_side = std::max(img->nx, img->ny); + temp->nx = longer_side; + temp->ny = longer_side; + temp->buf.resize(3 * longer_side * longer_side); - // resize the image to the target_size - image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color); + // background color in RGB from LLaVA (this is the mean rgb color * 255) + const std::array pad_color = {122, 116, 104}; - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - return true; + // resize the image to the target_size + img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); - } else if (!params.image_res_candidates.empty()) { - // "spatial_unpad" with "anyres" processing for llava-1.6 - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } + } else { + // "spatial_unpad" with "anyres" processing for llava-1.6 + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + } + } break; - return true; - } else { - GGML_ABORT("Unknown image preprocessing type"); + default: + LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type()); + return false; } + return true; } ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { @@ -4145,7 +4247,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * const auto & params = ctx->model.hparams; const int n_total = clip_n_output_tokens(ctx, img); if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); + return img->nx / (params.patch_size * 2); } return n_total; } @@ -4153,7 +4255,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->model.hparams; if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); + return img->ny / (params.patch_size * 2); } return 1; } @@ -4211,9 +4313,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN3VL: { // dynamic size (2 conv, so double patch size) - int patch_size = params.patch_size * 2; - int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); - int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); + int x_patch = img->nx / (params.patch_size * 2); + int y_patch = img->ny / (params.patch_size * 2); n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_GEMMA3: @@ -4222,15 +4323,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_LLAMA4: { // both X and Y are downscaled by the scale factor - int scale_factor = ctx->model.hparams.proj_scale_factor; + int scale_factor = ctx->model.hparams.n_merge; n_patches /= (scale_factor * scale_factor); } break; case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: { // dynamic size - int scale_factor = ctx->model.hparams.proj_scale_factor; - int out_patch_size = params.patch_size * scale_factor; + int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; n_patches = x_patch * y_patch; @@ -4239,7 +4339,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_LIGHTONOCR: { // dynamic size - int n_merge = params.spatial_merge_size; + int n_merge = ctx->model.hparams.n_merge; int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); if (ctx->model.token_embd_img_break) { diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 026b53b28632f..b71690cc81e29 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte index e878e7bf8a217..d8f5630fd14f7 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte @@ -3,7 +3,16 @@ import { useProcessingState } from '$lib/hooks/use-processing-state.svelte'; import { isLoading } from '$lib/stores/chat.svelte'; import { fade } from 'svelte/transition'; - import { Check, Copy, Package, X } from '@lucide/svelte'; + import { + Check, + Copy, + Package, + X, + Gauge, + Clock, + WholeWord, + ChartNoAxesColumn + } from '@lucide/svelte'; import { Button } from '$lib/components/ui/button'; import { Checkbox } from '$lib/components/ui/checkbox'; import { INPUT_CLASSES } from '$lib/constants/input-classes'; @@ -76,8 +85,8 @@ let displayedModel = $derived((): string | null => { if (!currentConfig.showModelInfo) return null; - if (currentConfig.modelSelectorEnabled) { - return message.model ?? null; + if (message.model) { + return message.model; } return serverModel; @@ -160,22 +169,58 @@ {/if} - {#if displayedModel()} - - +
+ {#if displayedModel()} + + + - Model used: + Model used: + - - - {/if} + + + + {/if} + + {#if currentConfig.showMessageStats && message.timings && message.timings.predicted_n && message.timings.predicted_ms} + {@const tokensPerSecond = (message.timings.predicted_n / message.timings.predicted_ms) * 1000} + + + + + Statistics: + + +
+ + + {tokensPerSecond.toFixed(2)} tokens/s + + + + {message.timings.predicted_n} tokens + + + + {(message.timings.predicted_ms / 1000).toFixed(2)}s + +
+
+ {/if} +
{#if message.timestamp && !isEditing} + import { Dialog as DialogPrimitive } from 'bits-ui'; + import XIcon from '@lucide/svelte/icons/x'; + + interface Props { + open: boolean; + code: string; + language: string; + onOpenChange?: (open: boolean) => void; + } + + let { open = $bindable(), code, language, onOpenChange }: Props = $props(); + + let iframeRef = $state(null); + + $effect(() => { + if (!iframeRef) return; + + if (open) { + iframeRef.srcdoc = code; + } else { + iframeRef.srcdoc = ''; + } + }); + + function handleOpenChange(nextOpen: boolean) { + open = nextOpen; + onOpenChange?.(nextOpen); + } + + + + + + + + + + + + Close preview + + + + + + diff --git a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte index 1f4caa9003bce..1c069db58d830 100644 --- a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +++ b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte @@ -15,6 +15,7 @@ import githubLightCss from 'highlight.js/styles/github.css?inline'; import { mode } from 'mode-watcher'; import { remarkLiteralHtml } from '$lib/markdown/literal-html'; + import CodePreviewDialog from './CodePreviewDialog.svelte'; interface Props { content: string; @@ -25,6 +26,9 @@ let containerRef = $state(); let processedHtml = $state(''); + let previewDialogOpen = $state(false); + let previewCode = $state(''); + let previewLanguage = $state('text'); function loadHighlightTheme(isDark: boolean) { if (!browser) return; @@ -117,7 +121,6 @@ const rawCode = codeElement.textContent || ''; const codeId = `code-${Date.now()}-${index}`; - codeElement.setAttribute('data-code-id', codeId); codeElement.setAttribute('data-raw-code', rawCode); @@ -138,11 +141,30 @@ copyButton.setAttribute('type', 'button'); copyButton.innerHTML = ` - - `; + + `; + + const actions = document.createElement('div'); + actions.className = 'code-block-actions'; + + actions.appendChild(copyButton); + + if (language.toLowerCase() === 'html') { + const previewButton = document.createElement('button'); + previewButton.className = 'preview-code-btn'; + previewButton.setAttribute('data-code-id', codeId); + previewButton.setAttribute('title', 'Preview code'); + previewButton.setAttribute('type', 'button'); + + previewButton.innerHTML = ` + + `; + + actions.appendChild(previewButton); + } header.appendChild(languageLabel); - header.appendChild(copyButton); + header.appendChild(actions); wrapper.appendChild(header); const clonedPre = pre.cloneNode(true) as HTMLElement; @@ -180,49 +202,105 @@ } } - function setupCopyButtons() { - if (!containerRef) return; + function getCodeInfoFromTarget(target: HTMLElement) { + const wrapper = target.closest('.code-block-wrapper'); - const copyButtons = containerRef.querySelectorAll('.copy-code-btn'); + if (!wrapper) { + console.error('No wrapper found'); + return null; + } - for (const button of copyButtons) { - button.addEventListener('click', async (e) => { - e.preventDefault(); - e.stopPropagation(); + const codeElement = wrapper.querySelector('code[data-code-id]'); - const target = e.currentTarget as HTMLButtonElement; - const codeId = target.getAttribute('data-code-id'); + if (!codeElement) { + console.error('No code element found in wrapper'); + return null; + } - if (!codeId) { - console.error('No code ID found on button'); - return; - } + const rawCode = codeElement.getAttribute('data-raw-code'); - // Find the code element within the same wrapper - const wrapper = target.closest('.code-block-wrapper'); - if (!wrapper) { - console.error('No wrapper found'); - return; - } + if (rawCode === null) { + console.error('No raw code found'); + return null; + } - const codeElement = wrapper.querySelector('code[data-code-id]'); - if (!codeElement) { - console.error('No code element found in wrapper'); - return; - } + const languageLabel = wrapper.querySelector('.code-language'); + const language = languageLabel?.textContent?.trim() || 'text'; - const rawCode = codeElement.getAttribute('data-raw-code'); - if (!rawCode) { - console.error('No raw code found'); - return; - } + return { rawCode, language }; + } - try { - await copyCodeToClipboard(rawCode); - } catch (error) { - console.error('Failed to copy code:', error); - } - }); + async function handleCopyClick(event: Event) { + event.preventDefault(); + event.stopPropagation(); + + const target = event.currentTarget as HTMLButtonElement | null; + + if (!target) { + return; + } + + const info = getCodeInfoFromTarget(target); + + if (!info) { + return; + } + + try { + await copyCodeToClipboard(info.rawCode); + } catch (error) { + console.error('Failed to copy code:', error); + } + } + + function handlePreviewClick(event: Event) { + event.preventDefault(); + event.stopPropagation(); + + const target = event.currentTarget as HTMLButtonElement | null; + + if (!target) { + return; + } + + const info = getCodeInfoFromTarget(target); + + if (!info) { + return; + } + + previewCode = info.rawCode; + previewLanguage = info.language; + previewDialogOpen = true; + } + + function setupCodeBlockActions() { + if (!containerRef) return; + + const wrappers = containerRef.querySelectorAll('.code-block-wrapper'); + + for (const wrapper of wrappers) { + const copyButton = wrapper.querySelector('.copy-code-btn'); + const previewButton = wrapper.querySelector('.preview-code-btn'); + + if (copyButton && copyButton.dataset.listenerBound !== 'true') { + copyButton.dataset.listenerBound = 'true'; + copyButton.addEventListener('click', handleCopyClick); + } + + if (previewButton && previewButton.dataset.listenerBound !== 'true') { + previewButton.dataset.listenerBound = 'true'; + previewButton.addEventListener('click', handlePreviewClick); + } + } + } + + function handlePreviewDialogOpenChange(open: boolean) { + previewDialogOpen = open; + + if (!open) { + previewCode = ''; + previewLanguage = 'text'; } } @@ -243,7 +321,7 @@ $effect(() => { if (containerRef && processedHtml) { - setupCopyButtons(); + setupCodeBlockActions(); } }); @@ -253,6 +331,13 @@ {@html processedHtml} + +